diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index acf6bbe8f6a..cf61012f2bc 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -121,8 +121,6 @@ jobs: docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" SonarCloud: - # TODO: Remove if: whenever SonarCloud supports c++23 - if: ${{ false }} runs-on: [self-hosted, builder] env: SONAR_SCANNER_VERSION: 4.8.0.2856 @@ -159,7 +157,7 @@ jobs: - name: Set Up Build Tools run: | sudo apt-get update - sudo apt-get install -yq git cmake ccache ninja-build python3 yasm + sudo apt-get install -yq git cmake ccache ninja-build python3 yasm nasm sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" - name: Run build-wrapper run: | @@ -178,4 +176,5 @@ jobs: --define sonar.cfamily.build-wrapper-output="${{ env.BUILD_WRAPPER_OUT_DIR }}" \ --define sonar.projectKey="ClickHouse_ClickHouse" \ --define sonar.organization="clickhouse-java" \ - --define sonar.exclusions="**/*.java,**/*.ts,**/*.js,**/*.css,**/*.sql" \ + --define sonar.cfamily.cpp23.enabled=true \ + --define sonar.exclusions="**/*.java,**/*.ts,**/*.js,**/*.css,**/*.sql" diff --git a/CHANGELOG.md b/CHANGELOG.md index 72372c8fac4..bf6b309ef2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,5 @@ ### Table of Contents +**[ClickHouse release v23.6, 2023-06-30](#236)**
**[ClickHouse release v23.5, 2023-06-08](#235)**
**[ClickHouse release v23.4, 2023-04-26](#234)**
**[ClickHouse release v23.3 LTS, 2023-03-30](#233)**
@@ -8,6 +9,106 @@ # 2023 Changelog +### ClickHouse release 23.6, 2023-06-29 + +#### Backward Incompatible Change +* Delete feature `do_not_evict_index_and_mark_files` in the fs cache. This feature was only making things worse. [#51253](https://github.com/ClickHouse/ClickHouse/pull/51253) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Remove ALTER support for experimental LIVE VIEW. [#51287](https://github.com/ClickHouse/ClickHouse/pull/51287) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Decrease the default values for `http_max_field_value_size` and `http_max_field_name_size` to 128 KiB. [#51163](https://github.com/ClickHouse/ClickHouse/pull/51163) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* CGroups metrics related to CPU are replaced with one metric, `CGroupMaxCPU` for better usability. The `Normalized` CPU usage metrics will be normalized to CGroups limits instead of the total number of CPUs when they are set. This closes [#50836](https://github.com/ClickHouse/ClickHouse/issues/50836). [#50835](https://github.com/ClickHouse/ClickHouse/pull/50835) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### New Feature +* The function `transform` as well as `CASE` with value matching started to support all data types. This closes [#29730](https://github.com/ClickHouse/ClickHouse/issues/29730). This closes [#32387](https://github.com/ClickHouse/ClickHouse/issues/32387). This closes [#50827](https://github.com/ClickHouse/ClickHouse/issues/50827). This closes [#31336](https://github.com/ClickHouse/ClickHouse/issues/31336). This closes [#40493](https://github.com/ClickHouse/ClickHouse/issues/40493). [#51351](https://github.com/ClickHouse/ClickHouse/pull/51351) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Added option `--rename_files_after_processing `. This closes [#34207](https://github.com/ClickHouse/ClickHouse/issues/34207). [#49626](https://github.com/ClickHouse/ClickHouse/pull/49626) ([alekseygolub](https://github.com/alekseygolub)). +* Add support for `TRUNCATE` modifier in `INTO OUTFILE` clause. Suggest using `APPEND` or `TRUNCATE` for `INTO OUTFILE` when file exists. [#50950](https://github.com/ClickHouse/ClickHouse/pull/50950) ([alekar](https://github.com/alekar)). +* Add table engine `Redis` and table function `redis`. It allows querying external Redis servers. [#50150](https://github.com/ClickHouse/ClickHouse/pull/50150) ([JackyWoo](https://github.com/JackyWoo)). +* Allow to skip empty files in file/s3/url/hdfs table functions using settings `s3_skip_empty_files`, `hdfs_skip_empty_files`, `engine_file_skip_empty_files`, `engine_url_skip_empty_files`. [#50364](https://github.com/ClickHouse/ClickHouse/pull/50364) ([Kruglov Pavel](https://github.com/Avogar)). +* Add a new setting named `use_mysql_types_in_show_columns` to alter the `SHOW COLUMNS` SQL statement to display MySQL equivalent types when a client is connected via the MySQL compatibility port. [#49577](https://github.com/ClickHouse/ClickHouse/pull/49577) ([Thomas Panetti](https://github.com/tpanetti)). +* Clickhouse-client can now be called with a connection string instead of "--host", "--port", "--user" etc. [#50689](https://github.com/ClickHouse/ClickHouse/pull/50689) ([Alexey Gerasimchuck](https://github.com/Demilivor)). +* Add setting `session_timezone`; it is used as the default timezone for a session when not explicitly specified. [#44149](https://github.com/ClickHouse/ClickHouse/pull/44149) ([Andrey Zvonov](https://github.com/zvonand)). +* Codec DEFLATE_QPL is now controlled via server setting "enable_deflate_qpl_codec" (default: false) instead of setting "allow_experimental_codecs". This marks DEFLATE_QPL non-experimental. [#50775](https://github.com/ClickHouse/ClickHouse/pull/50775) ([Robert Schulze](https://github.com/rschu1ze)). + +#### Performance Improvement +* Improved scheduling of merge selecting and cleanup tasks in `ReplicatedMergeTree`. The tasks will not be executed too frequently when there's nothing to merge or cleanup. Added settings `max_merge_selecting_sleep_ms`, `merge_selecting_sleep_slowdown_factor`, `max_cleanup_delay_period` and `cleanup_thread_preferred_points_per_iteration`. It should close [#31919](https://github.com/ClickHouse/ClickHouse/issues/31919). [#50107](https://github.com/ClickHouse/ClickHouse/pull/50107) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Make filter push down through cross join. [#50605](https://github.com/ClickHouse/ClickHouse/pull/50605) ([Han Fei](https://github.com/hanfei1991)). +* Improve performance with enabled QueryProfiler using thread-local timer_id instead of global object. [#48778](https://github.com/ClickHouse/ClickHouse/pull/48778) ([Jiebin Sun](https://github.com/jiebinn)). +* Rewrite CapnProto input/output format to improve its performance. Map column names and CapnProto fields case insensitive, fix reading/writing of nested structure fields. [#49752](https://github.com/ClickHouse/ClickHouse/pull/49752) ([Kruglov Pavel](https://github.com/Avogar)). +* Optimize parquet write performance for parallel threads. [#50102](https://github.com/ClickHouse/ClickHouse/pull/50102) ([Hongbin Ma](https://github.com/binmahone)). +* Disable `parallelize_output_from_storages` for processing MATERIALIZED VIEWs and storages with one block only. [#50214](https://github.com/ClickHouse/ClickHouse/pull/50214) ([Azat Khuzhin](https://github.com/azat)). +* Merge PR [#46558](https://github.com/ClickHouse/ClickHouse/pull/46558). Avoid block permutation during sort if the block is already sorted. [#50697](https://github.com/ClickHouse/ClickHouse/pull/50697) ([Alexey Milovidov](https://github.com/alexey-milovidov), [Maksim Kita](https://github.com/kitaisreal)). +* Make multiple list requests to ZooKeeper in parallel to speed up reading from system.zookeeper table. [#51042](https://github.com/ClickHouse/ClickHouse/pull/51042) ([Alexander Gololobov](https://github.com/davenger)). +* Speedup initialization of DateTime lookup tables for time zones. This should reduce startup/connect time of clickhouse-client especially in debug build as it is rather heavy. [#51347](https://github.com/ClickHouse/ClickHouse/pull/51347) ([Alexander Gololobov](https://github.com/davenger)). +* Fix data lakes slowness because of synchronous head requests. (Related to Iceberg/Deltalake/Hudi being slow with a lot of files). [#50976](https://github.com/ClickHouse/ClickHouse/pull/50976) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Do not read all the columns from right GLOBAL JOIN table. [#50721](https://github.com/ClickHouse/ClickHouse/pull/50721) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). + +#### Experimental Feature +* Support parallel replicas with the analyzer. [#50441](https://github.com/ClickHouse/ClickHouse/pull/50441) ([Raúl Marín](https://github.com/Algunenano)). +* Add random sleep before large merges/mutations execution to split load more evenly between replicas in case of zero-copy replication. [#51282](https://github.com/ClickHouse/ClickHouse/pull/51282) ([alesapin](https://github.com/alesapin)). +* Do not replicate `ALTER PARTITION` queries and mutations through `Replicated` database if it has only one shard and the underlying table is `ReplicatedMergeTree`. [#51049](https://github.com/ClickHouse/ClickHouse/pull/51049) ([Alexander Tokmakov](https://github.com/tavplubix)). + +#### Improvement +* Relax the thresholds for "too many parts" to be more modern. Return the backpressure during long-running insert queries. [#50856](https://github.com/ClickHouse/ClickHouse/pull/50856) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Allow to cast IPv6 to IPv4 address for CIDR ::ffff:0:0/96 (IPv4-mapped addresses). [#49759](https://github.com/ClickHouse/ClickHouse/pull/49759) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Update MongoDB protocol to support MongoDB 5.1 version and newer. Support for the versions with the old protocol (<3.6) is preserved. Closes [#45621](https://github.com/ClickHouse/ClickHouse/issues/45621), [#49879](https://github.com/ClickHouse/ClickHouse/issues/49879). [#50061](https://github.com/ClickHouse/ClickHouse/pull/50061) ([Nikolay Degterinsky](https://github.com/evillique)). +* Add setting `input_format_max_bytes_to_read_for_schema_inference` to limit the number of bytes to read in schema inference. Closes [#50577](https://github.com/ClickHouse/ClickHouse/issues/50577). [#50592](https://github.com/ClickHouse/ClickHouse/pull/50592) ([Kruglov Pavel](https://github.com/Avogar)). +* Respect setting `input_format_null_as_default` in schema inference. [#50602](https://github.com/ClickHouse/ClickHouse/pull/50602) ([Kruglov Pavel](https://github.com/Avogar)). +* Allow to skip trailing empty lines in CSV/TSV/CustomSeparated formats via settings `input_format_csv_skip_trailing_empty_lines`, `input_format_tsv_skip_trailing_empty_lines` and `input_format_custom_skip_trailing_empty_lines` (disabled by default). Closes [#49315](https://github.com/ClickHouse/ClickHouse/issues/49315). [#50635](https://github.com/ClickHouse/ClickHouse/pull/50635) ([Kruglov Pavel](https://github.com/Avogar)). +* Functions "toDateOrDefault|OrNull" and "accuateCast[OrDefault|OrNull]" now correctly parse numeric arguments. [#50709](https://github.com/ClickHouse/ClickHouse/pull/50709) ([Dmitry Kardymon](https://github.com/kardymonds)). +* Support CSV with whitespace or `\t` field delimiters, and these delimiters are supported in Spark. [#50712](https://github.com/ClickHouse/ClickHouse/pull/50712) ([KevinyhZou](https://github.com/KevinyhZou)). +* Settings `number_of_mutations_to_delay` and `number_of_mutations_to_throw` are enabled by default now with values 500 and 1000 respectively. [#50726](https://github.com/ClickHouse/ClickHouse/pull/50726) ([Anton Popov](https://github.com/CurtizJ)). +* The dashboard correctly shows missing values. This closes [#50831](https://github.com/ClickHouse/ClickHouse/issues/50831). [#50832](https://github.com/ClickHouse/ClickHouse/pull/50832) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Added the possibility to use date and time arguments in the syslog timestamp format in functions `parseDateTimeBestEffort*` and `parseDateTime64BestEffort*`. [#50925](https://github.com/ClickHouse/ClickHouse/pull/50925) ([Victor Krasnov](https://github.com/sirvickr)). +* Command line parameter "--password" in clickhouse-client can now be specified only once. [#50966](https://github.com/ClickHouse/ClickHouse/pull/50966) ([Alexey Gerasimchuck](https://github.com/Demilivor)). +* Use `hash_of_all_files` from `system.parts` to check identity of parts during on-cluster backups. [#50997](https://github.com/ClickHouse/ClickHouse/pull/50997) ([Vitaly Baranov](https://github.com/vitlibar)). +* The system table zookeeper_connection connected_time identifies the time when the connection is established (standard format), and session_uptime_elapsed_seconds is added, which labels the duration of the established connection session (in seconds). [#51026](https://github.com/ClickHouse/ClickHouse/pull/51026) ([郭小龙](https://github.com/guoxiaolongzte)). +* Improve the progress bar for file/s3/hdfs/url table functions by using chunk size from source data and using incremental total size counting in each thread. Fix the progress bar for *Cluster functions. This closes [#47250](https://github.com/ClickHouse/ClickHouse/issues/47250). [#51088](https://github.com/ClickHouse/ClickHouse/pull/51088) ([Kruglov Pavel](https://github.com/Avogar)). +* Add total_bytes_to_read to the Progress packet in TCP protocol for better Progress bar. [#51158](https://github.com/ClickHouse/ClickHouse/pull/51158) ([Kruglov Pavel](https://github.com/Avogar)). +* Better checking of data parts on disks with filesystem cache. [#51164](https://github.com/ClickHouse/ClickHouse/pull/51164) ([Anton Popov](https://github.com/CurtizJ)). +* Fix sometimes not correct current_elements_num in fs cache. [#51242](https://github.com/ClickHouse/ClickHouse/pull/51242) ([Kseniia Sumarokova](https://github.com/kssenii)). + +#### Build/Testing/Packaging Improvement +* Add embedded keeper-client to standalone keeper binary. [#50964](https://github.com/ClickHouse/ClickHouse/pull/50964) ([pufit](https://github.com/pufit)). +* Actual LZ4 version is used now. [#50621](https://github.com/ClickHouse/ClickHouse/pull/50621) ([Nikita Taranov](https://github.com/nickitat)). +* ClickHouse server will print the list of changed settings on fatal errors. This closes [#51137](https://github.com/ClickHouse/ClickHouse/issues/51137). [#51138](https://github.com/ClickHouse/ClickHouse/pull/51138) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Allow building ClickHouse with clang-17. [#51300](https://github.com/ClickHouse/ClickHouse/pull/51300) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* [SQLancer](https://github.com/sqlancer/sqlancer) check is considered stable as bugs that were triggered by it are fixed. Now failures of SQLancer check will be reported as failed check status. [#51340](https://github.com/ClickHouse/ClickHouse/pull/51340) ([Ilya Yatsishin](https://github.com/qoega)). +* Split huge `RUN` in Dockerfile into smaller conditional. Install the necessary tools on demand in the same `RUN` layer, and remove them after that. Upgrade the OS only once at the beginning. Use a modern way to check the signed repository. Downgrade the base repo to ubuntu:20.04 to address the issues on older docker versions. Upgrade golang version to address golang vulnerabilities. [#51504](https://github.com/ClickHouse/ClickHouse/pull/51504) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Report loading status for executable dictionaries correctly [#48775](https://github.com/ClickHouse/ClickHouse/pull/48775) ([Anton Kozlov](https://github.com/tonickkozlov)). +* Proper mutation of skip indices and projections [#50104](https://github.com/ClickHouse/ClickHouse/pull/50104) ([Amos Bird](https://github.com/amosbird)). +* Cleanup moving parts [#50489](https://github.com/ClickHouse/ClickHouse/pull/50489) ([vdimir](https://github.com/vdimir)). +* Fix backward compatibility for IP types hashing in aggregate functions [#50551](https://github.com/ClickHouse/ClickHouse/pull/50551) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Fix Log family table return wrong rows count after truncate [#50585](https://github.com/ClickHouse/ClickHouse/pull/50585) ([flynn](https://github.com/ucasfl)). +* Fix bug in `uniqExact` parallel merging [#50590](https://github.com/ClickHouse/ClickHouse/pull/50590) ([Nikita Taranov](https://github.com/nickitat)). +* Revert recent grace hash join changes [#50699](https://github.com/ClickHouse/ClickHouse/pull/50699) ([vdimir](https://github.com/vdimir)). +* Query Cache: Try to fix bad cast from `ColumnConst` to `ColumnVector` [#50704](https://github.com/ClickHouse/ClickHouse/pull/50704) ([Robert Schulze](https://github.com/rschu1ze)). +* Avoid storing logs in Keeper containing unknown operation [#50751](https://github.com/ClickHouse/ClickHouse/pull/50751) ([Antonio Andelic](https://github.com/antonio2368)). +* SummingMergeTree support for DateTime64 [#50797](https://github.com/ClickHouse/ClickHouse/pull/50797) ([Jordi Villar](https://github.com/jrdi)). +* Add compatibility setting for non-const timezones [#50834](https://github.com/ClickHouse/ClickHouse/pull/50834) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix hashing of LDAP params in the cache entries [#50865](https://github.com/ClickHouse/ClickHouse/pull/50865) ([Julian Maicher](https://github.com/jmaicher)). +* Fallback to parsing big integer from String instead of exception in Parquet format [#50873](https://github.com/ClickHouse/ClickHouse/pull/50873) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix checking the lock file too often while writing a backup [#50889](https://github.com/ClickHouse/ClickHouse/pull/50889) ([Vitaly Baranov](https://github.com/vitlibar)). +* Do not apply projection if read-in-order was enabled. [#50923](https://github.com/ClickHouse/ClickHouse/pull/50923) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix race in the Azure blob storage iterator [#50936](https://github.com/ClickHouse/ClickHouse/pull/50936) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Fix erroneous `sort_description` propagation in `CreatingSets` [#50955](https://github.com/ClickHouse/ClickHouse/pull/50955) ([Nikita Taranov](https://github.com/nickitat)). +* Fix Iceberg v2 optional metadata parsing [#50974](https://github.com/ClickHouse/ClickHouse/pull/50974) ([Kseniia Sumarokova](https://github.com/kssenii)). +* MaterializedMySQL: Keep parentheses for empty table overrides [#50977](https://github.com/ClickHouse/ClickHouse/pull/50977) ([Val Doroshchuk](https://github.com/valbok)). +* Fix crash in BackupCoordinationStageSync::setError() [#51012](https://github.com/ClickHouse/ClickHouse/pull/51012) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix subtly broken copy-on-write of ColumnLowCardinality dictionary [#51064](https://github.com/ClickHouse/ClickHouse/pull/51064) ([Michael Kolupaev](https://github.com/al13n321)). +* Generate safe IVs [#51086](https://github.com/ClickHouse/ClickHouse/pull/51086) ([Salvatore Mesoraca](https://github.com/aiven-sal)). +* Fix ineffective query cache for SELECTs with subqueries [#51132](https://github.com/ClickHouse/ClickHouse/pull/51132) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix Set index with constant nullable comparison. [#51205](https://github.com/ClickHouse/ClickHouse/pull/51205) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix a crash in s3 and s3Cluster functions [#51209](https://github.com/ClickHouse/ClickHouse/pull/51209) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix a crash with compiled expressions [#51231](https://github.com/ClickHouse/ClickHouse/pull/51231) ([LiuNeng](https://github.com/liuneng1994)). +* Fix use-after-free in StorageURL when switching URLs [#51260](https://github.com/ClickHouse/ClickHouse/pull/51260) ([Michael Kolupaev](https://github.com/al13n321)). +* Updated check for parameterized view [#51272](https://github.com/ClickHouse/ClickHouse/pull/51272) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Fix multiple writing of same file to backup [#51299](https://github.com/ClickHouse/ClickHouse/pull/51299) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix fuzzer failure in ActionsDAG [#51301](https://github.com/ClickHouse/ClickHouse/pull/51301) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Remove garbage from function `transform` [#51350](https://github.com/ClickHouse/ClickHouse/pull/51350) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + + ### ClickHouse release 23.5, 2023-06-08 #### Upgrade Notes diff --git a/README.md b/README.md index eac036c2d9b..1036e1a97e1 100644 --- a/README.md +++ b/README.md @@ -16,18 +16,19 @@ curl https://clickhouse.com/ | sh * [YouTube channel](https://www.youtube.com/c/ClickHouseDB) has a lot of content about ClickHouse in video format. * [Slack](https://clickhouse.com/slack) and [Telegram](https://telegram.me/clickhouse_en) allow chatting with ClickHouse users in real-time. * [Blog](https://clickhouse.com/blog/) contains various ClickHouse-related articles, as well as announcements and reports about events. -* [Code Browser (Woboq)](https://clickhouse.com/codebrowser/ClickHouse/index.html) with syntax highlight and navigation. -* [Code Browser (github.dev)](https://github.dev/ClickHouse/ClickHouse) with syntax highlight, powered by github.dev. +* [Code Browser (Woboq)](https://clickhouse.com/codebrowser/ClickHouse/index.html) with syntax highlighting and navigation. +* [Code Browser (github.dev)](https://github.dev/ClickHouse/ClickHouse) with syntax highlighting, powered by github.dev. +* [Static Analysis (SonarCloud)](https://sonarcloud.io/project/issues?resolved=false&id=ClickHouse_ClickHouse) proposes C++ quality improvements. * [Contacts](https://clickhouse.com/company/contact) can help to get your questions answered if there are any. ## Upcoming Events * [**v23.6 Release Webinar**](https://clickhouse.com/company/events/v23-6-release-call?utm_source=github&utm_medium=social&utm_campaign=release-webinar-2023-06) - Jun 29 - 23.6 is rapidly approaching. Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release. -* [**ClickHouse Meetup in Paris**](https://www.meetup.com/clickhouse-france-user-group/events/294283460) - Jul 4 * [**ClickHouse Meetup in Boston**](https://www.meetup.com/clickhouse-boston-user-group/events/293913596) - Jul 18 * [**ClickHouse Meetup in NYC**](https://www.meetup.com/clickhouse-new-york-user-group/events/293913441) - Jul 19 * [**ClickHouse Meetup in Toronto**](https://www.meetup.com/clickhouse-toronto-user-group/events/294183127) - Jul 20 * [**ClickHouse Meetup in Singapore**](https://www.meetup.com/clickhouse-singapore-meetup-group/events/294428050/) - Jul 27 +* [**ClickHouse Meetup in Paris**](https://www.meetup.com/clickhouse-france-user-group/events/294283460) - Sep 12 Also, keep an eye out for upcoming meetups around the world. Somewhere else you want us to be? Please feel free to reach out to tyler clickhouse com. diff --git a/SECURITY.md b/SECURITY.md index 1864eb6e9e5..4ba5f13d09c 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -13,6 +13,7 @@ The following versions of ClickHouse server are currently being supported with s | Version | Supported | |:-|:-| +| 23.6 | ✔️ | | 23.5 | ✔️ | | 23.4 | ✔️ | | 23.3 | ✔️ | diff --git a/base/base/IPv4andIPv6.h b/base/base/IPv4andIPv6.h index 7b745ec7b84..e2f93b54124 100644 --- a/base/base/IPv4andIPv6.h +++ b/base/base/IPv4andIPv6.h @@ -2,21 +2,23 @@ #include #include +#include #include namespace DB { - using IPv4 = StrongTypedef; + struct IPv4 : StrongTypedef + { + using StrongTypedef::StrongTypedef; + using StrongTypedef::operator=; + constexpr explicit IPv4(UInt64 value): StrongTypedef(static_cast(value)) {} + }; struct IPv6 : StrongTypedef { - constexpr IPv6() = default; - constexpr explicit IPv6(const UInt128 & x) : StrongTypedef(x) {} - constexpr explicit IPv6(UInt128 && x) : StrongTypedef(std::move(x)) {} - - IPv6 & operator=(const UInt128 & rhs) { StrongTypedef::operator=(rhs); return *this; } - IPv6 & operator=(UInt128 && rhs) { StrongTypedef::operator=(std::move(rhs)); return *this; } + using StrongTypedef::StrongTypedef; + using StrongTypedef::operator=; bool operator<(const IPv6 & rhs) const { @@ -54,12 +56,22 @@ namespace DB namespace std { + /// For historical reasons we hash IPv6 as a FixedString(16) template <> struct hash { size_t operator()(const DB::IPv6 & x) const { - return std::hash()(x.toUnderType()); + return std::hash{}(std::string_view(reinterpret_cast(&x.toUnderType()), IPV6_BINARY_LENGTH)); + } + }; + + template <> + struct hash + { + size_t operator()(const DB::IPv4 & x) const + { + return std::hash()(x.toUnderType()); } }; } diff --git a/base/base/hex.h b/base/base/hex.h index b8cf95db893..937218fec5a 100644 --- a/base/base/hex.h +++ b/base/base/hex.h @@ -4,212 +4,288 @@ #include #include "types.h" -/// Maps 0..15 to 0..9A..F or 0..9a..f correspondingly. +namespace CityHash_v1_0_2 { struct uint128; } -constexpr inline std::string_view hex_digit_to_char_uppercase_table = "0123456789ABCDEF"; -constexpr inline std::string_view hex_digit_to_char_lowercase_table = "0123456789abcdef"; +namespace wide +{ + template + class integer; +} + +namespace impl +{ + /// Maps 0..15 to 0..9A..F or 0..9a..f correspondingly. + constexpr inline std::string_view hex_digit_to_char_uppercase_table = "0123456789ABCDEF"; + constexpr inline std::string_view hex_digit_to_char_lowercase_table = "0123456789abcdef"; + + /// Maps 0..255 to 00..FF or 00..ff correspondingly. + constexpr inline std::string_view hex_byte_to_char_uppercase_table = // + "000102030405060708090A0B0C0D0E0F" + "101112131415161718191A1B1C1D1E1F" + "202122232425262728292A2B2C2D2E2F" + "303132333435363738393A3B3C3D3E3F" + "404142434445464748494A4B4C4D4E4F" + "505152535455565758595A5B5C5D5E5F" + "606162636465666768696A6B6C6D6E6F" + "707172737475767778797A7B7C7D7E7F" + "808182838485868788898A8B8C8D8E8F" + "909192939495969798999A9B9C9D9E9F" + "A0A1A2A3A4A5A6A7A8A9AAABACADAEAF" + "B0B1B2B3B4B5B6B7B8B9BABBBCBDBEBF" + "C0C1C2C3C4C5C6C7C8C9CACBCCCDCECF" + "D0D1D2D3D4D5D6D7D8D9DADBDCDDDEDF" + "E0E1E2E3E4E5E6E7E8E9EAEBECEDEEEF" + "F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF"; + + constexpr inline std::string_view hex_byte_to_char_lowercase_table = // + "000102030405060708090a0b0c0d0e0f" + "101112131415161718191a1b1c1d1e1f" + "202122232425262728292a2b2c2d2e2f" + "303132333435363738393a3b3c3d3e3f" + "404142434445464748494a4b4c4d4e4f" + "505152535455565758595a5b5c5d5e5f" + "606162636465666768696a6b6c6d6e6f" + "707172737475767778797a7b7c7d7e7f" + "808182838485868788898a8b8c8d8e8f" + "909192939495969798999a9b9c9d9e9f" + "a0a1a2a3a4a5a6a7a8a9aaabacadaeaf" + "b0b1b2b3b4b5b6b7b8b9babbbcbdbebf" + "c0c1c2c3c4c5c6c7c8c9cacbcccdcecf" + "d0d1d2d3d4d5d6d7d8d9dadbdcdddedf" + "e0e1e2e3e4e5e6e7e8e9eaebecedeeef" + "f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff"; + + /// Maps 0..255 to 00000000..11111111 correspondingly. + constexpr inline std::string_view bin_byte_to_char_table = // + "0000000000000001000000100000001100000100000001010000011000000111" + "0000100000001001000010100000101100001100000011010000111000001111" + "0001000000010001000100100001001100010100000101010001011000010111" + "0001100000011001000110100001101100011100000111010001111000011111" + "0010000000100001001000100010001100100100001001010010011000100111" + "0010100000101001001010100010101100101100001011010010111000101111" + "0011000000110001001100100011001100110100001101010011011000110111" + "0011100000111001001110100011101100111100001111010011111000111111" + "0100000001000001010000100100001101000100010001010100011001000111" + "0100100001001001010010100100101101001100010011010100111001001111" + "0101000001010001010100100101001101010100010101010101011001010111" + "0101100001011001010110100101101101011100010111010101111001011111" + "0110000001100001011000100110001101100100011001010110011001100111" + "0110100001101001011010100110101101101100011011010110111001101111" + "0111000001110001011100100111001101110100011101010111011001110111" + "0111100001111001011110100111101101111100011111010111111001111111" + "1000000010000001100000101000001110000100100001011000011010000111" + "1000100010001001100010101000101110001100100011011000111010001111" + "1001000010010001100100101001001110010100100101011001011010010111" + "1001100010011001100110101001101110011100100111011001111010011111" + "1010000010100001101000101010001110100100101001011010011010100111" + "1010100010101001101010101010101110101100101011011010111010101111" + "1011000010110001101100101011001110110100101101011011011010110111" + "1011100010111001101110101011101110111100101111011011111010111111" + "1100000011000001110000101100001111000100110001011100011011000111" + "1100100011001001110010101100101111001100110011011100111011001111" + "1101000011010001110100101101001111010100110101011101011011010111" + "1101100011011001110110101101101111011100110111011101111011011111" + "1110000011100001111000101110001111100100111001011110011011100111" + "1110100011101001111010101110101111101100111011011110111011101111" + "1111000011110001111100101111001111110100111101011111011011110111" + "1111100011111001111110101111101111111100111111011111111011111111"; + + /// Maps 0..9, A..F, a..f to 0..15. Other chars are mapped to implementation specific value. + constexpr inline std::string_view hex_char_to_digit_table + = {"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\xff\xff\xff\xff\xff\xff" //0-9 + "\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff" //A-Z + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff" //a-z + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff", + 256}; + + /// Converts a hex digit '0'..'f' or '0'..'F' to its value 0..15. + constexpr UInt8 unhexDigit(char c) + { + return hex_char_to_digit_table[static_cast(c)]; + } + + /// Converts an unsigned integer in the native endian to hexadecimal representation and back. Used as a base class for HexConversion. + template + struct HexConversionUInt + { + static const constexpr size_t num_hex_digits = sizeof(TUInt) * 2; + + static void hex(TUInt uint_, char * out, std::string_view table) + { + union + { + TUInt value; + UInt8 uint8[sizeof(TUInt)]; + }; + + value = uint_; + + for (size_t i = 0; i < sizeof(TUInt); ++i) + { + if constexpr (std::endian::native == std::endian::little) + memcpy(out + i * 2, &table[static_cast(uint8[sizeof(TUInt) - 1 - i]) * 2], 2); + else + memcpy(out + i * 2, &table[static_cast(uint8[i]) * 2], 2); + } + } + + static TUInt unhex(const char * data) + { + TUInt res; + if constexpr (sizeof(TUInt) == 1) + { + res = static_cast(unhexDigit(data[0])) * 0x10 + static_cast(unhexDigit(data[1])); + } + else if constexpr (sizeof(TUInt) == 2) + { + res = static_cast(unhexDigit(data[0])) * 0x1000 + static_cast(unhexDigit(data[1])) * 0x100 + + static_cast(unhexDigit(data[2])) * 0x10 + static_cast(unhexDigit(data[3])); + } + else if constexpr ((sizeof(TUInt) <= 8) || ((sizeof(TUInt) % 8) != 0)) + { + res = 0; + for (size_t i = 0; i < sizeof(TUInt) * 2; ++i, ++data) + { + res <<= 4; + res += unhexDigit(*data); + } + } + else + { + res = 0; + for (size_t i = 0; i < sizeof(TUInt) / 8; ++i, data += 16) + { + res <<= 64; + res += HexConversionUInt::unhex(data); + } + } + return res; + } + }; + + /// Helper template class to convert a value of any supported type to hexadecimal representation and back. + template + struct HexConversion; + + template + struct HexConversion>> : public HexConversionUInt {}; + + template + struct HexConversion> : public HexConversionUInt> {}; + + template /// Partial specialization here allows not to include in this header. + struct HexConversion>> + { + static const constexpr size_t num_hex_digits = 32; + + static void hex(const CityHashUInt128 & uint_, char * out, std::string_view table) + { + HexConversion::hex(uint_.high64, out, table); + HexConversion::hex(uint_.low64, out + 16, table); + } + + static CityHashUInt128 unhex(const char * data) + { + CityHashUInt128 res; + res.high64 = HexConversion::unhex(data); + res.low64 = HexConversion::unhex(data + 16); + return res; + } + }; +} + +/// Produces a hexadecimal representation of an integer value with leading zeros (for checksums). +/// The function supports native integer types, wide::integer, CityHash_v1_0_2::uint128. +/// It can be used with signed types as well, however they are written as corresponding unsigned numbers +/// using two's complement (i.e. for example "-1" is written as "0xFF", not as "-0x01"). +template +void writeHexUIntUppercase(const T & value, char * out) +{ + impl::HexConversion::hex(value, out, impl::hex_byte_to_char_uppercase_table); +} + +template +void writeHexUIntLowercase(const T & value, char * out) +{ + impl::HexConversion::hex(value, out, impl::hex_byte_to_char_lowercase_table); +} + +template +std::string getHexUIntUppercase(const T & value) +{ + std::string res(impl::HexConversion::num_hex_digits, '\0'); + writeHexUIntUppercase(value, res.data()); + return res; +} + +template +std::string getHexUIntLowercase(const T & value) +{ + std::string res(impl::HexConversion::num_hex_digits, '\0'); + writeHexUIntLowercase(value, res.data()); + return res; +} constexpr char hexDigitUppercase(unsigned char c) { - return hex_digit_to_char_uppercase_table[c]; + return impl::hex_digit_to_char_uppercase_table[c]; } + constexpr char hexDigitLowercase(unsigned char c) { - return hex_digit_to_char_lowercase_table[c]; + return impl::hex_digit_to_char_lowercase_table[c]; } -/// Maps 0..255 to 00..FF or 00..ff correspondingly - -constexpr inline std::string_view hex_byte_to_char_uppercase_table = // - "000102030405060708090A0B0C0D0E0F" - "101112131415161718191A1B1C1D1E1F" - "202122232425262728292A2B2C2D2E2F" - "303132333435363738393A3B3C3D3E3F" - "404142434445464748494A4B4C4D4E4F" - "505152535455565758595A5B5C5D5E5F" - "606162636465666768696A6B6C6D6E6F" - "707172737475767778797A7B7C7D7E7F" - "808182838485868788898A8B8C8D8E8F" - "909192939495969798999A9B9C9D9E9F" - "A0A1A2A3A4A5A6A7A8A9AAABACADAEAF" - "B0B1B2B3B4B5B6B7B8B9BABBBCBDBEBF" - "C0C1C2C3C4C5C6C7C8C9CACBCCCDCECF" - "D0D1D2D3D4D5D6D7D8D9DADBDCDDDEDF" - "E0E1E2E3E4E5E6E7E8E9EAEBECEDEEEF" - "F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF"; - -constexpr inline std::string_view hex_byte_to_char_lowercase_table = // - "000102030405060708090a0b0c0d0e0f" - "101112131415161718191a1b1c1d1e1f" - "202122232425262728292a2b2c2d2e2f" - "303132333435363738393a3b3c3d3e3f" - "404142434445464748494a4b4c4d4e4f" - "505152535455565758595a5b5c5d5e5f" - "606162636465666768696a6b6c6d6e6f" - "707172737475767778797a7b7c7d7e7f" - "808182838485868788898a8b8c8d8e8f" - "909192939495969798999a9b9c9d9e9f" - "a0a1a2a3a4a5a6a7a8a9aaabacadaeaf" - "b0b1b2b3b4b5b6b7b8b9babbbcbdbebf" - "c0c1c2c3c4c5c6c7c8c9cacbcccdcecf" - "d0d1d2d3d4d5d6d7d8d9dadbdcdddedf" - "e0e1e2e3e4e5e6e7e8e9eaebecedeeef" - "f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff"; - inline void writeHexByteUppercase(UInt8 byte, void * out) { - memcpy(out, &hex_byte_to_char_uppercase_table[static_cast(byte) * 2], 2); + memcpy(out, &impl::hex_byte_to_char_uppercase_table[static_cast(byte) * 2], 2); } inline void writeHexByteLowercase(UInt8 byte, void * out) { - memcpy(out, &hex_byte_to_char_lowercase_table[static_cast(byte) * 2], 2); + memcpy(out, &impl::hex_byte_to_char_lowercase_table[static_cast(byte) * 2], 2); } -constexpr inline std::string_view bin_byte_to_char_table = // - "0000000000000001000000100000001100000100000001010000011000000111" - "0000100000001001000010100000101100001100000011010000111000001111" - "0001000000010001000100100001001100010100000101010001011000010111" - "0001100000011001000110100001101100011100000111010001111000011111" - "0010000000100001001000100010001100100100001001010010011000100111" - "0010100000101001001010100010101100101100001011010010111000101111" - "0011000000110001001100100011001100110100001101010011011000110111" - "0011100000111001001110100011101100111100001111010011111000111111" - "0100000001000001010000100100001101000100010001010100011001000111" - "0100100001001001010010100100101101001100010011010100111001001111" - "0101000001010001010100100101001101010100010101010101011001010111" - "0101100001011001010110100101101101011100010111010101111001011111" - "0110000001100001011000100110001101100100011001010110011001100111" - "0110100001101001011010100110101101101100011011010110111001101111" - "0111000001110001011100100111001101110100011101010111011001110111" - "0111100001111001011110100111101101111100011111010111111001111111" - "1000000010000001100000101000001110000100100001011000011010000111" - "1000100010001001100010101000101110001100100011011000111010001111" - "1001000010010001100100101001001110010100100101011001011010010111" - "1001100010011001100110101001101110011100100111011001111010011111" - "1010000010100001101000101010001110100100101001011010011010100111" - "1010100010101001101010101010101110101100101011011010111010101111" - "1011000010110001101100101011001110110100101101011011011010110111" - "1011100010111001101110101011101110111100101111011011111010111111" - "1100000011000001110000101100001111000100110001011100011011000111" - "1100100011001001110010101100101111001100110011011100111011001111" - "1101000011010001110100101101001111010100110101011101011011010111" - "1101100011011001110110101101101111011100110111011101111011011111" - "1110000011100001111000101110001111100100111001011110011011100111" - "1110100011101001111010101110101111101100111011011110111011101111" - "1111000011110001111100101111001111110100111101011111011011110111" - "1111100011111001111110101111101111111100111111011111111011111111"; - -inline void writeBinByte(UInt8 byte, void * out) +/// Converts a hex representation with leading zeros back to an integer value. +/// The function supports native integer types, wide::integer, CityHash_v1_0_2::uint128. +template +constexpr T unhexUInt(const char * data) { - memcpy(out, &bin_byte_to_char_table[static_cast(byte) * 8], 8); + return impl::HexConversion::unhex(data); } -/// Produces hex representation of an unsigned int with leading zeros (for checksums) -template -inline void writeHexUIntImpl(TUInt uint_, char * out, std::string_view table) -{ - union - { - TUInt value; - UInt8 uint8[sizeof(TUInt)]; - }; - - value = uint_; - - for (size_t i = 0; i < sizeof(TUInt); ++i) - { - if constexpr (std::endian::native == std::endian::little) - memcpy(out + i * 2, &table[static_cast(uint8[sizeof(TUInt) - 1 - i]) * 2], 2); - else - memcpy(out + i * 2, &table[static_cast(uint8[i]) * 2], 2); - } -} - -template -inline void writeHexUIntUppercase(TUInt uint_, char * out) -{ - writeHexUIntImpl(uint_, out, hex_byte_to_char_uppercase_table); -} - -template -inline void writeHexUIntLowercase(TUInt uint_, char * out) -{ - writeHexUIntImpl(uint_, out, hex_byte_to_char_lowercase_table); -} - -template -std::string getHexUIntUppercase(TUInt uint_) -{ - std::string res(sizeof(TUInt) * 2, '\0'); - writeHexUIntUppercase(uint_, res.data()); - return res; -} - -template -std::string getHexUIntLowercase(TUInt uint_) -{ - std::string res(sizeof(TUInt) * 2, '\0'); - writeHexUIntLowercase(uint_, res.data()); - return res; -} - -/// Maps 0..9, A..F, a..f to 0..15. Other chars are mapped to implementation specific value. - -constexpr inline std::string_view hex_char_to_digit_table - = {"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" - "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" - "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" - "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\xff\xff\xff\xff\xff\xff" //0-9 - "\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff" //A-Z - "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" - "\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff" //a-z - "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" - "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" - "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" - "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" - "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" - "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" - "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" - "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" - "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff", - 256}; - +/// Converts a hexadecimal digit '0'..'f' or '0'..'F' to UInt8. constexpr UInt8 unhex(char c) { - return hex_char_to_digit_table[static_cast(c)]; + return impl::unhexDigit(c); } +/// Converts two hexadecimal digits to UInt8. constexpr UInt8 unhex2(const char * data) { - return static_cast(unhex(data[0])) * 0x10 + static_cast(unhex(data[1])); + return unhexUInt(data); } +/// Converts four hexadecimal digits to UInt16. constexpr UInt16 unhex4(const char * data) { - return static_cast(unhex(data[0])) * 0x1000 + static_cast(unhex(data[1])) * 0x100 - + static_cast(unhex(data[2])) * 0x10 + static_cast(unhex(data[3])); + return unhexUInt(data); } -template -constexpr TUInt unhexUInt(const char * data) +/// Produces a binary representation of a single byte. +inline void writeBinByte(UInt8 byte, void * out) { - TUInt res = 0; - if constexpr ((sizeof(TUInt) <= 8) || ((sizeof(TUInt) % 8) != 0)) - { - for (size_t i = 0; i < sizeof(TUInt) * 2; ++i, ++data) - { - res <<= 4; - res += unhex(*data); - } - } - else - { - for (size_t i = 0; i < sizeof(TUInt) / 8; ++i, data += 16) - { - res <<= 64; - res += unhexUInt(data); - } - } - return res; + memcpy(out, &impl::bin_byte_to_char_table[static_cast(byte) * 8], 8); } diff --git a/base/base/wide_integer_impl.h b/base/base/wide_integer_impl.h index 4a80c176829..411841e6d9f 100644 --- a/base/base/wide_integer_impl.h +++ b/base/base/wide_integer_impl.h @@ -27,6 +27,8 @@ using FromDoubleIntermediateType = long double; using FromDoubleIntermediateType = boost::multiprecision::cpp_bin_float_double_extended; #endif +namespace CityHash_v1_0_2 { struct uint128; } + namespace wide { @@ -281,6 +283,17 @@ struct integer::_impl } } + template + constexpr static void wide_integer_from_cityhash_uint128(integer & self, const CityHashUInt128 & value) noexcept + { + static_assert(sizeof(item_count) >= 2); + + if constexpr (std::endian::native == std::endian::little) + wide_integer_from_tuple_like(self, std::make_pair(value.low64, value.high64)); + else + wide_integer_from_tuple_like(self, std::make_pair(value.high64, value.low64)); + } + /** * N.B. t is constructed from double, so max(t) = max(double) ~ 2^310 * the recursive call happens when t / 2^64 > 2^64, so there won't be more than 5 of them. @@ -1036,6 +1049,8 @@ constexpr integer::integer(T rhs) noexcept _impl::wide_integer_from_wide_integer(*this, rhs); else if constexpr (IsTupleLike::value) _impl::wide_integer_from_tuple_like(*this, rhs); + else if constexpr (std::is_same_v, CityHash_v1_0_2::uint128>) + _impl::wide_integer_from_cityhash_uint128(*this, rhs); else _impl::wide_integer_from_builtin(*this, rhs); } @@ -1051,6 +1066,8 @@ constexpr integer::integer(std::initializer_list il) noexcept _impl::wide_integer_from_wide_integer(*this, *il.begin()); else if constexpr (IsTupleLike::value) _impl::wide_integer_from_tuple_like(*this, *il.begin()); + else if constexpr (std::is_same_v, CityHash_v1_0_2::uint128>) + _impl::wide_integer_from_cityhash_uint128(*this, *il.begin()); else _impl::wide_integer_from_builtin(*this, *il.begin()); } @@ -1088,6 +1105,8 @@ constexpr integer & integer::operator=(T rhs) noexce { if constexpr (IsTupleLike::value) _impl::wide_integer_from_tuple_like(*this, rhs); + else if constexpr (std::is_same_v, CityHash_v1_0_2::uint128>) + _impl::wide_integer_from_cityhash_uint128(*this, rhs); else _impl::wide_integer_from_builtin(*this, rhs); return *this; diff --git a/base/poco/Foundation/CMakeLists.txt b/base/poco/Foundation/CMakeLists.txt index 358f49ed055..d0dde8a51a5 100644 --- a/base/poco/Foundation/CMakeLists.txt +++ b/base/poco/Foundation/CMakeLists.txt @@ -87,7 +87,6 @@ set (SRCS src/LoggingRegistry.cpp src/LogStream.cpp src/MD5Engine.cpp - src/MemoryPool.cpp src/MemoryStream.cpp src/Message.cpp src/Mutex.cpp diff --git a/base/poco/Foundation/include/Poco/MemoryPool.h b/base/poco/Foundation/include/Poco/MemoryPool.h deleted file mode 100644 index 9ab12081b5f..00000000000 --- a/base/poco/Foundation/include/Poco/MemoryPool.h +++ /dev/null @@ -1,116 +0,0 @@ -// -// MemoryPool.h -// -// Library: Foundation -// Package: Core -// Module: MemoryPool -// -// Definition of the MemoryPool class. -// -// Copyright (c) 2005-2006, Applied Informatics Software Engineering GmbH. -// and Contributors. -// -// SPDX-License-Identifier: BSL-1.0 -// - - -#ifndef Foundation_MemoryPool_INCLUDED -#define Foundation_MemoryPool_INCLUDED - - -#include -#include -#include "Poco/Foundation.h" -#include "Poco/Mutex.h" - - -namespace Poco -{ - - -class Foundation_API MemoryPool -/// A simple pool for fixed-size memory blocks. -/// -/// The main purpose of this class is to speed-up -/// memory allocations, as well as to reduce memory -/// fragmentation in situations where the same blocks -/// are allocated all over again, such as in server -/// applications. -/// -/// All allocated blocks are retained for future use. -/// A limit on the number of blocks can be specified. -/// Blocks can be preallocated. -{ -public: - MemoryPool(std::size_t blockSize, int preAlloc = 0, int maxAlloc = 0); - /// Creates a MemoryPool for blocks with the given blockSize. - /// The number of blocks given in preAlloc are preallocated. - - ~MemoryPool(); - - void * get(); - /// Returns a memory block. If there are no more blocks - /// in the pool, a new block will be allocated. - /// - /// If maxAlloc blocks are already allocated, an - /// OutOfMemoryException is thrown. - - void release(void * ptr); - /// Releases a memory block and returns it to the pool. - - std::size_t blockSize() const; - /// Returns the block size. - - int allocated() const; - /// Returns the number of allocated blocks. - - int available() const; - /// Returns the number of available blocks in the pool. - -private: - MemoryPool(); - MemoryPool(const MemoryPool &); - MemoryPool & operator=(const MemoryPool &); - - void clear(); - - enum - { - BLOCK_RESERVE = 128 - }; - - typedef std::vector BlockVec; - - std::size_t _blockSize; - int _maxAlloc; - int _allocated; - BlockVec _blocks; - FastMutex _mutex; -}; - - -// -// inlines -// -inline std::size_t MemoryPool::blockSize() const -{ - return _blockSize; -} - - -inline int MemoryPool::allocated() const -{ - return _allocated; -} - - -inline int MemoryPool::available() const -{ - return (int)_blocks.size(); -} - - -} // namespace Poco - - -#endif // Foundation_MemoryPool_INCLUDED diff --git a/base/poco/Foundation/src/MemoryPool.cpp b/base/poco/Foundation/src/MemoryPool.cpp deleted file mode 100644 index 01c477be525..00000000000 --- a/base/poco/Foundation/src/MemoryPool.cpp +++ /dev/null @@ -1,105 +0,0 @@ -// -// MemoryPool.cpp -// -// Library: Foundation -// Package: Core -// Module: MemoryPool -// -// Copyright (c) 2005-2006, Applied Informatics Software Engineering GmbH. -// and Contributors. -// -// SPDX-License-Identifier: BSL-1.0 -// - - -#include "Poco/MemoryPool.h" -#include "Poco/Exception.h" - - -namespace Poco { - - -MemoryPool::MemoryPool(std::size_t blockSize, int preAlloc, int maxAlloc): - _blockSize(blockSize), - _maxAlloc(maxAlloc), - _allocated(preAlloc) -{ - poco_assert (maxAlloc == 0 || maxAlloc >= preAlloc); - poco_assert (preAlloc >= 0 && maxAlloc >= 0); - - int r = BLOCK_RESERVE; - if (preAlloc > r) - r = preAlloc; - if (maxAlloc > 0 && maxAlloc < r) - r = maxAlloc; - _blocks.reserve(r); - - try - { - for (int i = 0; i < preAlloc; ++i) - { - _blocks.push_back(new char[_blockSize]); - } - } - catch (...) - { - clear(); - throw; - } -} - - -MemoryPool::~MemoryPool() -{ - clear(); -} - - -void MemoryPool::clear() -{ - for (BlockVec::iterator it = _blocks.begin(); it != _blocks.end(); ++it) - { - delete [] *it; - } - _blocks.clear(); -} - - -void* MemoryPool::get() -{ - FastMutex::ScopedLock lock(_mutex); - - if (_blocks.empty()) - { - if (_maxAlloc == 0 || _allocated < _maxAlloc) - { - ++_allocated; - return new char[_blockSize]; - } - else throw OutOfMemoryException("MemoryPool exhausted"); - } - else - { - char* ptr = _blocks.back(); - _blocks.pop_back(); - return ptr; - } -} - - -void MemoryPool::release(void* ptr) -{ - FastMutex::ScopedLock lock(_mutex); - - try - { - _blocks.push_back(reinterpret_cast(ptr)); - } - catch (...) - { - delete [] reinterpret_cast(ptr); - } -} - - -} // namespace Poco diff --git a/base/poco/Net/include/Poco/Net/HTTPBasicStreamBuf.h b/base/poco/Net/include/Poco/Net/HTTPBasicStreamBuf.h index c4872d95353..c87719b63a4 100644 --- a/base/poco/Net/include/Poco/Net/HTTPBasicStreamBuf.h +++ b/base/poco/Net/include/Poco/Net/HTTPBasicStreamBuf.h @@ -19,7 +19,6 @@ #include "Poco/BufferedStreamBuf.h" -#include "Poco/Net/HTTPBufferAllocator.h" #include "Poco/Net/Net.h" @@ -27,9 +26,9 @@ namespace Poco { namespace Net { + constexpr size_t HTTP_DEFAULT_BUFFER_SIZE = 8 * 1024; - - typedef Poco::BasicBufferedStreamBuf, HTTPBufferAllocator> HTTPBasicStreamBuf; + typedef Poco::BasicBufferedStreamBuf> HTTPBasicStreamBuf; } diff --git a/base/poco/Net/include/Poco/Net/HTTPBufferAllocator.h b/base/poco/Net/include/Poco/Net/HTTPBufferAllocator.h deleted file mode 100644 index 5d088e35297..00000000000 --- a/base/poco/Net/include/Poco/Net/HTTPBufferAllocator.h +++ /dev/null @@ -1,53 +0,0 @@ -// -// HTTPBufferAllocator.h -// -// Library: Net -// Package: HTTP -// Module: HTTPBufferAllocator -// -// Definition of the HTTPBufferAllocator class. -// -// Copyright (c) 2005-2006, Applied Informatics Software Engineering GmbH. -// and Contributors. -// -// SPDX-License-Identifier: BSL-1.0 -// - - -#ifndef Net_HTTPBufferAllocator_INCLUDED -#define Net_HTTPBufferAllocator_INCLUDED - - -#include -#include "Poco/MemoryPool.h" -#include "Poco/Net/Net.h" - - -namespace Poco -{ -namespace Net -{ - - - class Net_API HTTPBufferAllocator - /// A BufferAllocator for HTTP streams. - { - public: - static char * allocate(std::streamsize size); - static void deallocate(char * ptr, std::streamsize size); - - enum - { - BUFFER_SIZE = 128 * 1024 - }; - - private: - static Poco::MemoryPool _pool; - }; - - -} -} // namespace Poco::Net - - -#endif // Net_HTTPBufferAllocator_INCLUDED diff --git a/base/poco/Net/include/Poco/Net/HTTPChunkedStream.h b/base/poco/Net/include/Poco/Net/HTTPChunkedStream.h index 47987b18817..5f4729c9278 100644 --- a/base/poco/Net/include/Poco/Net/HTTPChunkedStream.h +++ b/base/poco/Net/include/Poco/Net/HTTPChunkedStream.h @@ -21,7 +21,6 @@ #include #include #include -#include "Poco/MemoryPool.h" #include "Poco/Net/HTTPBasicStreamBuf.h" #include "Poco/Net/Net.h" @@ -80,12 +79,6 @@ namespace Net public: HTTPChunkedInputStream(HTTPSession & session); ~HTTPChunkedInputStream(); - - void * operator new(std::size_t size); - void operator delete(void * ptr); - - private: - static Poco::MemoryPool _pool; }; @@ -95,12 +88,6 @@ namespace Net public: HTTPChunkedOutputStream(HTTPSession & session); ~HTTPChunkedOutputStream(); - - void * operator new(std::size_t size); - void operator delete(void * ptr); - - private: - static Poco::MemoryPool _pool; }; diff --git a/base/poco/Net/include/Poco/Net/HTTPFixedLengthStream.h b/base/poco/Net/include/Poco/Net/HTTPFixedLengthStream.h index 4de211fdb92..2f4df102605 100644 --- a/base/poco/Net/include/Poco/Net/HTTPFixedLengthStream.h +++ b/base/poco/Net/include/Poco/Net/HTTPFixedLengthStream.h @@ -78,12 +78,6 @@ namespace Net public: HTTPFixedLengthInputStream(HTTPSession & session, HTTPFixedLengthStreamBuf::ContentLength length); ~HTTPFixedLengthInputStream(); - - void * operator new(std::size_t size); - void operator delete(void * ptr); - - private: - static Poco::MemoryPool _pool; }; @@ -93,12 +87,6 @@ namespace Net public: HTTPFixedLengthOutputStream(HTTPSession & session, HTTPFixedLengthStreamBuf::ContentLength length); ~HTTPFixedLengthOutputStream(); - - void * operator new(std::size_t size); - void operator delete(void * ptr); - - private: - static Poco::MemoryPool _pool; }; diff --git a/base/poco/Net/include/Poco/Net/HTTPHeaderStream.h b/base/poco/Net/include/Poco/Net/HTTPHeaderStream.h index bcfca984d8b..cf1a6dba2e6 100644 --- a/base/poco/Net/include/Poco/Net/HTTPHeaderStream.h +++ b/base/poco/Net/include/Poco/Net/HTTPHeaderStream.h @@ -21,7 +21,6 @@ #include #include #include -#include "Poco/MemoryPool.h" #include "Poco/Net/HTTPBasicStreamBuf.h" #include "Poco/Net/Net.h" @@ -74,12 +73,6 @@ namespace Net public: HTTPHeaderInputStream(HTTPSession & session); ~HTTPHeaderInputStream(); - - void * operator new(std::size_t size); - void operator delete(void * ptr); - - private: - static Poco::MemoryPool _pool; }; @@ -89,12 +82,6 @@ namespace Net public: HTTPHeaderOutputStream(HTTPSession & session); ~HTTPHeaderOutputStream(); - - void * operator new(std::size_t size); - void operator delete(void * ptr); - - private: - static Poco::MemoryPool _pool; }; diff --git a/base/poco/Net/include/Poco/Net/HTTPSession.h b/base/poco/Net/include/Poco/Net/HTTPSession.h index d0045025f5f..934b34be5d5 100644 --- a/base/poco/Net/include/Poco/Net/HTTPSession.h +++ b/base/poco/Net/include/Poco/Net/HTTPSession.h @@ -192,7 +192,7 @@ namespace Net HTTPSession & operator=(const HTTPSession &); StreamSocket _socket; - char * _pBuffer; + std::unique_ptr _pBuffer; char * _pCurrent; char * _pEnd; bool _keepAlive; diff --git a/base/poco/Net/include/Poco/Net/HTTPStream.h b/base/poco/Net/include/Poco/Net/HTTPStream.h index 0197bc62eb2..48502347b2c 100644 --- a/base/poco/Net/include/Poco/Net/HTTPStream.h +++ b/base/poco/Net/include/Poco/Net/HTTPStream.h @@ -21,7 +21,6 @@ #include #include #include -#include "Poco/MemoryPool.h" #include "Poco/Net/HTTPBasicStreamBuf.h" #include "Poco/Net/Net.h" @@ -75,12 +74,6 @@ namespace Net public: HTTPInputStream(HTTPSession & session); ~HTTPInputStream(); - - void * operator new(std::size_t size); - void operator delete(void * ptr); - - private: - static Poco::MemoryPool _pool; }; @@ -90,12 +83,6 @@ namespace Net public: HTTPOutputStream(HTTPSession & session); ~HTTPOutputStream(); - - void * operator new(std::size_t size); - void operator delete(void * ptr); - - private: - static Poco::MemoryPool _pool; }; diff --git a/base/poco/Net/src/HTTPBufferAllocator.cpp b/base/poco/Net/src/HTTPBufferAllocator.cpp deleted file mode 100644 index 2944e2a6121..00000000000 --- a/base/poco/Net/src/HTTPBufferAllocator.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// -// HTTPBufferAllocator.cpp -// -// Library: Net -// Package: HTTP -// Module: HTTPBufferAllocator -// -// Copyright (c) 2005-2006, Applied Informatics Software Engineering GmbH. -// and Contributors. -// -// SPDX-License-Identifier: BSL-1.0 -// - - -#include "Poco/Net/HTTPBufferAllocator.h" - - -using Poco::MemoryPool; - - -namespace Poco { -namespace Net { - - -MemoryPool HTTPBufferAllocator::_pool(HTTPBufferAllocator::BUFFER_SIZE, 16); - - -char* HTTPBufferAllocator::allocate(std::streamsize size) -{ - poco_assert_dbg (size == BUFFER_SIZE); - - return reinterpret_cast(_pool.get()); -} - - -void HTTPBufferAllocator::deallocate(char* ptr, std::streamsize size) -{ - poco_assert_dbg (size == BUFFER_SIZE); - - _pool.release(ptr); -} - - -} } // namespace Poco::Net diff --git a/base/poco/Net/src/HTTPChunkedStream.cpp b/base/poco/Net/src/HTTPChunkedStream.cpp index f2f79da590b..376e3f55492 100644 --- a/base/poco/Net/src/HTTPChunkedStream.cpp +++ b/base/poco/Net/src/HTTPChunkedStream.cpp @@ -34,7 +34,7 @@ namespace Net { HTTPChunkedStreamBuf::HTTPChunkedStreamBuf(HTTPSession& session, openmode mode): - HTTPBasicStreamBuf(HTTPBufferAllocator::BUFFER_SIZE, mode), + HTTPBasicStreamBuf(HTTP_DEFAULT_BUFFER_SIZE, mode), _session(session), _mode(mode), _chunk(0) @@ -181,10 +181,6 @@ HTTPChunkedStreamBuf* HTTPChunkedIOS::rdbuf() // HTTPChunkedInputStream // - -Poco::MemoryPool HTTPChunkedInputStream::_pool(sizeof(HTTPChunkedInputStream)); - - HTTPChunkedInputStream::HTTPChunkedInputStream(HTTPSession& session): HTTPChunkedIOS(session, std::ios::in), std::istream(&_buf) @@ -196,34 +192,10 @@ HTTPChunkedInputStream::~HTTPChunkedInputStream() { } - -void* HTTPChunkedInputStream::operator new(std::size_t size) -{ - return _pool.get(); -} - - -void HTTPChunkedInputStream::operator delete(void* ptr) -{ - try - { - _pool.release(ptr); - } - catch (...) - { - poco_unexpected(); - } -} - - // // HTTPChunkedOutputStream // - -Poco::MemoryPool HTTPChunkedOutputStream::_pool(sizeof(HTTPChunkedOutputStream)); - - HTTPChunkedOutputStream::HTTPChunkedOutputStream(HTTPSession& session): HTTPChunkedIOS(session, std::ios::out), std::ostream(&_buf) @@ -235,24 +207,4 @@ HTTPChunkedOutputStream::~HTTPChunkedOutputStream() { } - -void* HTTPChunkedOutputStream::operator new(std::size_t size) -{ - return _pool.get(); -} - - -void HTTPChunkedOutputStream::operator delete(void* ptr) -{ - try - { - _pool.release(ptr); - } - catch (...) - { - poco_unexpected(); - } -} - - } } // namespace Poco::Net diff --git a/base/poco/Net/src/HTTPFixedLengthStream.cpp b/base/poco/Net/src/HTTPFixedLengthStream.cpp index d19f6122ee1..fd77ff71cd9 100644 --- a/base/poco/Net/src/HTTPFixedLengthStream.cpp +++ b/base/poco/Net/src/HTTPFixedLengthStream.cpp @@ -30,7 +30,7 @@ namespace Net { HTTPFixedLengthStreamBuf::HTTPFixedLengthStreamBuf(HTTPSession& session, ContentLength length, openmode mode): - HTTPBasicStreamBuf(HTTPBufferAllocator::BUFFER_SIZE, mode), + HTTPBasicStreamBuf(HTTP_DEFAULT_BUFFER_SIZE, mode), _session(session), _length(length), _count(0) @@ -109,9 +109,6 @@ HTTPFixedLengthStreamBuf* HTTPFixedLengthIOS::rdbuf() // -Poco::MemoryPool HTTPFixedLengthInputStream::_pool(sizeof(HTTPFixedLengthInputStream)); - - HTTPFixedLengthInputStream::HTTPFixedLengthInputStream(HTTPSession& session, HTTPFixedLengthStreamBuf::ContentLength length): HTTPFixedLengthIOS(session, length, std::ios::in), std::istream(&_buf) @@ -124,33 +121,10 @@ HTTPFixedLengthInputStream::~HTTPFixedLengthInputStream() } -void* HTTPFixedLengthInputStream::operator new(std::size_t size) -{ - return _pool.get(); -} - - -void HTTPFixedLengthInputStream::operator delete(void* ptr) -{ - try - { - _pool.release(ptr); - } - catch (...) - { - poco_unexpected(); - } -} - - // // HTTPFixedLengthOutputStream // - -Poco::MemoryPool HTTPFixedLengthOutputStream::_pool(sizeof(HTTPFixedLengthOutputStream)); - - HTTPFixedLengthOutputStream::HTTPFixedLengthOutputStream(HTTPSession& session, HTTPFixedLengthStreamBuf::ContentLength length): HTTPFixedLengthIOS(session, length, std::ios::out), std::ostream(&_buf) @@ -163,23 +137,4 @@ HTTPFixedLengthOutputStream::~HTTPFixedLengthOutputStream() } -void* HTTPFixedLengthOutputStream::operator new(std::size_t size) -{ - return _pool.get(); -} - - -void HTTPFixedLengthOutputStream::operator delete(void* ptr) -{ - try - { - _pool.release(ptr); - } - catch (...) - { - poco_unexpected(); - } -} - - } } // namespace Poco::Net diff --git a/base/poco/Net/src/HTTPHeaderStream.cpp b/base/poco/Net/src/HTTPHeaderStream.cpp index 8e0091fcbe3..39b9007062d 100644 --- a/base/poco/Net/src/HTTPHeaderStream.cpp +++ b/base/poco/Net/src/HTTPHeaderStream.cpp @@ -26,7 +26,7 @@ namespace Net { HTTPHeaderStreamBuf::HTTPHeaderStreamBuf(HTTPSession& session, openmode mode): - HTTPBasicStreamBuf(HTTPBufferAllocator::BUFFER_SIZE, mode), + HTTPBasicStreamBuf(HTTP_DEFAULT_BUFFER_SIZE, mode), _session(session), _end(false) { @@ -101,10 +101,6 @@ HTTPHeaderStreamBuf* HTTPHeaderIOS::rdbuf() // HTTPHeaderInputStream // - -Poco::MemoryPool HTTPHeaderInputStream::_pool(sizeof(HTTPHeaderInputStream)); - - HTTPHeaderInputStream::HTTPHeaderInputStream(HTTPSession& session): HTTPHeaderIOS(session, std::ios::in), std::istream(&_buf) @@ -116,34 +112,10 @@ HTTPHeaderInputStream::~HTTPHeaderInputStream() { } - -void* HTTPHeaderInputStream::operator new(std::size_t size) -{ - return _pool.get(); -} - - -void HTTPHeaderInputStream::operator delete(void* ptr) -{ - try - { - _pool.release(ptr); - } - catch (...) - { - poco_unexpected(); - } -} - - // // HTTPHeaderOutputStream // - -Poco::MemoryPool HTTPHeaderOutputStream::_pool(sizeof(HTTPHeaderOutputStream)); - - HTTPHeaderOutputStream::HTTPHeaderOutputStream(HTTPSession& session): HTTPHeaderIOS(session, std::ios::out), std::ostream(&_buf) @@ -155,24 +127,4 @@ HTTPHeaderOutputStream::~HTTPHeaderOutputStream() { } - -void* HTTPHeaderOutputStream::operator new(std::size_t size) -{ - return _pool.get(); -} - - -void HTTPHeaderOutputStream::operator delete(void* ptr) -{ - try - { - _pool.release(ptr); - } - catch (...) - { - poco_unexpected(); - } -} - - } } // namespace Poco::Net diff --git a/base/poco/Net/src/HTTPSession.cpp b/base/poco/Net/src/HTTPSession.cpp index cb6fdc25e9a..d2663baaf9f 100644 --- a/base/poco/Net/src/HTTPSession.cpp +++ b/base/poco/Net/src/HTTPSession.cpp @@ -13,8 +13,8 @@ #include "Poco/Net/HTTPSession.h" -#include "Poco/Net/HTTPBufferAllocator.h" #include "Poco/Net/NetException.h" +#include "Poco/Net/HTTPBasicStreamBuf.h" #include @@ -68,14 +68,6 @@ HTTPSession::HTTPSession(const StreamSocket& socket, bool keepAlive): HTTPSession::~HTTPSession() { - try - { - if (_pBuffer) HTTPBufferAllocator::deallocate(_pBuffer, HTTPBufferAllocator::BUFFER_SIZE); - } - catch (...) - { - poco_unexpected(); - } try { close(); @@ -177,10 +169,10 @@ void HTTPSession::refill() { if (!_pBuffer) { - _pBuffer = HTTPBufferAllocator::allocate(HTTPBufferAllocator::BUFFER_SIZE); + _pBuffer = std::make_unique(HTTP_DEFAULT_BUFFER_SIZE); } - _pCurrent = _pEnd = _pBuffer; - int n = receive(_pBuffer, HTTPBufferAllocator::BUFFER_SIZE); + _pCurrent = _pEnd = _pBuffer.get(); + int n = receive(_pBuffer.get(), HTTP_DEFAULT_BUFFER_SIZE); _pEnd += n; } @@ -199,7 +191,7 @@ void HTTPSession::connect(const SocketAddress& address) _socket.setNoDelay(true); // There may be leftover data from a previous (failed) request in the buffer, // so we clear it. - _pCurrent = _pEnd = _pBuffer; + _pCurrent = _pEnd = _pBuffer.get(); } diff --git a/base/poco/Net/src/HTTPStream.cpp b/base/poco/Net/src/HTTPStream.cpp index 4acb881c4f3..c2f27600569 100644 --- a/base/poco/Net/src/HTTPStream.cpp +++ b/base/poco/Net/src/HTTPStream.cpp @@ -26,7 +26,7 @@ namespace Net { HTTPStreamBuf::HTTPStreamBuf(HTTPSession& session, openmode mode): - HTTPBasicStreamBuf(HTTPBufferAllocator::BUFFER_SIZE, mode), + HTTPBasicStreamBuf(HTTP_DEFAULT_BUFFER_SIZE, mode), _session(session), _mode(mode) { @@ -96,10 +96,6 @@ HTTPStreamBuf* HTTPIOS::rdbuf() // HTTPInputStream // - -Poco::MemoryPool HTTPInputStream::_pool(sizeof(HTTPInputStream)); - - HTTPInputStream::HTTPInputStream(HTTPSession& session): HTTPIOS(session, std::ios::in), std::istream(&_buf) @@ -112,33 +108,11 @@ HTTPInputStream::~HTTPInputStream() } -void* HTTPInputStream::operator new(std::size_t size) -{ - return _pool.get(); -} - - -void HTTPInputStream::operator delete(void* ptr) -{ - try - { - _pool.release(ptr); - } - catch (...) - { - poco_unexpected(); - } -} - - // // HTTPOutputStream // -Poco::MemoryPool HTTPOutputStream::_pool(sizeof(HTTPOutputStream)); - - HTTPOutputStream::HTTPOutputStream(HTTPSession& session): HTTPIOS(session, std::ios::out), std::ostream(&_buf) @@ -150,24 +124,4 @@ HTTPOutputStream::~HTTPOutputStream() { } - -void* HTTPOutputStream::operator new(std::size_t size) -{ - return _pool.get(); -} - - -void HTTPOutputStream::operator delete(void* ptr) -{ - try - { - _pool.release(ptr); - } - catch (...) - { - poco_unexpected(); - } -} - - } } // namespace Poco::Net diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt index 015037b2de6..821b7b46855 100644 --- a/cmake/autogenerated_versions.txt +++ b/cmake/autogenerated_versions.txt @@ -2,11 +2,11 @@ # NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION, # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes. -SET(VERSION_REVISION 54475) +SET(VERSION_REVISION 54476) SET(VERSION_MAJOR 23) -SET(VERSION_MINOR 6) +SET(VERSION_MINOR 7) SET(VERSION_PATCH 1) -SET(VERSION_GITHASH 2fec796e73efda10a538a03af3205ce8ffa1b2de) -SET(VERSION_DESCRIBE v23.6.1.1-testing) -SET(VERSION_STRING 23.6.1.1) +SET(VERSION_GITHASH d1c7e13d08868cb04d3562dcced704dd577cb1df) +SET(VERSION_DESCRIBE v23.7.1.1-testing) +SET(VERSION_STRING 23.7.1.1) # end of autochange diff --git a/contrib/abseil-cpp-cmake/CMakeLists.txt b/contrib/abseil-cpp-cmake/CMakeLists.txt index 4c31ecfc599..d64b6036611 100644 --- a/contrib/abseil-cpp-cmake/CMakeLists.txt +++ b/contrib/abseil-cpp-cmake/CMakeLists.txt @@ -17,3 +17,17 @@ get_target_property(FLAT_HASH_SET_INCLUDE_DIR absl::flat_hash_set INTERFACE_INCL target_include_directories (_abseil_swiss_tables SYSTEM BEFORE INTERFACE ${FLAT_HASH_SET_INCLUDE_DIR}) add_library(ch_contrib::abseil_swiss_tables ALIAS _abseil_swiss_tables) + +set(ABSL_FORMAT_SRC + ${ABSL_ROOT_DIR}/absl/strings/internal/str_format/arg.cc + ${ABSL_ROOT_DIR}/absl/strings/internal/str_format/bind.cc + ${ABSL_ROOT_DIR}/absl/strings/internal/str_format/extension.cc + ${ABSL_ROOT_DIR}/absl/strings/internal/str_format/float_conversion.cc + ${ABSL_ROOT_DIR}/absl/strings/internal/str_format/output.cc + ${ABSL_ROOT_DIR}/absl/strings/internal/str_format/parser.cc +) + +add_library(_abseil_str_format ${ABSL_FORMAT_SRC}) +target_include_directories(_abseil_str_format PUBLIC ${ABSL_ROOT_DIR}) + +add_library(ch_contrib::abseil_str_format ALIAS _abseil_str_format) diff --git a/contrib/azure-cmake/CMakeLists.txt b/contrib/azure-cmake/CMakeLists.txt index 1e2a4c97824..7aba81259d3 100644 --- a/contrib/azure-cmake/CMakeLists.txt +++ b/contrib/azure-cmake/CMakeLists.txt @@ -1,6 +1,6 @@ option (ENABLE_AZURE_BLOB_STORAGE "Enable Azure blob storage" ${ENABLE_LIBRARIES}) -if (NOT ENABLE_AZURE_BLOB_STORAGE OR BUILD_STANDALONE_KEEPER OR OS_FREEBSD) +if (NOT ENABLE_AZURE_BLOB_STORAGE OR OS_FREEBSD) message(STATUS "Not using Azure blob storage") return() endif() diff --git a/contrib/cityhash102/include/city.h b/contrib/cityhash102/include/city.h index 77d4c988cdd..87363d16444 100644 --- a/contrib/cityhash102/include/city.h +++ b/contrib/cityhash102/include/city.h @@ -61,11 +61,24 @@ namespace CityHash_v1_0_2 typedef uint8_t uint8; typedef uint32_t uint32; typedef uint64_t uint64; -typedef std::pair uint128; +/// Represent an unsigned integer of 128 bits as it's used in CityHash. +/// Originally CityHash used `std::pair` instead of this struct, +/// however the members `first` and `second` could be easily confused so they were renamed to `low64` and `high64`: +/// `first` -> `low64`, `second` -> `high64`. +struct uint128 +{ + uint64 low64 = 0; + uint64 high64 = 0; -inline uint64 Uint128Low64(const uint128& x) { return x.first; } -inline uint64 Uint128High64(const uint128& x) { return x.second; } + uint128() = default; + uint128(uint64 low64_, uint64 high64_) : low64(low64_), high64(high64_) {} + friend bool operator ==(const uint128 & x, const uint128 & y) { return (x.low64 == y.low64) && (x.high64 == y.high64); } + friend bool operator !=(const uint128 & x, const uint128 & y) { return !(x == y); } +}; + +inline uint64 Uint128Low64(const uint128 & x) { return x.low64; } +inline uint64 Uint128High64(const uint128 & x) { return x.high64; } // Hash function for a byte array. uint64 CityHash64(const char *buf, size_t len); diff --git a/contrib/libhdfs3 b/contrib/libhdfs3 index 164b89253fa..377220ef351 160000 --- a/contrib/libhdfs3 +++ b/contrib/libhdfs3 @@ -1 +1 @@ -Subproject commit 164b89253fad7991bce77882f01b51ab81d19f3d +Subproject commit 377220ef351ae24994a5fcd2b5fa3930d00c4db0 diff --git a/contrib/re2 b/contrib/re2 index 13ebb377c6a..03da4fc0857 160000 --- a/contrib/re2 +++ b/contrib/re2 @@ -1 +1 @@ -Subproject commit 13ebb377c6ad763ca61d12dd6f88b1126bd0b911 +Subproject commit 03da4fc0857c285e3a26782f6bc8931c4c950df4 diff --git a/contrib/re2-cmake/CMakeLists.txt b/contrib/re2-cmake/CMakeLists.txt index 19939c11ebf..305c2400c77 100644 --- a/contrib/re2-cmake/CMakeLists.txt +++ b/contrib/re2-cmake/CMakeLists.txt @@ -12,6 +12,7 @@ endif() set(SRC_DIR "${ClickHouse_SOURCE_DIR}/contrib/re2") set(RE2_SOURCES + ${SRC_DIR}/re2/bitmap256.cc ${SRC_DIR}/re2/bitstate.cc ${SRC_DIR}/re2/compile.cc ${SRC_DIR}/re2/dfa.cc @@ -28,15 +29,16 @@ set(RE2_SOURCES ${SRC_DIR}/re2/regexp.cc ${SRC_DIR}/re2/set.cc ${SRC_DIR}/re2/simplify.cc - ${SRC_DIR}/re2/stringpiece.cc ${SRC_DIR}/re2/tostring.cc ${SRC_DIR}/re2/unicode_casefold.cc ${SRC_DIR}/re2/unicode_groups.cc + ${SRC_DIR}/util/pcre.cc ${SRC_DIR}/util/rune.cc ${SRC_DIR}/util/strutil.cc ) add_library(re2 ${RE2_SOURCES}) target_include_directories(re2 PUBLIC "${SRC_DIR}") +target_link_libraries(re2 ch_contrib::abseil_str_format) # Building re2 which is thread-safe and re2_st which is not. # re2 changes its state during matching of regular expression, e.g. creates temporary DFA. @@ -48,6 +50,7 @@ target_compile_definitions (re2_st PRIVATE NDEBUG NO_THREADS re2=re2_st) target_include_directories (re2_st PRIVATE .) target_include_directories (re2_st SYSTEM PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) target_include_directories (re2_st SYSTEM BEFORE PUBLIC ${SRC_DIR}) +target_link_libraries (re2_st ch_contrib::abseil_str_format) file (MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/re2_st) foreach (FILENAME filtered_re2.h re2.h set.h stringpiece.h) @@ -60,17 +63,6 @@ foreach (FILENAME filtered_re2.h re2.h set.h stringpiece.h) add_dependencies (re2_st transform_${FILENAME}) endforeach () -file (MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/util) -foreach (FILENAME mutex.h) - add_custom_command (OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/util/${FILENAME}" - COMMAND ${CMAKE_COMMAND} -DSOURCE_FILENAME="${SRC_DIR}/util/${FILENAME}" - -DTARGET_FILENAME="${CMAKE_CURRENT_BINARY_DIR}/util/${FILENAME}" - -P "${CMAKE_CURRENT_SOURCE_DIR}/re2_transform.cmake" - COMMENT "Creating ${FILENAME} for re2_st library.") - add_custom_target (transform_${FILENAME} DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/util/${FILENAME}") - add_dependencies (re2_st transform_${FILENAME}) -endforeach () - # NOTE: you should not change name of library here, since it is used to generate required header (see above) add_library(ch_contrib::re2 ALIAS re2) add_library(ch_contrib::re2_st ALIAS re2_st) diff --git a/docker/images.json b/docker/images.json index b4f3e755bd1..e8fc329a640 100644 --- a/docker/images.json +++ b/docker/images.json @@ -120,11 +120,12 @@ "docker/test/base": { "name": "clickhouse/test-base", "dependent": [ - "docker/test/stateless", - "docker/test/integration/base", "docker/test/fuzzer", + "docker/test/integration/base", "docker/test/keeper-jepsen", - "docker/test/server-jepsen" + "docker/test/server-jepsen", + "docker/test/sqllogic", + "docker/test/stateless" ] }, "docker/test/integration/kerberized_hadoop": { diff --git a/docker/keeper/Dockerfile b/docker/keeper/Dockerfile index 367f6043b90..f13fcdc14d6 100644 --- a/docker/keeper/Dockerfile +++ b/docker/keeper/Dockerfile @@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \ esac ARG REPOSITORY="https://s3.amazonaws.com/clickhouse-builds/22.4/31c367d3cd3aefd316778601ff6565119fe36682/package_release" -ARG VERSION="23.5.3.24" +ARG VERSION="23.6.1.1524" ARG PACKAGES="clickhouse-keeper" # user/group precreated explicitly with fixed uid/gid on purpose. diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile index dd21c8552d3..e824161a688 100644 --- a/docker/packager/binary/Dockerfile +++ b/docker/packager/binary/Dockerfile @@ -89,7 +89,7 @@ RUN arch=${TARGETARCH:-amd64} \ && dpkg -i /tmp/nfpm.deb \ && rm /tmp/nfpm.deb -ARG GO_VERSION=1.19.5 +ARG GO_VERSION=1.19.10 # We need go for clickhouse-diagnostics RUN arch=${TARGETARCH:-amd64} \ && curl -Lo /tmp/go.tgz "https://go.dev/dl/go${GO_VERSION}.linux-${arch}.tar.gz" \ diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index e7e879fa95f..5e5be3f6d73 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -33,7 +33,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="23.5.3.24" +ARG VERSION="23.6.1.1524" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # user/group precreated explicitly with fixed uid/gid on purpose. diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index f393b98cfe6..8693193455f 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -1,4 +1,4 @@ -FROM ubuntu:22.04 +FROM ubuntu:20.04 # see https://github.com/moby/moby/issues/4032#issuecomment-192327844 ARG DEBIAN_FRONTEND=noninteractive @@ -11,18 +11,19 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list && apt-get update \ && apt-get upgrade -yq \ && apt-get install --yes --no-install-recommends \ - apt-transport-https \ ca-certificates \ - dirmngr \ - gnupg2 \ - wget \ locales \ tzdata \ - && apt-get clean + wget \ + && apt-get clean \ + && rm -rf \ + /var/lib/apt/lists/* \ + /var/cache/debconf \ + /tmp/* ARG REPO_CHANNEL="stable" -ARG REPOSITORY="deb https://packages.clickhouse.com/deb ${REPO_CHANNEL} main" -ARG VERSION="23.5.3.24" +ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb ${REPO_CHANNEL} main" +ARG VERSION="23.6.1.1524" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # set non-empty deb_location_url url to create a docker image @@ -43,49 +44,68 @@ ARG single_binary_location_url="" ARG TARGETARCH -RUN arch=${TARGETARCH:-amd64} \ +# install from a web location with deb packages +RUN arch="${TARGETARCH:-amd64}" \ && if [ -n "${deb_location_url}" ]; then \ echo "installing from custom url with deb packages: ${deb_location_url}" \ - rm -rf /tmp/clickhouse_debs \ + && rm -rf /tmp/clickhouse_debs \ && mkdir -p /tmp/clickhouse_debs \ && for package in ${PACKAGES}; do \ { wget --progress=bar:force:noscroll "${deb_location_url}/${package}_${VERSION}_${arch}.deb" -P /tmp/clickhouse_debs || \ wget --progress=bar:force:noscroll "${deb_location_url}/${package}_${VERSION}_all.deb" -P /tmp/clickhouse_debs ; } \ || exit 1 \ ; done \ - && dpkg -i /tmp/clickhouse_debs/*.deb ; \ - elif [ -n "${single_binary_location_url}" ]; then \ + && dpkg -i /tmp/clickhouse_debs/*.deb \ + && rm -rf /tmp/* ; \ + fi + +# install from a single binary +RUN if [ -n "${single_binary_location_url}" ]; then \ echo "installing from single binary url: ${single_binary_location_url}" \ && rm -rf /tmp/clickhouse_binary \ && mkdir -p /tmp/clickhouse_binary \ && wget --progress=bar:force:noscroll "${single_binary_location_url}" -O /tmp/clickhouse_binary/clickhouse \ && chmod +x /tmp/clickhouse_binary/clickhouse \ - && /tmp/clickhouse_binary/clickhouse install --user "clickhouse" --group "clickhouse" ; \ - else \ - mkdir -p /etc/apt/sources.list.d \ - && apt-key adv --keyserver keyserver.ubuntu.com --recv 8919F6BD2B48D754 \ - && echo ${REPOSITORY} > /etc/apt/sources.list.d/clickhouse.list \ + && /tmp/clickhouse_binary/clickhouse install --user "clickhouse" --group "clickhouse" \ + && rm -rf /tmp/* ; \ + fi + +# A fallback to installation from ClickHouse repository +RUN if ! clickhouse local -q "SELECT ''" > /dev/null 2>&1; then \ + apt-get update \ + && apt-get install --yes --no-install-recommends \ + apt-transport-https \ + ca-certificates \ + dirmngr \ + gnupg2 \ + && mkdir -p /etc/apt/sources.list.d \ + && GNUPGHOME=$(mktemp -d) \ + && GNUPGHOME="$GNUPGHOME" gpg --no-default-keyring \ + --keyring /usr/share/keyrings/clickhouse-keyring.gpg \ + --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 8919F6BD2B48D754 \ + && rm -r "$GNUPGHOME" \ + && chmod +r /usr/share/keyrings/clickhouse-keyring.gpg \ + && echo "${REPOSITORY}" > /etc/apt/sources.list.d/clickhouse.list \ && echo "installing from repository: ${REPOSITORY}" \ && apt-get update \ - && apt-get --yes -o "Dpkg::Options::=--force-confdef" -o "Dpkg::Options::=--force-confold" upgrade \ && for package in ${PACKAGES}; do \ packages="${packages} ${package}=${VERSION}" \ ; done \ && apt-get install --allow-unauthenticated --yes --no-install-recommends ${packages} || exit 1 \ - ; fi \ - && clickhouse-local -q 'SELECT * FROM system.build_options' \ - && rm -rf \ - /var/lib/apt/lists/* \ - /var/cache/debconf \ - /tmp/* \ - && mkdir -p /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client \ - && chmod ugo+Xrw -R /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client - -RUN apt-get autoremove --purge -yq libksba8 && \ - apt-get autoremove -yq + && rm -rf \ + /var/lib/apt/lists/* \ + /var/cache/debconf \ + /tmp/* \ + && apt-get autoremove --purge -yq libksba8 \ + && apt-get autoremove -yq \ + ; fi +# post install # we need to allow "others" access to clickhouse folder, because docker container # can be started with arbitrary uid (openshift usecase) +RUN clickhouse-local -q 'SELECT * FROM system.build_options' \ + && mkdir -p /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client \ + && chmod ugo+Xrw -R /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client RUN locale-gen en_US.UTF-8 ENV LANG en_US.UTF-8 diff --git a/docker/server/README.md b/docker/server/README.md index 18dce492123..67646a262f5 100644 --- a/docker/server/README.md +++ b/docker/server/README.md @@ -20,7 +20,6 @@ For more information and documentation see https://clickhouse.com/. - The amd64 image requires support for [SSE3 instructions](https://en.wikipedia.org/wiki/SSE3). Virtually all x86 CPUs after 2005 support SSE3. - The arm64 image requires support for the [ARMv8.2-A architecture](https://en.wikipedia.org/wiki/AArch64#ARMv8.2-A). Most ARM CPUs after 2017 support ARMv8.2-A. A notable exception is Raspberry Pi 4 from 2019 whose CPU only supports ARMv8.0-A. -- Since the Clickhouse 23.3 Ubuntu image started using `ubuntu:22.04` as its base image, it requires docker version >= `20.10.10`, or use `docker run -- privileged` instead. Alternatively, try the Clickhouse Alpine image. ## How to use this image diff --git a/docker/test/fasttest/Dockerfile b/docker/test/fasttest/Dockerfile index ffb13fc774d..da4baa8c687 100644 --- a/docker/test/fasttest/Dockerfile +++ b/docker/test/fasttest/Dockerfile @@ -9,6 +9,7 @@ RUN apt-get update \ expect \ file \ lsof \ + odbcinst \ psmisc \ python3 \ python3-lxml \ diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index dab873377ce..989ed9d2fbb 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -80,7 +80,7 @@ function start_server function clone_root { - git config --global --add safe.directory "$FASTTEST_SOURCE" + [ "$UID" -eq 0 ] && git config --global --add safe.directory "$FASTTEST_SOURCE" git clone --depth 1 https://github.com/ClickHouse/ClickHouse.git -- "$FASTTEST_SOURCE" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/clone_log.txt" ( @@ -151,7 +151,7 @@ function clone_submodules ) git submodule sync - git submodule update --jobs=16 --depth 1 --init "${SUBMODULES_TO_UPDATE[@]}" + git submodule update --jobs=16 --depth 1 --single-branch --init "${SUBMODULES_TO_UPDATE[@]}" git submodule foreach git reset --hard git submodule foreach git checkout @ -f git submodule foreach git clean -xfd @@ -202,10 +202,11 @@ function build | ts '%Y-%m-%d %H:%M:%S' \ | tee "$FASTTEST_OUTPUT/test_result.txt" if [ "$COPY_CLICKHOUSE_BINARY_TO_OUTPUT" -eq "1" ]; then - cp programs/clickhouse "$FASTTEST_OUTPUT/clickhouse" + mkdir -p "$FASTTEST_OUTPUT/binaries/" + cp programs/clickhouse "$FASTTEST_OUTPUT/binaries/clickhouse" - strip programs/clickhouse -o "$FASTTEST_OUTPUT/clickhouse-stripped" - zstd --threads=0 "$FASTTEST_OUTPUT/clickhouse-stripped" + strip programs/clickhouse -o programs/clickhouse-stripped + zstd --threads=0 programs/clickhouse-stripped -o "$FASTTEST_OUTPUT/binaries/clickhouse-stripped.zst" fi ccache_status ccache --evict-older-than 1d ||: diff --git a/docker/test/integration/base/Dockerfile b/docker/test/integration/base/Dockerfile index de8efa20af4..270b40e23a6 100644 --- a/docker/test/integration/base/Dockerfile +++ b/docker/test/integration/base/Dockerfile @@ -46,12 +46,13 @@ RUN arch=${TARGETARCH:-amd64} \ arm64) rarch=aarch64 ;; \ esac \ && cd /tmp \ - && curl -o mysql-odbc.rpm "https://cdn.mysql.com/archives/mysql-connector-odbc-8.0/mysql-connector-odbc-8.0.27-1.el8.${rarch}.rpm" \ + && curl -o mysql-odbc.rpm "https://cdn.mysql.com/archives/mysql-connector-odbc-8.0/mysql-connector-odbc-8.0.32-1.el9.${rarch}.rpm" \ && rpm2archive mysql-odbc.rpm \ && tar xf mysql-odbc.rpm.tgz -C / ./usr/lib64/ \ - && LINK_DIR=$(dpkg -L libodbc1 | rg '^/usr/lib/.*-linux-gnu/odbc$') \ - && ln -s /usr/lib64/libmyodbc8a.so "$LINK_DIR" \ - && ln -s /usr/lib64/libmyodbc8a.so "$LINK_DIR"/libmyodbc.so + && rm mysql-odbc.rpm mysql-odbc.rpm.tgz \ + && ODBC_DIR=$(dpkg -L odbc-postgresql | rg '^/usr/lib/.*-linux-gnu/odbc$') \ + && ln -s /usr/lib64/libmyodbc8a.so "$ODBC_DIR" \ + && ln -s /usr/lib64/libmyodbc8a.so "$ODBC_DIR"/libmyodbc.so # Unfortunately this is required for a single test for conversion data from zookeeper to clickhouse-keeper. # ZooKeeper is not started by default, but consumes some space in containers. diff --git a/docker/test/integration/helper_container/Dockerfile b/docker/test/integration/helper_container/Dockerfile index 6a093081bf2..60adaea1796 100644 --- a/docker/test/integration/helper_container/Dockerfile +++ b/docker/test/integration/helper_container/Dockerfile @@ -2,4 +2,7 @@ # Helper docker container to run iptables without sudo FROM alpine -RUN apk add -U iproute2 +RUN apk add --no-cache -U iproute2 \ + && for bin in iptables iptables-restore iptables-save; \ + do ln -sf xtables-nft-multi "/sbin/$bin"; \ + done diff --git a/docker/test/integration/mysql_php_client/Dockerfile b/docker/test/integration/mysql_php_client/Dockerfile index 55db4d15a7f..0e11ae023e6 100644 --- a/docker/test/integration/mysql_php_client/Dockerfile +++ b/docker/test/integration/mysql_php_client/Dockerfile @@ -1,7 +1,7 @@ # docker build -t clickhouse/mysql-php-client . # MySQL PHP client docker container -FROM php:8.0.18-cli +FROM php:8-cli-alpine COPY ./client.crt client.crt COPY ./client.key client.key diff --git a/docker/test/integration/runner/Dockerfile b/docker/test/integration/runner/Dockerfile index 14c97e479f6..d6c127c8421 100644 --- a/docker/test/integration/runner/Dockerfile +++ b/docker/test/integration/runner/Dockerfile @@ -1,5 +1,5 @@ # docker build -t clickhouse/integration-tests-runner . -FROM ubuntu:20.04 +FROM ubuntu:22.04 # ARG for quick switch to a given ubuntu mirror ARG apt_archive="http://archive.ubuntu.com" @@ -47,26 +47,30 @@ ENV TZ=Etc/UTC RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone ENV DOCKER_CHANNEL stable +# Unpin the docker version after the release 24.0.3 is released +# https://github.com/moby/moby/issues/45770#issuecomment-1618255130 RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - \ && add-apt-repository "deb https://download.docker.com/linux/ubuntu $(lsb_release -c -s) ${DOCKER_CHANNEL}" \ && apt-get update \ && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \ - docker-ce \ + docker-ce='5:23.*' \ && rm -rf \ /var/lib/apt/lists/* \ /var/cache/debconf \ /tmp/* \ - && apt-get clean + && apt-get clean \ + && dockerd --version; docker --version -RUN dockerd --version; docker --version RUN python3 -m pip install --no-cache-dir \ PyMySQL \ - aerospike==4.0.0 \ - avro==1.10.2 \ + aerospike==11.1.0 \ asyncio \ + avro==1.10.2 \ + azure-storage-blob \ cassandra-driver \ - confluent-kafka==1.5.0 \ + confluent-kafka==1.9.2 \ + delta-spark==2.3.0 \ dict2xml \ dicttoxml \ docker \ @@ -76,47 +80,51 @@ RUN python3 -m pip install --no-cache-dir \ kafka-python \ kazoo \ lz4 \ + meilisearch==0.18.3 \ minio \ nats-py \ protobuf \ - psycopg2-binary==2.8.6 \ + psycopg2-binary==2.9.6 \ + pyhdfs \ pymongo==3.11.0 \ + pyspark==3.3.2 \ pytest \ pytest-order==1.0.0 \ - pytest-timeout \ pytest-random \ - pytest-xdist \ pytest-repeat \ + pytest-timeout \ + pytest-xdist \ pytz \ redis \ - tzlocal==2.1 \ - urllib3 \ requests-kerberos \ - pyspark==3.3.2 \ - delta-spark==2.2.0 \ - pyhdfs \ - azure-storage-blob \ - meilisearch==0.18.3 - -COPY modprobe.sh /usr/local/bin/modprobe -COPY dockerd-entrypoint.sh /usr/local/bin/ -COPY compose/ /compose/ -COPY misc/ /misc/ + tzlocal==2.1 \ + urllib3 +# Hudi supports only spark 3.3.*, not 3.4 RUN curl -fsSL -O https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz \ && tar xzvf spark-3.3.2-bin-hadoop3.tgz -C / \ && rm spark-3.3.2-bin-hadoop3.tgz # download spark and packages # if you change packages, don't forget to update them in tests/integration/helpers/cluster.py -RUN echo ":quit" | /spark-3.3.2-bin-hadoop3/bin/spark-shell --packages "org.apache.hudi:hudi-spark3.3-bundle_2.12:0.13.0,io.delta:delta-core_2.12:2.2.0,org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.1.0" > /dev/null +RUN packages="org.apache.hudi:hudi-spark3.3-bundle_2.12:0.13.0,\ +io.delta:delta-core_2.12:2.3.0,\ +org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.1.0" \ + && /spark-3.3.2-bin-hadoop3/bin/spark-shell --packages "$packages" > /dev/null \ + && find /root/.ivy2/ -name '*.jar' -exec ln -sf {} /spark-3.3.2-bin-hadoop3/jars/ \; RUN set -x \ && addgroup --system dockremap \ - && adduser --system dockremap \ + && adduser --system dockremap \ && adduser dockremap dockremap \ && echo 'dockremap:165536:65536' >> /etc/subuid \ - && echo 'dockremap:165536:65536' >> /etc/subgid + && echo 'dockremap:165536:65536' >> /etc/subgid + +COPY modprobe.sh /usr/local/bin/modprobe +COPY dockerd-entrypoint.sh /usr/local/bin/ +COPY compose/ /compose/ +COPY misc/ /misc/ + # Same options as in test/base/Dockerfile # (in case you need to override them in tests) diff --git a/docker/test/integration/runner/dockerd-entrypoint.sh b/docker/test/integration/runner/dockerd-entrypoint.sh index fe47fc90951..3c4ff522b36 100755 --- a/docker/test/integration/runner/dockerd-entrypoint.sh +++ b/docker/test/integration/runner/dockerd-entrypoint.sh @@ -12,6 +12,17 @@ echo '{ "registry-mirrors" : ["http://dockerhub-proxy.dockerhub-proxy-zone:5000"] }' | dd of=/etc/docker/daemon.json 2>/dev/null +if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + # move the processes from the root group to the /init group, + # otherwise writing subtree_control fails with EBUSY. + # An error during moving non-existent process (i.e., "cat") is ignored. + mkdir -p /sys/fs/cgroup/init + xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || : + # enable controllers + sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers \ + > /sys/fs/cgroup/cgroup.subtree_control +fi + # In case of test hung it is convenient to use pytest --pdb to debug it, # and on hung you can simply press Ctrl-C and it will spawn a python pdb, # but on SIGINT dockerd will exit, so ignore it to preserve the daemon. @@ -52,6 +63,8 @@ export CLICKHOUSE_TESTS_BASE_CONFIG_DIR=/clickhouse-config export CLICKHOUSE_ODBC_BRIDGE_BINARY_PATH=/clickhouse-odbc-bridge export CLICKHOUSE_LIBRARY_BRIDGE_BINARY_PATH=/clickhouse-library-bridge +export DOCKER_BASE_TAG=${DOCKER_BASE_TAG:=latest} +export DOCKER_HELPER_TAG=${DOCKER_HELPER_TAG:=latest} export DOCKER_MYSQL_GOLANG_CLIENT_TAG=${DOCKER_MYSQL_GOLANG_CLIENT_TAG:=latest} export DOCKER_DOTNET_CLIENT_TAG=${DOCKER_DOTNET_CLIENT_TAG:=latest} export DOCKER_MYSQL_JAVA_CLIENT_TAG=${DOCKER_MYSQL_JAVA_CLIENT_TAG:=latest} diff --git a/docker/test/sqllogic/Dockerfile b/docker/test/sqllogic/Dockerfile index 83dcf7e1f56..5cf71e4d3f8 100644 --- a/docker/test/sqllogic/Dockerfile +++ b/docker/test/sqllogic/Dockerfile @@ -13,6 +13,7 @@ RUN apt-get update --yes \ sqlite3 \ unixodbc \ unixodbc-dev \ + odbcinst \ sudo \ && apt-get clean diff --git a/docker/test/stateful/Dockerfile b/docker/test/stateful/Dockerfile index 71a2e92e3a8..f513735a2d0 100644 --- a/docker/test/stateful/Dockerfile +++ b/docker/test/stateful/Dockerfile @@ -16,8 +16,9 @@ COPY s3downloader /s3downloader ENV S3_URL="https://clickhouse-datasets.s3.amazonaws.com" ENV DATASETS="hits visits" -RUN npm install -g azurite -RUN npm install tslib +# The following is already done in clickhouse/stateless-test +# RUN npm install -g azurite +# RUN npm install tslib COPY run.sh / CMD ["/bin/bash", "/run.sh"] diff --git a/docker/test/stateless/Dockerfile b/docker/test/stateless/Dockerfile index 40109255a7e..32996140521 100644 --- a/docker/test/stateless/Dockerfile +++ b/docker/test/stateless/Dockerfile @@ -20,6 +20,7 @@ RUN apt-get update -y \ netcat-openbsd \ nodejs \ npm \ + odbcinst \ openjdk-11-jre-headless \ openssl \ postgresql-client \ @@ -71,7 +72,7 @@ RUN arch=${TARGETARCH:-amd64} \ && chmod +x ./mc ./minio -RUN wget 'https://dlcdn.apache.org/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz' \ +RUN wget --no-verbose 'https://dlcdn.apache.org/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz' \ && tar -xvf hadoop-3.3.1.tar.gz \ && rm -rf hadoop-3.3.1.tar.gz @@ -79,8 +80,8 @@ ENV MINIO_ROOT_USER="clickhouse" ENV MINIO_ROOT_PASSWORD="clickhouse" ENV EXPORT_S3_STORAGE_POLICIES=1 -RUN npm install -g azurite -RUN npm install tslib +RUN npm install -g azurite \ + && npm install -g tslib COPY run.sh / COPY setup_minio.sh / diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index 21cb3168083..fe53925ecc8 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -18,6 +18,9 @@ ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test # shellcheck disable=SC1091 source /usr/share/clickhouse-test/ci/attach_gdb.lib || true # FIXME: to not break old builds, clean on 2023-09-01 +# shellcheck disable=SC1091 +source /usr/share/clickhouse-test/ci/utils.lib || true # FIXME: to not break old builds, clean on 2023-09-01 + # install test configs /usr/share/clickhouse-test/config/install.sh @@ -90,6 +93,22 @@ sleep 5 attach_gdb_to_clickhouse || true # FIXME: to not break old builds, clean on 2023-09-01 +function fn_exists() { + declare -F "$1" > /dev/null; +} + +# FIXME: to not break old builds, clean on 2023-09-01 +function try_run_with_retry() { + local total_retries="$1" + shift + + if fn_exists run_with_retry; then + run_with_retry "$total_retries" "$@" + else + "$@" + fi +} + function run_tests() { set -x @@ -137,8 +156,7 @@ function run_tests() ADDITIONAL_OPTIONS+=('--report-logs-stats') - clickhouse-test "00001_select_1" > /dev/null ||: - clickhouse-client -q "insert into system.zookeeper (name, path, value) values ('auxiliary_zookeeper2', '/test/chroot/', '')" ||: + try_run_with_retry 10 clickhouse-client -q "insert into system.zookeeper (name, path, value) values ('auxiliary_zookeeper2', '/test/chroot/', '')" set +e clickhouse-test --testname --shard --zookeeper --check-zookeeper-session --hung-check --print-time \ diff --git a/docker/test/style/Dockerfile b/docker/test/style/Dockerfile index 746cc7bb2d5..2aa0b1a62d6 100644 --- a/docker/test/style/Dockerfile +++ b/docker/test/style/Dockerfile @@ -18,7 +18,7 @@ RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \ python3-pip \ shellcheck \ yamllint \ - && pip3 install black==23.1.0 boto3 codespell==2.2.1 dohq-artifactory mypy PyGithub unidiff pylint==2.6.2 \ + && pip3 install black==23.1.0 boto3 codespell==2.2.1 mypy==1.3.0 PyGithub unidiff pylint==2.6.2 \ && apt-get clean \ && rm -rf /root/.cache/pip diff --git a/docker/test/upgrade/run.sh b/docker/test/upgrade/run.sh index e72c28b7167..82a88272df9 100644 --- a/docker/test/upgrade/run.sh +++ b/docker/test/upgrade/run.sh @@ -76,7 +76,8 @@ sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-serv # But we still need default disk because some tables loaded only into it sudo cat /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml \ | sed "s|
s3
|
s3
default|" \ - > /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml.tmp mv /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml.tmp /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml + > /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml.tmp +mv /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml.tmp /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml sudo chown clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml sudo chgrp clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml @@ -189,6 +190,7 @@ rg -Fav -e "Code: 236. DB::Exception: Cancelled merging parts" \ -e "Authentication failed" \ -e "Cannot flush" \ -e "Container already exists" \ + -e "doesn't have metadata version on disk" \ clickhouse-server.upgrade.log \ | grep -av -e "_repl_01111_.*Mapping for table with UUID" \ | zgrep -Fa "" > /test_output/upgrade_error_messages.txt \ diff --git a/docker/test/util/Dockerfile b/docker/test/util/Dockerfile index a49278e960b..85e888f1df7 100644 --- a/docker/test/util/Dockerfile +++ b/docker/test/util/Dockerfile @@ -1,5 +1,5 @@ # docker build -t clickhouse/test-util . -FROM ubuntu:20.04 +FROM ubuntu:22.04 # ARG for quick switch to a given ubuntu mirror ARG apt_archive="http://archive.ubuntu.com" diff --git a/docs/_description_templates/template-data-type.md b/docs/_description_templates/template-data-type.md new file mode 100644 index 00000000000..239edb2808b --- /dev/null +++ b/docs/_description_templates/template-data-type.md @@ -0,0 +1,29 @@ +--- +toc_priority: +toc_title: +--- + +# data_type_name {#data_type-name} + +Description. + +**Parameters** (Optional) + +- `x` — Description. [Type name](relative/path/to/type/dscr.md#type). +- `y` — Description. [Type name](relative/path/to/type/dscr.md#type). + +**Examples** + +```sql + +``` + +## Additional Info {#additional-info} (Optional) + +The name of an additional section can be any, for example, **Usage**. + +**See Also** (Optional) + +- [link](#) + +[Original article](https://clickhouse.com/docs/en/data-types//) diff --git a/docs/_description_templates/template-engine.md b/docs/_description_templates/template-engine.md new file mode 100644 index 00000000000..392bc59ed33 --- /dev/null +++ b/docs/_description_templates/template-engine.md @@ -0,0 +1,63 @@ +# EngineName {#enginename} + +- What the Database/Table engine does. +- Relations with other engines if they exist. + +## Creating a Database {#creating-a-database} +``` sql + CREATE DATABASE ... +``` +or + +## Creating a Table {#creating-a-table} +``` sql + CREATE TABLE ... +``` + +**Engine Parameters** + +**Query Clauses** (for Table engines only) + +## Virtual columns {#virtual-columns} (for Table engines only) + +List and virtual columns with description, if they exist. + +## Data Types Support {#data_types-support} (for Database engines only) + +| EngineName | ClickHouse | +|-----------------------|------------------------------------| +| NativeDataTypeName | [ClickHouseDataTypeName](link#) | + + +## Specifics and recommendations {#specifics-and-recommendations} + +Algorithms +Specifics of read and write processes +Examples of tasks +Recommendations for usage +Specifics of data storage + +## Usage Example {#usage-example} + +The example must show usage and use cases. The following text contains the recommended parts of this section. + +Input table: + +``` text +``` + +Query: + +``` sql +``` + +Result: + +``` text +``` + +Follow up with any text to clarify the example. + +**See Also** + +- [link](#) diff --git a/docs/_description_templates/template-function.md b/docs/_description_templates/template-function.md new file mode 100644 index 00000000000..6bdc764c449 --- /dev/null +++ b/docs/_description_templates/template-function.md @@ -0,0 +1,51 @@ +## functionName {#functionname-in-lower-case} + +Short description. + +**Syntax** (without SELECT) + +``` sql + +``` + +Alias: ``. (Optional) + +More text (Optional). + +**Arguments** (Optional) + +- `x` — Description. Optional (only for optional arguments). Possible values: . Default value: . [Type name](relative/path/to/type/dscr.md#type). +- `y` — Description. Optional (only for optional arguments). Possible values: .Default value: . [Type name](relative/path/to/type/dscr.md#type). + +**Parameters** (Optional, only for parametric aggregate functions) + +- `z` — Description. Optional (only for optional parameters). Possible values: . Default value: . [Type name](relative/path/to/type/dscr.md#type). + +**Returned value(s)** + +- Returned values list. + +Type: [Type name](relative/path/to/type/dscr.md#type). + +**Example** + +The example must show usage and/or a use cases. The following text contains recommended parts of an example. + +Input table (Optional): + +``` text +``` + +Query: + +``` sql +``` + +Result: + +``` text +``` + +**See Also** (Optional) + +- [link](#) diff --git a/docs/_description_templates/template-server-setting.md b/docs/_description_templates/template-server-setting.md new file mode 100644 index 00000000000..0b37d46cf41 --- /dev/null +++ b/docs/_description_templates/template-server-setting.md @@ -0,0 +1,33 @@ +## server_setting_name {#server_setting_name} + +Description. + +Describe what is configured in this section of settings. + +Possible value: ... + +Default value: ... + +**Settings** (Optional) + +If the section contains several settings, list them here. Specify possible values and default values: + +- setting_1 — Description. +- setting_2 — Description. + +**Example** + +```xml + + ... + ... + +``` + +**Additional Info** (Optional) + +The name of an additional section can be any, for example, **Usage**. + +**See Also** (Optional) + +- [link](#) diff --git a/docs/_description_templates/template-setting.md b/docs/_description_templates/template-setting.md new file mode 100644 index 00000000000..fc912aba3e1 --- /dev/null +++ b/docs/_description_templates/template-setting.md @@ -0,0 +1,27 @@ +## setting_name {#setting_name} + +Description. + +For the switch setting, use the typical phrase: “Enables or disables something …”. + +Possible values: + +*For switcher setting:* + +- 0 — Disabled. +- 1 — Enabled. + +*For another setting (typical phrases):* + +- Positive integer. +- 0 — Disabled or unlimited or something else. + +Default value: `value`. + +**Additional Info** (Optional) + +The name of an additional section can be any, for example, **Usage**. + +**See Also** (Optional) + +- [link](#) diff --git a/docs/_description_templates/template-statement.md b/docs/_description_templates/template-statement.md new file mode 100644 index 00000000000..238570c2217 --- /dev/null +++ b/docs/_description_templates/template-statement.md @@ -0,0 +1,24 @@ +# Statement name (for example, SHOW USER) {#statement-name-in-lower-case} + +Brief description of what the statement does. + +**Syntax** + +```sql +Syntax of the statement. +``` + +## Other necessary sections of the description (Optional) {#anchor} + +Examples of descriptions with a complicated structure: + +- https://clickhouse.com/docs/en/sql-reference/statements/grant/ +- https://clickhouse.com/docs/en/sql-reference/statements/revoke/ +- https://clickhouse.com/docs/en/sql-reference/statements/select/join/ + + +**See Also** (Optional) + +Links to related topics as a list. + +- [link](#) diff --git a/docs/_description_templates/template-system-table.md b/docs/_description_templates/template-system-table.md new file mode 100644 index 00000000000..f2decc4bb6d --- /dev/null +++ b/docs/_description_templates/template-system-table.md @@ -0,0 +1,25 @@ +# system.table_name {#system-tables_table-name} + +Description. + +Columns: + +- `column_name` ([data_type_name](path/to/data_type.md)) — Description. + +**Example** + +Query: + +``` sql +SELECT * FROM system.table_name +``` + +Result: + +``` text +Some output. It shouldn't be too long. +``` + +**See Also** + +- [Article name](path/to/article_name.md) — Some words about referenced information. diff --git a/docs/changelogs/v23.3.6.7-lts.md b/docs/changelogs/v23.3.6.7-lts.md new file mode 100644 index 00000000000..387cc126aba --- /dev/null +++ b/docs/changelogs/v23.3.6.7-lts.md @@ -0,0 +1,19 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v23.3.6.7-lts (7e3f0a271b7) FIXME as compared to v23.3.5.9-lts (f5fbc2fd2b3) + +#### Improvement +* Backported in [#51240](https://github.com/ClickHouse/ClickHouse/issues/51240): Improve the progress bar for file/s3/hdfs/url table functions by using chunk size from source data and using incremental total size counting in each thread. Fix the progress bar for *Cluster functions. This closes [#47250](https://github.com/ClickHouse/ClickHouse/issues/47250). [#51088](https://github.com/ClickHouse/ClickHouse/pull/51088) ([Kruglov Pavel](https://github.com/Avogar)). + +#### Build/Testing/Packaging Improvement +* Backported in [#51529](https://github.com/ClickHouse/ClickHouse/issues/51529): Split huge `RUN` in Dockerfile into smaller conditional. Install the necessary tools on demand in the same `RUN` layer, and remove them after that. Upgrade the OS only once at the beginning. Use a modern way to check the signed repository. Downgrade the base repo to ubuntu:20.04 to address the issues on older docker versions. Upgrade golang version to address golang vulnerabilities. [#51504](https://github.com/ClickHouse/ClickHouse/pull/51504) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Fix type of LDAP server params hash in cache entry [#50865](https://github.com/ClickHouse/ClickHouse/pull/50865) ([Julian Maicher](https://github.com/jmaicher)). + diff --git a/docs/changelogs/v23.3.7.5-lts.md b/docs/changelogs/v23.3.7.5-lts.md new file mode 100644 index 00000000000..7a5fd5a19b6 --- /dev/null +++ b/docs/changelogs/v23.3.7.5-lts.md @@ -0,0 +1,16 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v23.3.7.5-lts (bc683c11c92) FIXME as compared to v23.3.6.7-lts (7e3f0a271b7) + +#### Build/Testing/Packaging Improvement +* Backported in [#51568](https://github.com/ClickHouse/ClickHouse/issues/51568): This a follow-up for [#51504](https://github.com/ClickHouse/ClickHouse/issues/51504), the cleanup was lost during refactoring. [#51564](https://github.com/ClickHouse/ClickHouse/pull/51564) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Fix fuzzer failure in ActionsDAG [#51301](https://github.com/ClickHouse/ClickHouse/pull/51301) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + diff --git a/docs/changelogs/v23.4.5.22-stable.md b/docs/changelogs/v23.4.5.22-stable.md new file mode 100644 index 00000000000..2d61f5b11cf --- /dev/null +++ b/docs/changelogs/v23.4.5.22-stable.md @@ -0,0 +1,27 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v23.4.5.22-stable (0ced5d6a8da) FIXME as compared to v23.4.4.16-stable (747ba4fc6a0) + +#### Build/Testing/Packaging Improvement +* Backported in [#51530](https://github.com/ClickHouse/ClickHouse/issues/51530): Split huge `RUN` in Dockerfile into smaller conditional. Install the necessary tools on demand in the same `RUN` layer, and remove them after that. Upgrade the OS only once at the beginning. Use a modern way to check the signed repository. Downgrade the base repo to ubuntu:20.04 to address the issues on older docker versions. Upgrade golang version to address golang vulnerabilities. [#51504](https://github.com/ClickHouse/ClickHouse/pull/51504) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#51570](https://github.com/ClickHouse/ClickHouse/issues/51570): This a follow-up for [#51504](https://github.com/ClickHouse/ClickHouse/issues/51504), the cleanup was lost during refactoring. [#51564](https://github.com/ClickHouse/ClickHouse/pull/51564) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Fix broken index analysis when binary operator contains a null constant argument [#50177](https://github.com/ClickHouse/ClickHouse/pull/50177) ([Amos Bird](https://github.com/amosbird)). +* Fix reconnecting of HTTPS session when target host IP was changed [#50240](https://github.com/ClickHouse/ClickHouse/pull/50240) ([Aleksei Filatov](https://github.com/aalexfvk)). +* Fix incorrect constant folding [#50536](https://github.com/ClickHouse/ClickHouse/pull/50536) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix type of LDAP server params hash in cache entry [#50865](https://github.com/ClickHouse/ClickHouse/pull/50865) ([Julian Maicher](https://github.com/jmaicher)). +* Fallback to parsing big integer from String instead of exception in Parquet format [#50873](https://github.com/ClickHouse/ClickHouse/pull/50873) ([Kruglov Pavel](https://github.com/Avogar)). +* Do not apply projection if read-in-order was enabled. [#50923](https://github.com/ClickHouse/ClickHouse/pull/50923) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix fuzzer failure in ActionsDAG [#51301](https://github.com/ClickHouse/ClickHouse/pull/51301) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Increase max array size in group bitmap [#50620](https://github.com/ClickHouse/ClickHouse/pull/50620) ([Kruglov Pavel](https://github.com/Avogar)). + diff --git a/docs/changelogs/v23.5.4.25-stable.md b/docs/changelogs/v23.5.4.25-stable.md new file mode 100644 index 00000000000..53d3a7c9c0a --- /dev/null +++ b/docs/changelogs/v23.5.4.25-stable.md @@ -0,0 +1,31 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v23.5.4.25-stable (190f962abcf) FIXME as compared to v23.5.3.24-stable (76f54616d3b) + +#### Improvement +* Backported in [#51235](https://github.com/ClickHouse/ClickHouse/issues/51235): Improve the progress bar for file/s3/hdfs/url table functions by using chunk size from source data and using incremental total size counting in each thread. Fix the progress bar for *Cluster functions. This closes [#47250](https://github.com/ClickHouse/ClickHouse/issues/47250). [#51088](https://github.com/ClickHouse/ClickHouse/pull/51088) ([Kruglov Pavel](https://github.com/Avogar)). +* Backported in [#51255](https://github.com/ClickHouse/ClickHouse/issues/51255): Disable cache setting `do_not_evict_index_and_mark_files` (Was enabled in `23.5`). [#51222](https://github.com/ClickHouse/ClickHouse/pull/51222) ([Kseniia Sumarokova](https://github.com/kssenii)). + +#### Build/Testing/Packaging Improvement +* Backported in [#51531](https://github.com/ClickHouse/ClickHouse/issues/51531): Split huge `RUN` in Dockerfile into smaller conditional. Install the necessary tools on demand in the same `RUN` layer, and remove them after that. Upgrade the OS only once at the beginning. Use a modern way to check the signed repository. Downgrade the base repo to ubuntu:20.04 to address the issues on older docker versions. Upgrade golang version to address golang vulnerabilities. [#51504](https://github.com/ClickHouse/ClickHouse/pull/51504) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#51572](https://github.com/ClickHouse/ClickHouse/issues/51572): This a follow-up for [#51504](https://github.com/ClickHouse/ClickHouse/issues/51504), the cleanup was lost during refactoring. [#51564](https://github.com/ClickHouse/ClickHouse/pull/51564) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Query Cache: Try to fix bad cast from ColumnConst to ColumnVector [#50704](https://github.com/ClickHouse/ClickHouse/pull/50704) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix type of LDAP server params hash in cache entry [#50865](https://github.com/ClickHouse/ClickHouse/pull/50865) ([Julian Maicher](https://github.com/jmaicher)). +* Fallback to parsing big integer from String instead of exception in Parquet format [#50873](https://github.com/ClickHouse/ClickHouse/pull/50873) ([Kruglov Pavel](https://github.com/Avogar)). +* Do not apply projection if read-in-order was enabled. [#50923](https://github.com/ClickHouse/ClickHouse/pull/50923) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix race azure blob storage iterator [#50936](https://github.com/ClickHouse/ClickHouse/pull/50936) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Fix ineffective query cache for SELECTs with subqueries [#51132](https://github.com/ClickHouse/ClickHouse/pull/51132) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix fuzzer failure in ActionsDAG [#51301](https://github.com/ClickHouse/ClickHouse/pull/51301) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Fix ParallelReadBuffer seek [#50820](https://github.com/ClickHouse/ClickHouse/pull/50820) ([Michael Kolupaev](https://github.com/al13n321)). + diff --git a/docs/changelogs/v23.6.1.1524-stable.md b/docs/changelogs/v23.6.1.1524-stable.md new file mode 100644 index 00000000000..6d295d61ef4 --- /dev/null +++ b/docs/changelogs/v23.6.1.1524-stable.md @@ -0,0 +1,301 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v23.6.1.1524-stable (d1c7e13d088) FIXME as compared to v23.5.1.3174-stable (2fec796e73e) + +#### Backward Incompatible Change +* Delete feature `do_not_evict_index_and_mark_files` in the fs cache. This feature was only making things worse. [#51253](https://github.com/ClickHouse/ClickHouse/pull/51253) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Remove ALTER support for experimental LIVE VIEW. [#51287](https://github.com/ClickHouse/ClickHouse/pull/51287) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### New Feature +* Add setting `session_timezone`, it is used as default timezone for session when not explicitly specified. [#44149](https://github.com/ClickHouse/ClickHouse/pull/44149) ([Andrey Zvonov](https://github.com/zvonand)). +* Added overlay database engine and representation of a directory as a database This commit adds 4 databases: 1. DatabaseOverlay: Implements the IDatabase interface. Allow to combine multiple databases, such as FileSystem and Memory. Internally, it stores a vector with other database pointers and proxies requests to them in turn until it is executed successfully. 2. DatabaseFilesystem: allows to read-only interact with files stored on the file system. Internally, it uses TableFunctionFile to implicitly load file when a user requests the table. Result of TableFunctionFile call cached inside to provide quick access. 3. DatabaseS3: allows to read-only interact with s3 storage. It uses TableFunctionS3 to implicitly load table from s3 4. DatabaseHDFS: allows to interact with hdfs storage. It uses TableFunctionHDFS to implicitly load table from hdfs. [#48821](https://github.com/ClickHouse/ClickHouse/pull/48821) ([alekseygolub](https://github.com/alekseygolub)). +* Add a new setting named `use_mysql_types_in_show_columns` to alter the `SHOW COLUMNS` SQL statement to display MySQL equivalent types when a client is connected via the MySQL compatibility port. [#49577](https://github.com/ClickHouse/ClickHouse/pull/49577) ([Thomas Panetti](https://github.com/tpanetti)). +* Added option `--rename_files_after_processing `. This closes [#34207](https://github.com/ClickHouse/ClickHouse/issues/34207). [#49626](https://github.com/ClickHouse/ClickHouse/pull/49626) ([alekseygolub](https://github.com/alekseygolub)). +* 1. Add `TableFunctionRedis` 3. Add table engine Redis 4. Add `RedisCommon` which contains Redis related tools and types 5. Support `equals` and `in` filter push down into Redis. [#50150](https://github.com/ClickHouse/ClickHouse/pull/50150) ([JackyWoo](https://github.com/JackyWoo)). +* Allow to skip empty files in file/s3/url/hdfs table functions using settings `s3_skip_empty_files`, `hdfs_skip_empty_files`, `engine_file_skip_empty_files`, `engine_url_skip_empty_files`. [#50364](https://github.com/ClickHouse/ClickHouse/pull/50364) ([Kruglov Pavel](https://github.com/Avogar)). +* Clickhouse-client can now be called with a connection instead of "--host", "--port", "--user" etc. [#50689](https://github.com/ClickHouse/ClickHouse/pull/50689) ([Alexey Gerasimchuck](https://github.com/Demilivor)). +* Codec DEFLATE_QPL is now controlled via server setting "enable_deflate_qpl_codec" (default: false) instead of setting "allow_experimental_codecs". This marks QPL_DEFLATE non-experimental. [#50775](https://github.com/ClickHouse/ClickHouse/pull/50775) ([Robert Schulze](https://github.com/rschu1ze)). + +#### Performance Improvement +* Improve performance with enabled QueryProfiler using thread-local timer_id instead of global object. [#48778](https://github.com/ClickHouse/ClickHouse/pull/48778) ([Jiebin Sun](https://github.com/jiebinn)). +* Rewrite CapnProto input/output format to improve its performance. Map column names and CapnProto fields case insensitive, fix reading/writing of nested structure fields. [#49752](https://github.com/ClickHouse/ClickHouse/pull/49752) ([Kruglov Pavel](https://github.com/Avogar)). +* Optimize parquet write performance for parallel threads. [#50102](https://github.com/ClickHouse/ClickHouse/pull/50102) ([Hongbin Ma](https://github.com/binmahone)). +* ### Documentation entry for user-facing changes Disable `parallelize_output_from_storages` for processing MATERIALIZED VIEWs and storages with one block only. [#50214](https://github.com/ClickHouse/ClickHouse/pull/50214) ([Azat Khuzhin](https://github.com/azat)). +* Merge PR https://github.com/ClickHouse/ClickHouse/pull/46558 (Avoid processing already sorted data). Avoid block permutation during sort if the block is already sorted. [#50697](https://github.com/ClickHouse/ClickHouse/pull/50697) ([Maksim Kita](https://github.com/kitaisreal)). +* In the earlier PRs ([#50062](https://github.com/ClickHouse/ClickHouse/issues/50062), [#50307](https://github.com/ClickHouse/ClickHouse/issues/50307)), we used to propose an optimization pattern which transforms the predicates with toYear/toYYYYMM into its equivalent but converter-free form. This transformation could bring significant performance impact to some workloads, such as SSB. However, as issue [#50628](https://github.com/ClickHouse/ClickHouse/issues/50628) indicated, these two PRs would introduce some issues which may results in incomplete query results, and as a result, they were reverted by [#50629](https://github.com/ClickHouse/ClickHouse/issues/50629). [#50951](https://github.com/ClickHouse/ClickHouse/pull/50951) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). +* Make multiple list requests to ZooKeeper in parallel to speed up reading from system.zookeeper table. [#51042](https://github.com/ClickHouse/ClickHouse/pull/51042) ([Alexander Gololobov](https://github.com/davenger)). +* Speedup initialization of DateTime lookup tables for time zones. This should reduce startup/connect time of clickhouse client especially in debug build as it is rather heavy. [#51347](https://github.com/ClickHouse/ClickHouse/pull/51347) ([Alexander Gololobov](https://github.com/davenger)). + +#### Improvement +* Allow to cast IPv6 to IPv4 address for CIDR ::ffff:0:0/96 (IPv4-mapped addresses). [#49759](https://github.com/ClickHouse/ClickHouse/pull/49759) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Update MongoDB protocol to support MongoDB 5.1 version and newer. Support for the versions with the old protocol (<3.6) is preserved. Closes [#45621](https://github.com/ClickHouse/ClickHouse/issues/45621), [#49879](https://github.com/ClickHouse/ClickHouse/issues/49879). [#50061](https://github.com/ClickHouse/ClickHouse/pull/50061) ([Nikolay Degterinsky](https://github.com/evillique)). +* Improved scheduling of merge selecting and cleanup tasks in `ReplicatedMergeTree`. The tasks will not be executed too frequently when there's nothing to merge or cleanup. Added settings `max_merge_selecting_sleep_ms`, `merge_selecting_sleep_slowdown_factor`, `max_cleanup_delay_period` and `cleanup_thread_preferred_points_per_iteration`. It should close [#31919](https://github.com/ClickHouse/ClickHouse/issues/31919). [#50107](https://github.com/ClickHouse/ClickHouse/pull/50107) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Support parallel replicas with the analyzer. [#50441](https://github.com/ClickHouse/ClickHouse/pull/50441) ([Raúl Marín](https://github.com/Algunenano)). +* Add setting `input_format_max_bytes_to_read_for_schema_inference` to limit the number of bytes to read in schema inference. Closes [#50577](https://github.com/ClickHouse/ClickHouse/issues/50577). [#50592](https://github.com/ClickHouse/ClickHouse/pull/50592) ([Kruglov Pavel](https://github.com/Avogar)). +* Respect setting input_format_as_default in schema inference. [#50602](https://github.com/ClickHouse/ClickHouse/pull/50602) ([Kruglov Pavel](https://github.com/Avogar)). +* Make filter push down through cross join. [#50605](https://github.com/ClickHouse/ClickHouse/pull/50605) ([Han Fei](https://github.com/hanfei1991)). +* Actual lz4 version is used now. [#50621](https://github.com/ClickHouse/ClickHouse/pull/50621) ([Nikita Taranov](https://github.com/nickitat)). +* Allow to skip trailing empty lines in CSV/TSV/CustomSeparated formats via settings `input_format_csv_skip_trailing_empty_lines`, `input_format_tsv_skip_trailing_empty_lines` and `input_format_custom_skip_trailing_empty_lines` (disabled by default). Closes [#49315](https://github.com/ClickHouse/ClickHouse/issues/49315). [#50635](https://github.com/ClickHouse/ClickHouse/pull/50635) ([Kruglov Pavel](https://github.com/Avogar)). +* Functions "toDateOrDefault|OrNull()" and "accuateCast[OrDefault|OrNull]()" now correctly parse numeric arguments. [#50709](https://github.com/ClickHouse/ClickHouse/pull/50709) ([Dmitry Kardymon](https://github.com/kardymonds)). +* Currently, the csv input format can not parse the csv file with whitespace or \t field delimiter, and these delimiters is supported in spark. [#50712](https://github.com/ClickHouse/ClickHouse/pull/50712) ([KevinyhZou](https://github.com/KevinyhZou)). +* Settings `number_of_mutations_to_delay` and `number_of_mutations_to_throw` are enabled by default now with values 500 and 1000 respectively. [#50726](https://github.com/ClickHouse/ClickHouse/pull/50726) ([Anton Popov](https://github.com/CurtizJ)). +* Keeper improvement: add feature flags for Keeper API. Each feature flag can be disabled or enabled by defining it under `keeper_server.feature_flags` config. E.g. to enable `CheckNotExists` request, `keeper_server.feature_flags.check_not_exists` should be set to `1` on Keeper. [#50796](https://github.com/ClickHouse/ClickHouse/pull/50796) ([Antonio Andelic](https://github.com/antonio2368)). +* The dashboard correctly shows missing values. This closes [#50831](https://github.com/ClickHouse/ClickHouse/issues/50831). [#50832](https://github.com/ClickHouse/ClickHouse/pull/50832) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* CGroups metrics related to CPU are replaced with one metric, `CGroupMaxCPU` for better usability. The `Normalized` CPU usage metrics will be normalized to CGroups limits instead of the total number of CPUs when they are set. This closes [#50836](https://github.com/ClickHouse/ClickHouse/issues/50836). [#50835](https://github.com/ClickHouse/ClickHouse/pull/50835) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Relax the thresholds for "too many parts" to be more modern. Return the backpressure during long-running insert queries. [#50856](https://github.com/ClickHouse/ClickHouse/pull/50856) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Added the possibility to use date and time arguments in syslog timestamp format in functions parseDateTimeBestEffort*() and parseDateTime64BestEffort*(). [#50925](https://github.com/ClickHouse/ClickHouse/pull/50925) ([Victor Krasnov](https://github.com/sirvickr)). +* Suggest using `APPEND` or `TRUNCATE` for `INTO OUTFILE` when file exists. [#50950](https://github.com/ClickHouse/ClickHouse/pull/50950) ([alekar](https://github.com/alekar)). +* Add embedded keeper-client to standalone keeper binary. [#50964](https://github.com/ClickHouse/ClickHouse/pull/50964) ([pufit](https://github.com/pufit)). +* Command line parameter "--password" in clickhouse-client can now be specified only once. [#50966](https://github.com/ClickHouse/ClickHouse/pull/50966) ([Alexey Gerasimchuck](https://github.com/Demilivor)). +* Fix data lakes slowness because of synchronous head requests. (Related to Iceberg/Deltalake/Hudi being slow with a lot of files). [#50976](https://github.com/ClickHouse/ClickHouse/pull/50976) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Use `hash_of_all_files` from `system.parts` to check identity of parts during on-cluster backups. [#50997](https://github.com/ClickHouse/ClickHouse/pull/50997) ([Vitaly Baranov](https://github.com/vitlibar)). +* The system table zookeeper_connection connected_time identifies the time when the connection is established (standard format), and session_uptime_elapsed_seconds is added, which labels the duration of the established connection session (in seconds). [#51026](https://github.com/ClickHouse/ClickHouse/pull/51026) ([郭小龙](https://github.com/guoxiaolongzte)). +* Show halves of checksums in `system.parts`, `system.projection_parts` and in error messages in the correct order. [#51040](https://github.com/ClickHouse/ClickHouse/pull/51040) ([Vitaly Baranov](https://github.com/vitlibar)). +* Do not replicate `ALTER PARTITION` queries and mutations through `Replicated` database if it has only one shard and the underlying table is `ReplicatedMergeTree`. [#51049](https://github.com/ClickHouse/ClickHouse/pull/51049) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Improve the progress bar for file/s3/hdfs/url table functions by using chunk size from source data and using incremental total size counting in each thread. Fix the progress bar for *Cluster functions. This closes [#47250](https://github.com/ClickHouse/ClickHouse/issues/47250). [#51088](https://github.com/ClickHouse/ClickHouse/pull/51088) ([Kruglov Pavel](https://github.com/Avogar)). +* Add total_bytes_to_read to Progress packet in TCP protocol for better Progress bar. [#51158](https://github.com/ClickHouse/ClickHouse/pull/51158) ([Kruglov Pavel](https://github.com/Avogar)). +* Better checking of data parts on disks with filesystem cache. [#51164](https://github.com/ClickHouse/ClickHouse/pull/51164) ([Anton Popov](https://github.com/CurtizJ)). +* Disable cache setting `do_not_evict_index_and_mark_files` (Was enabled in `23.5`). [#51222](https://github.com/ClickHouse/ClickHouse/pull/51222) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix sometimes not correct current_elements_num in fs cache. [#51242](https://github.com/ClickHouse/ClickHouse/pull/51242) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Add random sleep before merges/mutations execution to split load more evenly between replicas in case of zero-copy replication. [#51282](https://github.com/ClickHouse/ClickHouse/pull/51282) ([alesapin](https://github.com/alesapin)). +* The function `transform` as well as `CASE` with value matching started to support all data types. This closes [#29730](https://github.com/ClickHouse/ClickHouse/issues/29730). This closes [#32387](https://github.com/ClickHouse/ClickHouse/issues/32387). This closes [#50827](https://github.com/ClickHouse/ClickHouse/issues/50827). This closes [#31336](https://github.com/ClickHouse/ClickHouse/issues/31336). This closes [#40493](https://github.com/ClickHouse/ClickHouse/issues/40493). [#51351](https://github.com/ClickHouse/ClickHouse/pull/51351) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* We have found a bug in LLVM that makes the usage of `compile_expressions` setting unsafe. It is disabled by default. [#51368](https://github.com/ClickHouse/ClickHouse/pull/51368) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Issue [#50220](https://github.com/ClickHouse/ClickHouse/issues/50220) reports a core in `grace_hash` join. We finally reproduce the exception on local, and found that the issue is related to the failure of creating temporary file. Somehow this is triggered in https://github.com/ClickHouse/ClickHouse/pull/49816 https://github.com/ClickHouse/ClickHouse/pull/49483. [#51382](https://github.com/ClickHouse/ClickHouse/pull/51382) ([lgbo](https://github.com/lgbo-ustc)). + +#### Build/Testing/Packaging Improvement +* Update contrib/re2 to 2023-06-02. [#50949](https://github.com/ClickHouse/ClickHouse/pull/50949) ([Yuriy Chernyshov](https://github.com/georgthegreat)). +* ClickHouse server will print the list of changed settings on fatal errors. This closes [#51137](https://github.com/ClickHouse/ClickHouse/issues/51137). [#51138](https://github.com/ClickHouse/ClickHouse/pull/51138) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* In https://github.com/ClickHouse/ClickHouse/pull/51143 the fasstests failed, but the status wasn't created because of the chown `file not found`. This addresses it. Decrease the default values for `http-max-field-value-size` and `http_max_field_name_size` to 128K. [#51163](https://github.com/ClickHouse/ClickHouse/pull/51163) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Update Ubuntu version in docker containers. [#51180](https://github.com/ClickHouse/ClickHouse/pull/51180) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Allow building ClickHouse with clang-17. [#51300](https://github.com/ClickHouse/ClickHouse/pull/51300) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* [SQLancer](https://github.com/sqlancer/sqlancer) check is considered stable as bugs that were triggered by it are fixed. Now failures of SQLancer check will be reported as failed check status. [#51340](https://github.com/ClickHouse/ClickHouse/pull/51340) ([Ilya Yatsishin](https://github.com/qoega)). +* Making our CI even better. [#51494](https://github.com/ClickHouse/ClickHouse/pull/51494) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Split huge `RUN` in Dockerfile into smaller conditional. Install the necessary tools on demand in the same `RUN` layer, and remove them after that. Upgrade the OS only once at the beginning. Use a modern way to check the signed repository. Downgrade the base repo to ubuntu:20.04 to address the issues on older docker versions. Upgrade golang version to address golang vulnerabilities. [#51504](https://github.com/ClickHouse/ClickHouse/pull/51504) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* This a follow-up for [#51504](https://github.com/ClickHouse/ClickHouse/issues/51504), the cleanup was lost during refactoring. [#51564](https://github.com/ClickHouse/ClickHouse/pull/51564) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Report loading status for executable dictionaries correctly [#48775](https://github.com/ClickHouse/ClickHouse/pull/48775) ([Anton Kozlov](https://github.com/tonickkozlov)). +* Proper mutation of skip indices and projections [#50104](https://github.com/ClickHouse/ClickHouse/pull/50104) ([Amos Bird](https://github.com/amosbird)). +* Cleanup moving parts [#50489](https://github.com/ClickHouse/ClickHouse/pull/50489) ([vdimir](https://github.com/vdimir)). +* Fix backward compatibility for IP types hashing in aggregate functions [#50551](https://github.com/ClickHouse/ClickHouse/pull/50551) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Fix Log family table return wrong rows count after truncate [#50585](https://github.com/ClickHouse/ClickHouse/pull/50585) ([flynn](https://github.com/ucasfl)). +* Fix bug in `uniqExact` parallel merging [#50590](https://github.com/ClickHouse/ClickHouse/pull/50590) ([Nikita Taranov](https://github.com/nickitat)). +* Revert recent grace hash join changes [#50699](https://github.com/ClickHouse/ClickHouse/pull/50699) ([vdimir](https://github.com/vdimir)). +* Query Cache: Try to fix bad cast from ColumnConst to ColumnVector [#50704](https://github.com/ClickHouse/ClickHouse/pull/50704) ([Robert Schulze](https://github.com/rschu1ze)). +* Do not read all the columns from right GLOBAL JOIN table. [#50721](https://github.com/ClickHouse/ClickHouse/pull/50721) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Avoid storing logs in Keeper containing unknown operation [#50751](https://github.com/ClickHouse/ClickHouse/pull/50751) ([Antonio Andelic](https://github.com/antonio2368)). +* SummingMergeTree support for DateTime64 [#50797](https://github.com/ClickHouse/ClickHouse/pull/50797) ([Jordi Villar](https://github.com/jrdi)). +* Add compat setting for non-const timezones [#50834](https://github.com/ClickHouse/ClickHouse/pull/50834) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix type of LDAP server params hash in cache entry [#50865](https://github.com/ClickHouse/ClickHouse/pull/50865) ([Julian Maicher](https://github.com/jmaicher)). +* Fallback to parsing big integer from String instead of exception in Parquet format [#50873](https://github.com/ClickHouse/ClickHouse/pull/50873) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix checking the lock file too often while writing a backup [#50889](https://github.com/ClickHouse/ClickHouse/pull/50889) ([Vitaly Baranov](https://github.com/vitlibar)). +* Do not apply projection if read-in-order was enabled. [#50923](https://github.com/ClickHouse/ClickHouse/pull/50923) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix race azure blob storage iterator [#50936](https://github.com/ClickHouse/ClickHouse/pull/50936) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Fix erroneous `sort_description` propagation in `CreatingSets` [#50955](https://github.com/ClickHouse/ClickHouse/pull/50955) ([Nikita Taranov](https://github.com/nickitat)). +* Fix iceberg V2 optional metadata parsing [#50974](https://github.com/ClickHouse/ClickHouse/pull/50974) ([Kseniia Sumarokova](https://github.com/kssenii)). +* MaterializedMySQL: Keep parentheses for empty table overrides [#50977](https://github.com/ClickHouse/ClickHouse/pull/50977) ([Val Doroshchuk](https://github.com/valbok)). +* Fix crash in BackupCoordinationStageSync::setError() [#51012](https://github.com/ClickHouse/ClickHouse/pull/51012) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix subtly broken copy-on-write of ColumnLowCardinality dictionary [#51064](https://github.com/ClickHouse/ClickHouse/pull/51064) ([Michael Kolupaev](https://github.com/al13n321)). +* Generate safe IVs [#51086](https://github.com/ClickHouse/ClickHouse/pull/51086) ([Salvatore Mesoraca](https://github.com/aiven-sal)). +* Fix ineffective query cache for SELECTs with subqueries [#51132](https://github.com/ClickHouse/ClickHouse/pull/51132) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix Set index with constant nullable comparison. [#51205](https://github.com/ClickHouse/ClickHouse/pull/51205) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix a crash in s3 and s3Cluster functions [#51209](https://github.com/ClickHouse/ClickHouse/pull/51209) ([Nikolay Degterinsky](https://github.com/evillique)). +* Fix core dump when compile expression [#51231](https://github.com/ClickHouse/ClickHouse/pull/51231) ([LiuNeng](https://github.com/liuneng1994)). +* Fix use-after-free in StorageURL when switching URLs [#51260](https://github.com/ClickHouse/ClickHouse/pull/51260) ([Michael Kolupaev](https://github.com/al13n321)). +* Updated check for parameterized view [#51272](https://github.com/ClickHouse/ClickHouse/pull/51272) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Fix multiple writing of same file to backup [#51299](https://github.com/ClickHouse/ClickHouse/pull/51299) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix fuzzer failure in ActionsDAG [#51301](https://github.com/ClickHouse/ClickHouse/pull/51301) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Remove garbage from function `transform` [#51350](https://github.com/ClickHouse/ClickHouse/pull/51350) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix MSan report in lowerUTF8/upperUTF8 [#51371](https://github.com/ClickHouse/ClickHouse/pull/51371) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* fs cache: fix a bit incorrect use_count after [#44985](https://github.com/ClickHouse/ClickHouse/issues/44985) [#51406](https://github.com/ClickHouse/ClickHouse/pull/51406) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix segfault in MathUnary [#51499](https://github.com/ClickHouse/ClickHouse/pull/51499) ([Ilya Yatsishin](https://github.com/qoega)). +* Fix logical assert in `tupleElement()` with default values [#51534](https://github.com/ClickHouse/ClickHouse/pull/51534) ([Robert Schulze](https://github.com/rschu1ze)). +* fs cache: remove file from opened file cache immediately when evicting file [#51596](https://github.com/ClickHouse/ClickHouse/pull/51596) ([Kseniia Sumarokova](https://github.com/kssenii)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Deprecate delete-on-destroy.txt [#49181](https://github.com/ClickHouse/ClickHouse/pull/49181) ([Alexander Gololobov](https://github.com/davenger)). +* Attempt to increase the general runners' survival rate [#49283](https://github.com/ClickHouse/ClickHouse/pull/49283) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Refactor subqueries for IN [#49570](https://github.com/ClickHouse/ClickHouse/pull/49570) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Test plan optimization analyzer [#50095](https://github.com/ClickHouse/ClickHouse/pull/50095) ([Igor Nikonov](https://github.com/devcrafter)). +* Implement endianness-independent serialization for quantileTiming [#50324](https://github.com/ClickHouse/ClickHouse/pull/50324) ([ltrk2](https://github.com/ltrk2)). +* require `finalize()` call before d-tor for all writes buffers [#50395](https://github.com/ClickHouse/ClickHouse/pull/50395) ([Sema Checherinda](https://github.com/CheSema)). +* Implement big-endian support for the deterministic reservoir sampler [#50405](https://github.com/ClickHouse/ClickHouse/pull/50405) ([ltrk2](https://github.com/ltrk2)). +* Fix compilation error on big-endian platforms [#50406](https://github.com/ClickHouse/ClickHouse/pull/50406) ([ltrk2](https://github.com/ltrk2)). +* Attach gdb in stateless tests [#50487](https://github.com/ClickHouse/ClickHouse/pull/50487) ([Kruglov Pavel](https://github.com/Avogar)). +* JIT infrastructure refactoring [#50531](https://github.com/ClickHouse/ClickHouse/pull/50531) ([Maksim Kita](https://github.com/kitaisreal)). +* Analyzer: Do not apply Query Tree optimizations on shards [#50584](https://github.com/ClickHouse/ClickHouse/pull/50584) ([Dmitry Novik](https://github.com/novikd)). +* Increase max array size in group bitmap [#50620](https://github.com/ClickHouse/ClickHouse/pull/50620) ([Kruglov Pavel](https://github.com/Avogar)). +* Misc Annoy index improvements [#50661](https://github.com/ClickHouse/ClickHouse/pull/50661) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix reading negative decimals in avro format [#50668](https://github.com/ClickHouse/ClickHouse/pull/50668) ([Kruglov Pavel](https://github.com/Avogar)). +* Unify priorities for connection pools [#50675](https://github.com/ClickHouse/ClickHouse/pull/50675) ([Sergei Trifonov](https://github.com/serxa)). +* Prostpone check of outdated parts [#50676](https://github.com/ClickHouse/ClickHouse/pull/50676) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Unify priorities: `IExecutableTask`s [#50677](https://github.com/ClickHouse/ClickHouse/pull/50677) ([Sergei Trifonov](https://github.com/serxa)). +* Disable grace_hash join in stress tests [#50693](https://github.com/ClickHouse/ClickHouse/pull/50693) ([vdimir](https://github.com/vdimir)). +* ReverseTransform small improvement [#50698](https://github.com/ClickHouse/ClickHouse/pull/50698) ([Maksim Kita](https://github.com/kitaisreal)). +* Support OPTIMIZE for temporary tables [#50710](https://github.com/ClickHouse/ClickHouse/pull/50710) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Refactor reading from object storages [#50711](https://github.com/ClickHouse/ClickHouse/pull/50711) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix data race in log message of cached buffer [#50723](https://github.com/ClickHouse/ClickHouse/pull/50723) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Add new keywords into projections documentation [#50743](https://github.com/ClickHouse/ClickHouse/pull/50743) ([YalalovSM](https://github.com/YalalovSM)). +* Fix build for aarch64 (temporary disable azure) [#50770](https://github.com/ClickHouse/ClickHouse/pull/50770) ([alesapin](https://github.com/alesapin)). +* Update version after release [#50772](https://github.com/ClickHouse/ClickHouse/pull/50772) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Update version_date.tsv and changelogs after v23.5.1.3174-stable [#50774](https://github.com/ClickHouse/ClickHouse/pull/50774) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Update CHANGELOG.md [#50788](https://github.com/ClickHouse/ClickHouse/pull/50788) ([Ilya Yatsishin](https://github.com/qoega)). +* Update version_date.tsv and changelogs after v23.2.7.32-stable [#50809](https://github.com/ClickHouse/ClickHouse/pull/50809) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Desctructing --> Destructing [#50810](https://github.com/ClickHouse/ClickHouse/pull/50810) ([Robert Schulze](https://github.com/rschu1ze)). +* Don't mark a part as broken on `Poco::TimeoutException` [#50811](https://github.com/ClickHouse/ClickHouse/pull/50811) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Rename azure_blob_storage to azureBlobStorage [#50812](https://github.com/ClickHouse/ClickHouse/pull/50812) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Fix ParallelReadBuffer seek [#50820](https://github.com/ClickHouse/ClickHouse/pull/50820) ([Michael Kolupaev](https://github.com/al13n321)). +* [RFC] Print git hash when crashing [#50823](https://github.com/ClickHouse/ClickHouse/pull/50823) ([Michael Kolupaev](https://github.com/al13n321)). +* Add tests for function "transform" [#50833](https://github.com/ClickHouse/ClickHouse/pull/50833) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Update version_date.tsv and changelogs after v23.5.2.7-stable [#50844](https://github.com/ClickHouse/ClickHouse/pull/50844) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Updated changelog with azureBlobStorage table function & engine entry [#50850](https://github.com/ClickHouse/ClickHouse/pull/50850) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). +* Update easy_tasks_sorted_ru.md [#50853](https://github.com/ClickHouse/ClickHouse/pull/50853) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Document x86 / ARM prerequisites for Docker image [#50867](https://github.com/ClickHouse/ClickHouse/pull/50867) ([Robert Schulze](https://github.com/rschu1ze)). +* MaterializedMySQL: Add test_named_collections [#50874](https://github.com/ClickHouse/ClickHouse/pull/50874) ([Val Doroshchuk](https://github.com/valbok)). +* Update version_date.tsv and changelogs after v22.8.18.31-lts [#50881](https://github.com/ClickHouse/ClickHouse/pull/50881) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Update version_date.tsv and changelogs after v23.3.3.52-lts [#50882](https://github.com/ClickHouse/ClickHouse/pull/50882) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Update version_date.tsv and changelogs after v23.4.3.48-stable [#50883](https://github.com/ClickHouse/ClickHouse/pull/50883) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* MaterializedMySQL: Add additional test case to insert_with_modify_binlog_checksum [#50884](https://github.com/ClickHouse/ClickHouse/pull/50884) ([Val Doroshchuk](https://github.com/valbok)). +* Update broken tests list [#50886](https://github.com/ClickHouse/ClickHouse/pull/50886) ([Dmitry Novik](https://github.com/novikd)). +* Fix LOGICAL_ERROR in snowflakeToDateTime*() [#50893](https://github.com/ClickHouse/ClickHouse/pull/50893) ([Robert Schulze](https://github.com/rschu1ze)). +* Tests with parallel replicas are no more "always green" [#50896](https://github.com/ClickHouse/ClickHouse/pull/50896) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Slightly more information in error message about cached disk [#50897](https://github.com/ClickHouse/ClickHouse/pull/50897) ([Michael Kolupaev](https://github.com/al13n321)). +* do not call finalize after exception [#50907](https://github.com/ClickHouse/ClickHouse/pull/50907) ([Sema Checherinda](https://github.com/CheSema)). +* Update Annoy docs [#50912](https://github.com/ClickHouse/ClickHouse/pull/50912) ([Robert Schulze](https://github.com/rschu1ze)). +* A bit safer UserDefinedSQLFunctionVisitor [#50913](https://github.com/ClickHouse/ClickHouse/pull/50913) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Update contribe/orc in .gitmodules [#50920](https://github.com/ClickHouse/ClickHouse/pull/50920) ([San](https://github.com/santrancisco)). +* MaterializedMySQL: Add missing DROP DATABASE for tests [#50924](https://github.com/ClickHouse/ClickHouse/pull/50924) ([Val Doroshchuk](https://github.com/valbok)). +* Fix 'Illegal column timezone' in stress tests [#50929](https://github.com/ClickHouse/ClickHouse/pull/50929) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Fix tests sanity checks and avoid dropping system.query_log table [#50934](https://github.com/ClickHouse/ClickHouse/pull/50934) ([Azat Khuzhin](https://github.com/azat)). +* Fix tests for throttling by allowing more margin of error for trottling event [#50935](https://github.com/ClickHouse/ClickHouse/pull/50935) ([Azat Khuzhin](https://github.com/azat)). +* 01746_convert_type_with_default: Temporarily disable flaky test [#50937](https://github.com/ClickHouse/ClickHouse/pull/50937) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix the statless tests image for old commits [#50947](https://github.com/ClickHouse/ClickHouse/pull/50947) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix logic in `AsynchronousBoundedReadBuffer::seek` [#50952](https://github.com/ClickHouse/ClickHouse/pull/50952) ([Nikita Taranov](https://github.com/nickitat)). +* Uncomment flaky test (01746_convert_type_with_default) [#50954](https://github.com/ClickHouse/ClickHouse/pull/50954) ([Dmitry Kardymon](https://github.com/kardymonds)). +* Fix keeper-client help message [#50965](https://github.com/ClickHouse/ClickHouse/pull/50965) ([pufit](https://github.com/pufit)). +* fix build issue on clang 15 [#50967](https://github.com/ClickHouse/ClickHouse/pull/50967) ([Chang chen](https://github.com/baibaichen)). +* Docs: Fix embedded video link [#50972](https://github.com/ClickHouse/ClickHouse/pull/50972) ([Robert Schulze](https://github.com/rschu1ze)). +* Change submodule capnproto to it's fork in ClickHouse [#50987](https://github.com/ClickHouse/ClickHouse/pull/50987) ([Kruglov Pavel](https://github.com/Avogar)). +* Attempt to make 01281_group_by_limit_memory_tracking not flaky [#50995](https://github.com/ClickHouse/ClickHouse/pull/50995) ([Dmitry Novik](https://github.com/novikd)). +* Fix flaky 02561_null_as_default_more_formats [#51001](https://github.com/ClickHouse/ClickHouse/pull/51001) ([Igor Nikonov](https://github.com/devcrafter)). +* Fix flaky test_seekable_formats [#51002](https://github.com/ClickHouse/ClickHouse/pull/51002) ([Kruglov Pavel](https://github.com/Avogar)). +* Follow-up to [#50448](https://github.com/ClickHouse/ClickHouse/issues/50448) [#51006](https://github.com/ClickHouse/ClickHouse/pull/51006) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Fix a versions' tweak for tagged commits, improve version_helper [#51035](https://github.com/ClickHouse/ClickHouse/pull/51035) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Sqlancer has changed master to main [#51060](https://github.com/ClickHouse/ClickHouse/pull/51060) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Do not spam sqlancer build log [#51061](https://github.com/ClickHouse/ClickHouse/pull/51061) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Refactor IColumn::forEachSubcolumn to make it slightly harder to implement incorrectly [#51072](https://github.com/ClickHouse/ClickHouse/pull/51072) ([Michael Kolupaev](https://github.com/al13n321)). +* MaterializedMySQL: Rename materialize_with_ddl.py -> materialized_with_ddl [#51074](https://github.com/ClickHouse/ClickHouse/pull/51074) ([Val Doroshchuk](https://github.com/valbok)). +* Improve woboq browser report [#51077](https://github.com/ClickHouse/ClickHouse/pull/51077) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix for part_names_mutex used after destruction [#51099](https://github.com/ClickHouse/ClickHouse/pull/51099) ([Alexander Gololobov](https://github.com/davenger)). +* Fix ColumnConst::forEachSubcolumn missing from previous PR [#51102](https://github.com/ClickHouse/ClickHouse/pull/51102) ([Michael Kolupaev](https://github.com/al13n321)). +* Fix the test 02783_parsedatetimebesteffort_syslog flakiness [#51112](https://github.com/ClickHouse/ClickHouse/pull/51112) ([Victor Krasnov](https://github.com/sirvickr)). +* Compatibility with clang-17 [#51114](https://github.com/ClickHouse/ClickHouse/pull/51114) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Make more parallel get requests to ZooKeeper in system.zookeeper [#51118](https://github.com/ClickHouse/ClickHouse/pull/51118) ([Alexander Gololobov](https://github.com/davenger)). +* Fix 02703_max_local_write_bandwidth flakiness [#51120](https://github.com/ClickHouse/ClickHouse/pull/51120) ([Azat Khuzhin](https://github.com/azat)). +* Update version_date.tsv and changelogs after v23.5.3.24-stable [#51121](https://github.com/ClickHouse/ClickHouse/pull/51121) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Update version_date.tsv and changelogs after v23.4.4.16-stable [#51122](https://github.com/ClickHouse/ClickHouse/pull/51122) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Update version_date.tsv and changelogs after v23.3.4.17-lts [#51123](https://github.com/ClickHouse/ClickHouse/pull/51123) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Update version_date.tsv and changelogs after v22.8.19.10-lts [#51124](https://github.com/ClickHouse/ClickHouse/pull/51124) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Fix typo [#51126](https://github.com/ClickHouse/ClickHouse/pull/51126) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Slightly better diagnostics [#51127](https://github.com/ClickHouse/ClickHouse/pull/51127) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Small fix in `MergeTreePrefetchedReadPool` [#51131](https://github.com/ClickHouse/ClickHouse/pull/51131) ([Nikita Taranov](https://github.com/nickitat)). +* Don't report table function accesses to system.errors [#51147](https://github.com/ClickHouse/ClickHouse/pull/51147) ([Raúl Marín](https://github.com/Algunenano)). +* Fix SQLancer branch name [#51148](https://github.com/ClickHouse/ClickHouse/pull/51148) ([Ilya Yatsishin](https://github.com/qoega)). +* Revert "Added ability to implicitly use file/hdfs/s3 table functions in clickhouse-local" [#51149](https://github.com/ClickHouse/ClickHouse/pull/51149) ([Alexander Tokmakov](https://github.com/tavplubix)). +* More profile events for fs cache [#51161](https://github.com/ClickHouse/ClickHouse/pull/51161) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Unforget to pass callback to readBigAt() in ParallelReadBuffer [#51165](https://github.com/ClickHouse/ClickHouse/pull/51165) ([Michael Kolupaev](https://github.com/al13n321)). +* Update README.md [#51179](https://github.com/ClickHouse/ClickHouse/pull/51179) ([Tyler Hannan](https://github.com/tylerhannan)). +* Update exception message [#51187](https://github.com/ClickHouse/ClickHouse/pull/51187) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Split long test 02149_schema_inference_formats_with_schema into several tests to avoid timeout in debug [#51197](https://github.com/ClickHouse/ClickHouse/pull/51197) ([Kruglov Pavel](https://github.com/Avogar)). +* Avoid initializing DateLUT from emptyArray function registration [#51199](https://github.com/ClickHouse/ClickHouse/pull/51199) ([Alexander Gololobov](https://github.com/davenger)). +* Suppress check for covered parts in ZooKeeper [#51207](https://github.com/ClickHouse/ClickHouse/pull/51207) ([Alexander Tokmakov](https://github.com/tavplubix)). +* One more profile event for fs cache [#51223](https://github.com/ClickHouse/ClickHouse/pull/51223) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Typo: passowrd_sha256_hex --> password_sha256_hex [#51233](https://github.com/ClickHouse/ClickHouse/pull/51233) ([Robert Schulze](https://github.com/rschu1ze)). +* Introduce settings enum field with auto-generated values list [#51237](https://github.com/ClickHouse/ClickHouse/pull/51237) ([Sergei Trifonov](https://github.com/serxa)). +* Drop session if we fail to get Keeper API version [#51238](https://github.com/ClickHouse/ClickHouse/pull/51238) ([Alexander Gololobov](https://github.com/davenger)). +* Revert "Fix a crash in s3 and s3Cluster functions" [#51239](https://github.com/ClickHouse/ClickHouse/pull/51239) ([Alexander Tokmakov](https://github.com/tavplubix)). +* fix flaky `AsyncLoader` destructor [#51245](https://github.com/ClickHouse/ClickHouse/pull/51245) ([Sergei Trifonov](https://github.com/serxa)). +* Docs: little cleanup of configuration-files.md [#51249](https://github.com/ClickHouse/ClickHouse/pull/51249) ([Robert Schulze](https://github.com/rschu1ze)). +* Fix a stupid bug on Replicated database recovery [#51252](https://github.com/ClickHouse/ClickHouse/pull/51252) ([Alexander Tokmakov](https://github.com/tavplubix)). +* FileCache: tryReserve() slight improvement [#51259](https://github.com/ClickHouse/ClickHouse/pull/51259) ([Igor Nikonov](https://github.com/devcrafter)). +* Ugly hotfix for "terminate on uncaught exception" in WriteBufferFromOStream [#51265](https://github.com/ClickHouse/ClickHouse/pull/51265) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Avoid too many calls to Poco::Logger::get [#51266](https://github.com/ClickHouse/ClickHouse/pull/51266) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Update version_date.tsv and changelogs after v23.3.5.9-lts [#51269](https://github.com/ClickHouse/ClickHouse/pull/51269) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Better reporting of broken parts [#51270](https://github.com/ClickHouse/ClickHouse/pull/51270) ([Anton Popov](https://github.com/CurtizJ)). +* Update ext-dict-functions.md [#51283](https://github.com/ClickHouse/ClickHouse/pull/51283) ([Mike Kot](https://github.com/myrrc)). +* Disable table structure check for secondary queries from Replicated db [#51284](https://github.com/ClickHouse/ClickHouse/pull/51284) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Define Thrift version for parquet and use correct arrow version [#51285](https://github.com/ClickHouse/ClickHouse/pull/51285) ([Kruglov Pavel](https://github.com/Avogar)). +* Restore Azure build on ARM [#51288](https://github.com/ClickHouse/ClickHouse/pull/51288) ([Robert Schulze](https://github.com/rschu1ze)). +* Query Cache: Un-comment settings in server cfg [#51294](https://github.com/ClickHouse/ClickHouse/pull/51294) ([Robert Schulze](https://github.com/rschu1ze)). +* Require more checks [#51295](https://github.com/ClickHouse/ClickHouse/pull/51295) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix metadata loading test [#51297](https://github.com/ClickHouse/ClickHouse/pull/51297) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Scratch the strange Python code [#51302](https://github.com/ClickHouse/ClickHouse/pull/51302) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add a test for [#47865](https://github.com/ClickHouse/ClickHouse/issues/47865) [#51306](https://github.com/ClickHouse/ClickHouse/pull/51306) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add a test for [#48894](https://github.com/ClickHouse/ClickHouse/issues/48894) [#51307](https://github.com/ClickHouse/ClickHouse/pull/51307) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add a test for [#48676](https://github.com/ClickHouse/ClickHouse/issues/48676) [#51308](https://github.com/ClickHouse/ClickHouse/pull/51308) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix long test `functions_bad_arguments` [#51310](https://github.com/ClickHouse/ClickHouse/pull/51310) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Unify merge predicate [#51344](https://github.com/ClickHouse/ClickHouse/pull/51344) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Fix using locks in ProcessList [#51348](https://github.com/ClickHouse/ClickHouse/pull/51348) ([Vitaly Baranov](https://github.com/vitlibar)). +* Add a test for [#42631](https://github.com/ClickHouse/ClickHouse/issues/42631) [#51353](https://github.com/ClickHouse/ClickHouse/pull/51353) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix performance tests due to warnings from jemalloc about Per-CPU arena disabled [#51362](https://github.com/ClickHouse/ClickHouse/pull/51362) ([Azat Khuzhin](https://github.com/azat)). +* Fix "merge_truncate_long" test [#51369](https://github.com/ClickHouse/ClickHouse/pull/51369) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Increase timeout of Fast Test [#51372](https://github.com/ClickHouse/ClickHouse/pull/51372) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix bad tests for DNS [#51374](https://github.com/ClickHouse/ClickHouse/pull/51374) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Attempt to fix the `relax_too_many_parts` test [#51375](https://github.com/ClickHouse/ClickHouse/pull/51375) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix MySQL test in Debug mode [#51376](https://github.com/ClickHouse/ClickHouse/pull/51376) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix bad test `01018_Distributed__shard_num` [#51377](https://github.com/ClickHouse/ClickHouse/pull/51377) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix "logical error" in addressToLineWithInlines [#51379](https://github.com/ClickHouse/ClickHouse/pull/51379) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix test 01280_ttl_where_group_by [#51380](https://github.com/ClickHouse/ClickHouse/pull/51380) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Attempt to fix `test_ssl_cert_authentication` [#51384](https://github.com/ClickHouse/ClickHouse/pull/51384) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Revert "Merge pull request [#50951](https://github.com/ClickHouse/ClickHouse/issues/50951) from ZhiguoZh/20230607-toyear-fix" [#51390](https://github.com/ClickHouse/ClickHouse/pull/51390) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Two tests are twice longer in average with Analyzer and sometimes failing [#51391](https://github.com/ClickHouse/ClickHouse/pull/51391) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix 00899_long_attach_memory_limit [#51395](https://github.com/ClickHouse/ClickHouse/pull/51395) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix test 01293_optimize_final_force [#51396](https://github.com/ClickHouse/ClickHouse/pull/51396) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix test 02481_parquet_list_monotonically_increasing_offsets [#51397](https://github.com/ClickHouse/ClickHouse/pull/51397) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix test 02497_trace_events_stress_long [#51398](https://github.com/ClickHouse/ClickHouse/pull/51398) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix broken labeling for `manual approve` [#51405](https://github.com/ClickHouse/ClickHouse/pull/51405) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix parts lifetime in `MergeTreeTransaction` [#51407](https://github.com/ClickHouse/ClickHouse/pull/51407) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Fix flaky test test_skip_empty_files [#51409](https://github.com/ClickHouse/ClickHouse/pull/51409) ([Kruglov Pavel](https://github.com/Avogar)). +* fix flacky test test_profile_events_s3 [#51412](https://github.com/ClickHouse/ClickHouse/pull/51412) ([Sema Checherinda](https://github.com/CheSema)). +* Update README.md [#51413](https://github.com/ClickHouse/ClickHouse/pull/51413) ([Tyler Hannan](https://github.com/tylerhannan)). +* Replace try/catch logic in hasTokenOrNull() by something more lightweight [#51425](https://github.com/ClickHouse/ClickHouse/pull/51425) ([Robert Schulze](https://github.com/rschu1ze)). +* Add retries to `tlsv1_3` tests [#51434](https://github.com/ClickHouse/ClickHouse/pull/51434) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* Update exception message [#51440](https://github.com/ClickHouse/ClickHouse/pull/51440) ([Kseniia Sumarokova](https://github.com/kssenii)). +* fs cache: add check for intersecting ranges [#51444](https://github.com/ClickHouse/ClickHouse/pull/51444) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Slightly better code around packets for parallel replicas [#51451](https://github.com/ClickHouse/ClickHouse/pull/51451) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Update system_warnings test [#51453](https://github.com/ClickHouse/ClickHouse/pull/51453) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Many fixes [#51455](https://github.com/ClickHouse/ClickHouse/pull/51455) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Fix test 01605_adaptive_granularity_block_borders [#51457](https://github.com/ClickHouse/ClickHouse/pull/51457) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Try fix flaky 02497_storage_file_reader_selection [#51468](https://github.com/ClickHouse/ClickHouse/pull/51468) ([Kruglov Pavel](https://github.com/Avogar)). +* Try making Keeper in `DatabaseReplicated` tests more stable [#51473](https://github.com/ClickHouse/ClickHouse/pull/51473) ([Antonio Andelic](https://github.com/antonio2368)). +* Convert 02003_memory_limit_in_client from expect to sh test (to fix flakiness) [#51475](https://github.com/ClickHouse/ClickHouse/pull/51475) ([Azat Khuzhin](https://github.com/azat)). +* Fix test_disk_over_web_server [#51476](https://github.com/ClickHouse/ClickHouse/pull/51476) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Delay shutdown of system and temporary databases [#51479](https://github.com/ClickHouse/ClickHouse/pull/51479) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix memory leakage in CompressionCodecDeflateQpl [#51480](https://github.com/ClickHouse/ClickHouse/pull/51480) ([Vitaly Baranov](https://github.com/vitlibar)). +* Increase retries in test_multiple_disks/test.py::test_start_stop_moves [#51482](https://github.com/ClickHouse/ClickHouse/pull/51482) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix race in BoundedReadBuffer [#51484](https://github.com/ClickHouse/ClickHouse/pull/51484) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix flaky unit test [#51485](https://github.com/ClickHouse/ClickHouse/pull/51485) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix flaky test `test_host_regexp_multiple_ptr_records` [#51506](https://github.com/ClickHouse/ClickHouse/pull/51506) ([Nikolay Degterinsky](https://github.com/evillique)). +* Add a comment [#51517](https://github.com/ClickHouse/ClickHouse/pull/51517) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Make `test_ssl_cert_authentication` similar to `test_tlvs1_3` [#51520](https://github.com/ClickHouse/ClickHouse/pull/51520) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)). +* Fix duplicate storage set logical error. [#51521](https://github.com/ClickHouse/ClickHouse/pull/51521) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Update test_storage_postgresql/test.py::test_concurrent_queries [#51523](https://github.com/ClickHouse/ClickHouse/pull/51523) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix FATAL: query context is not detached from thread group [#51540](https://github.com/ClickHouse/ClickHouse/pull/51540) ([Igor Nikonov](https://github.com/devcrafter)). +* Update version_date.tsv and changelogs after v23.3.6.7-lts [#51548](https://github.com/ClickHouse/ClickHouse/pull/51548) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Decoupled commits from [#51180](https://github.com/ClickHouse/ClickHouse/issues/51180) for backports [#51561](https://github.com/ClickHouse/ClickHouse/pull/51561) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Try to fix deadlock in ZooKeeper client [#51563](https://github.com/ClickHouse/ClickHouse/pull/51563) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Retry chroot creation in ZK before stateless tests [#51585](https://github.com/ClickHouse/ClickHouse/pull/51585) ([Antonio Andelic](https://github.com/antonio2368)). +* use timeout instead trap in 01443_merge_truncate_long.sh [#51593](https://github.com/ClickHouse/ClickHouse/pull/51593) ([Sema Checherinda](https://github.com/CheSema)). +* Update version_date.tsv and changelogs after v23.5.4.25-stable [#51604](https://github.com/ClickHouse/ClickHouse/pull/51604) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Fix MergeTreeMarksLoader segfaulting if marks file is longer than expected [#51636](https://github.com/ClickHouse/ClickHouse/pull/51636) ([Michael Kolupaev](https://github.com/al13n321)). +* Update version_date.tsv and changelogs after v23.4.5.22-stable [#51638](https://github.com/ClickHouse/ClickHouse/pull/51638) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Update version_date.tsv and changelogs after v23.3.7.5-lts [#51639](https://github.com/ClickHouse/ClickHouse/pull/51639) ([robot-clickhouse](https://github.com/robot-clickhouse)). +* Update parts.md [#51643](https://github.com/ClickHouse/ClickHouse/pull/51643) ([Ramazan Polat](https://github.com/ramazanpolat)). + diff --git a/docs/en/engines/table-engines/integrations/odbc.md b/docs/en/engines/table-engines/integrations/odbc.md index e29e56c10b2..71085feb626 100644 --- a/docs/en/engines/table-engines/integrations/odbc.md +++ b/docs/en/engines/table-engines/integrations/odbc.md @@ -54,7 +54,7 @@ $ sudo mysql ``` sql mysql> CREATE USER 'clickhouse'@'localhost' IDENTIFIED BY 'clickhouse'; -mysql> GRANT ALL PRIVILEGES ON *.* TO 'clickhouse'@'clickhouse' WITH GRANT OPTION; +mysql> GRANT ALL PRIVILEGES ON *.* TO 'clickhouse'@'localhost' WITH GRANT OPTION; ``` Then configure the connection in `/etc/odbc.ini`. @@ -66,7 +66,7 @@ DRIVER = /usr/local/lib/libmyodbc5w.so SERVER = 127.0.0.1 PORT = 3306 DATABASE = test -USERNAME = clickhouse +USER = clickhouse PASSWORD = clickhouse ``` @@ -83,6 +83,9 @@ $ isql -v mysqlconn Table in MySQL: ``` text +mysql> CREATE DATABASE test; +Query OK, 1 row affected (0,01 sec) + mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, @@ -91,10 +94,10 @@ mysql> CREATE TABLE `test`.`test` ( -> PRIMARY KEY (`int_id`)); Query OK, 0 rows affected (0,09 sec) -mysql> insert into test (`int_id`, `float`) VALUES (1,2); +mysql> insert into test.test (`int_id`, `float`) VALUES (1,2); Query OK, 1 row affected (0,00 sec) -mysql> select * from test; +mysql> select * from test.test; +------+----------+-----+----------+ | int_id | int_nullable | float | float_nullable | +------+----------+-----+----------+ diff --git a/docs/en/engines/table-engines/integrations/redis.md b/docs/en/engines/table-engines/integrations/redis.md index 568179eb690..8086a6503b8 100644 --- a/docs/en/engines/table-engines/integrations/redis.md +++ b/docs/en/engines/table-engines/integrations/redis.md @@ -44,11 +44,12 @@ Create a table in ClickHouse which allows to read data from Redis: ``` sql CREATE TABLE redis_table ( - `k` String, - `m` String, - `n` UInt32 + `key` String, + `v1` UInt32, + `v2` String, + `v3` Float32 ) -ENGINE = Redis('redis1:6379') PRIMARY KEY(k); +ENGINE = Redis('redis1:6379') PRIMARY KEY(key); ``` Insert: @@ -111,9 +112,16 @@ Flush Redis db asynchronously. Also `Truncate` support SYNC mode. TRUNCATE TABLE redis_table SYNC; ``` +Join: + +Join with other tables. + +``` +SELECT * FROM redis_table JOIN merge_tree_table ON merge_tree_table.key=redis_table.key; +``` ## Limitations {#limitations} Redis engine also supports scanning queries, such as `where k > xx`, but it has some limitations: -1. Scanning query may produce some duplicated keys in a very rare case when it is rehashing. See details in [Redis Scan](https://github.com/redis/redis/blob/e4d183afd33e0b2e6e8d1c79a832f678a04a7886/src/dict.c#L1186-L1269) +1. Scanning query may produce some duplicated keys in a very rare case when it is rehashing. See details in [Redis Scan](https://github.com/redis/redis/blob/e4d183afd33e0b2e6e8d1c79a832f678a04a7886/src/dict.c#L1186-L1269). 2. During the scanning, keys could be created and deleted, so the resulting dataset can not represent a valid point in time. diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 42454af6feb..67043ef1062 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -756,6 +756,17 @@ If you perform the `SELECT` query between merges, you may get expired data. To a - [ttl_only_drop_parts](/docs/en/operations/settings/settings.md/#ttl_only_drop_parts) setting +## Disk types + +In addition to local block devices, ClickHouse supports these storage types: +- [`s3` for S3 and MinIO](#table_engine-mergetree-s3) +- [`gcs` for GCS](/docs/en/integrations/data-ingestion/gcs/index.md/#creating-a-disk) +- [`blob_storage_disk` for Azure Blob Storage](#table_engine-mergetree-azure-blob-storage) +- [`hdfs` for HDFS](#hdfs-storage) +- [`web` for read-only from web](#web-storage) +- [`cache` for local caching](/docs/en/operations/storing-data.md/#using-local-cache) +- [`s3_plain` for backups to S3](/docs/en/operations/backup#backuprestore-using-an-s3-disk) + ## Using Multiple Block Devices for Data Storage {#table_engine-mergetree-multiple-volumes} ### Introduction {#introduction} @@ -936,7 +947,16 @@ configuration files; all the settings are in the CREATE/ATTACH query. The example uses `type=web`, but any disk type can be configured as dynamic, even Local disk. Local disks require a path argument to be inside the server config parameter `custom_local_disks_base_directory`, which has no default, so set that also when using local disk. ::: +#### Example dynamic web storage + +:::tip +A [demo dataset](https://github.com/ClickHouse/web-tables-demo) is hosted in GitHub. To prepare your own tables for web storage see the tool [clickhouse-static-files-uploader](/docs/en/operations/storing-data.md/#storing-data-on-webserver) +::: + +In this `ATTACH TABLE` query the `UUID` provided matches the directory name of the data, and the endpoint is the URL for the raw GitHub content. + ```sql +# highlight-next-line ATTACH TABLE uk_price_paid UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7' ( price UInt32, @@ -1238,6 +1258,93 @@ Examples of working configurations can be found in integration tests directory ( Zero-copy replication is disabled by default in ClickHouse version 22.8 and higher. This feature is not recommended for production use. ::: +## HDFS storage {#hdfs-storage} + +In this sample configuration: +- the disk is of type `hdfs` +- the data is hosted at `hdfs://hdfs1:9000/clickhouse/` + +```xml + + + + + hdfs + hdfs://hdfs1:9000/clickhouse/ + true + + + local + / + + + + + +
+ hdfs +
+ + hdd + +
+
+
+
+
+``` + +## Web storage (read-only) {#web-storage} + +Web storage can be used for read-only purposes. An example use is for hosting sample +data, or for migrating data. + +:::tip +Storage can also be configured temporarily within a query, if a web dataset is not expected +to be used routinely, see [dynamic storage](#dynamic-storage) and skip editing the +configuration file. +::: + +In this sample configuration: +- the disk is of type `web` +- the data is hosted at `http://nginx:80/test1/` +- a cache on local storage is used + +```xml + + + + + web + http://nginx:80/test1/ + + + cache + web + cached_web_cache/ + 100000000 + + + + + +
+ web +
+
+
+ + +
+ cached_web +
+
+
+
+
+
+``` + ## Virtual Columns {#virtual-columns} - `_part` — Name of a part. diff --git a/docs/en/getting-started/install.md b/docs/en/getting-started/install.md index d44dc861888..d2e7ab30478 100644 --- a/docs/en/getting-started/install.md +++ b/docs/en/getting-started/install.md @@ -378,6 +378,10 @@ request](https://github.com/ClickHouse/ClickHouse/commits/master) and find CI ch https://s3.amazonaws.com/clickhouse/builds/PRs/.../.../binary_aarch64_v80compat/clickhouse". You can then click the link to download the build. +### macOS-only: Install with Homebrew + +To install ClickHouse using the popular `brew` package manager, follow the instructions listed in the [ClickHouse Homebrew tap](https://github.com/ClickHouse/homebrew-clickhouse). + ## Launch {#launch} To start the server as a daemon, run: diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 40c1b8d64a1..bad7e388377 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -2120,7 +2120,13 @@ This section contains the following parameters: - `operation_timeout_ms` — Maximum timeout for one operation in milliseconds. - `root` — The [znode](http://zookeeper.apache.org/doc/r3.5.5/zookeeperOver.html#Nodes+and+ephemeral+nodes) that is used as the root for znodes used by the ClickHouse server. Optional. - `identity` — User and password, that can be required by ZooKeeper to give access to requested znodes. Optional. - +- zookeeper_load_balancing - Specifies the algorithm of ZooKeeper node selection. + * random - randomly selects one of ZooKeeper nodes. + * in_order - selects the first ZooKeeper node, if it's not available then the second, and so on. + * nearest_hostname - selects a ZooKeeper node with a hostname that is most similar to the server’s hostname. + * first_or_random - selects the first ZooKeeper node, if it's not available then randomly selects one of remaining ZooKeeper nodes. + * round_robin - selects the first ZooKeeper node, if reconnection happens selects the next. + **Example configuration** ``` xml @@ -2139,6 +2145,8 @@ This section contains the following parameters: /path/to/zookeeper/node user:password + + random ``` diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index cff65e049f3..cff13302cdc 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1322,7 +1322,7 @@ Connection pool size for PostgreSQL table engine and database engine. Default value: 16 -## postgresql_connection_pool_size {#postgresql-connection-pool-size} +## postgresql_connection_pool_wait_timeout {#postgresql-connection-pool-wait-timeout} Connection pool push/pop timeout on empty pool for PostgreSQL table engine and database engine. By default it will block on empty pool. diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index 5804ad8545b..fe6e8e15b0c 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -184,13 +184,15 @@ These settings should be defined in the disk configuration section. - `enable_filesystem_query_cache_limit` - allow to limit the size of cache which is downloaded within each query (depends on user setting `max_query_cache_size`). Default: `false`. -- `enable_cache_hits_threshold` - number which defines how many times some data needs to be read before it will be cached. Default: `0`, e.g. the data is cached at the first attempt to read it. +- `enable_cache_hits_threshold` - number which defines how many times some data needs to be read before it will be cached. Default: `false`. This threshold can be defined by `cache_hits_threshold`. Default: `0`, e.g. the data is cached at the first attempt to read it. + +- `enable_bypass_cache_with_threshold` - allows to skip cache completely in case the requested read range exceeds the threshold. Default: `false`. This threshold can be defined by `bypass_cache_threashold`. Default: `268435456` (`256Mi`). - `do_not_evict_index_and_mark_files` - do not evict small frequently used files according to cache policy. Default: `false`. This setting was added in version 22.8. If you used filesystem cache before this version, then it will not work on versions starting from 22.8 if this setting is set to `true`. If you want to use this setting, clear old cache created before version 22.8 before upgrading. -- `max_file_segment_size` - a maximum size of a single cache file in bytes or in readable format (`ki, Mi, Gi, etc`, example `10Gi`). Default: `104857600` (`100Mi`). +- `max_file_segment_size` - a maximum size of a single cache file in bytes or in readable format (`ki, Mi, Gi, etc`, example `10Gi`). Default: `8388608` (`8Mi`). -- `max_elements` - a limit for a number of cache files. Default: `1048576`. +- `max_elements` - a limit for a number of cache files. Default: `10000000`. File Cache **query/profile settings**: diff --git a/docs/en/operations/system-tables/index.md b/docs/en/operations/system-tables/index.md index 508419783ef..1b720098fc7 100644 --- a/docs/en/operations/system-tables/index.md +++ b/docs/en/operations/system-tables/index.md @@ -13,6 +13,7 @@ System tables provide information about: - Server states, processes, and environment. - Server’s internal processes. +- Options used when the ClickHouse binary was built. System tables: diff --git a/docs/en/operations/system-tables/parts.md b/docs/en/operations/system-tables/parts.md index 9159d1e9284..5829e5ad313 100644 --- a/docs/en/operations/system-tables/parts.md +++ b/docs/en/operations/system-tables/parts.md @@ -27,7 +27,7 @@ Columns: Data storing format is controlled by the `min_bytes_for_wide_part` and `min_rows_for_wide_part` settings of the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table. - - `active` ([UInt8](../../sql-reference/data-types/int-uint.md)) – Flag that indicates whether the data part is active. If a data part is active, it’s used in a table. Otherwise, it’s deleted. Inactive data parts remain after merging. +- `active` ([UInt8](../../sql-reference/data-types/int-uint.md)) – Flag that indicates whether the data part is active. If a data part is active, it’s used in a table. Otherwise, it’s deleted. Inactive data parts remain after merging. - `marks` ([UInt64](../../sql-reference/data-types/int-uint.md)) – The number of marks. To get the approximate number of rows in a data part, multiply `marks` by the index granularity (usually 8192) (this hint does not work for adaptive granularity). diff --git a/docs/en/sql-reference/aggregate-functions/combinators.md b/docs/en/sql-reference/aggregate-functions/combinators.md index fd693430064..a395b350a55 100644 --- a/docs/en/sql-reference/aggregate-functions/combinators.md +++ b/docs/en/sql-reference/aggregate-functions/combinators.md @@ -97,6 +97,10 @@ Result: If you apply this combinator, the aggregate function does not return the resulting value (such as the number of unique values for the [uniq](../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) function), but an intermediate state of the aggregation (for `uniq`, this is the hash table for calculating the number of unique values). This is an `AggregateFunction(...)` that can be used for further processing or stored in a table to finish aggregating later. +:::note +Please notice, that -MapState is not an invariant for the same data due to the fact that order of data in intermediate state can change, though it doesn't impact ingestion of this data. +::: + To work with these states, use: - [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) table engine. diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index bdd1445c990..7f2b8f3c605 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -230,13 +230,15 @@ hasAll(set, subset) **Arguments** - `set` – Array of any type with a set of elements. -- `subset` – Array of any type with elements that should be tested to be a subset of `set`. +- `subset` – Array of any type that shares a common supertype with `set` containing elements that should be tested to be a subset of `set`. **Return values** - `1`, if `set` contains all of the elements from `subset`. - `0`, otherwise. +Raises an exception `NO_COMMON_TYPE` if the set and subset elements do not share a common supertype. + **Peculiar properties** - An empty array is a subset of any array. @@ -253,7 +255,7 @@ hasAll(set, subset) `SELECT hasAll(['a', 'b'], ['a'])` returns 1. -`SELECT hasAll([1], ['a'])` returns 0. +`SELECT hasAll([1], ['a'])` raises a `NO_COMMON_TYPE` exception. `SELECT hasAll([[1, 2], [3, 4]], [[1, 2], [3, 5]])` returns 0. @@ -268,13 +270,15 @@ hasAny(array1, array2) **Arguments** - `array1` – Array of any type with a set of elements. -- `array2` – Array of any type with a set of elements. +- `array2` – Array of any type that shares a common supertype with `array1`. **Return values** - `1`, if `array1` and `array2` have one similar element at least. - `0`, otherwise. +Raises an exception `NO_COMMON_TYPE` if the array1 and array2 elements do not share a common supertype. + **Peculiar properties** - `Null` processed as a value. @@ -288,7 +292,7 @@ hasAny(array1, array2) `SELECT hasAny([-128, 1., 512], [1])` returns `1`. -`SELECT hasAny([[1, 2], [3, 4]], ['a', 'c'])` returns `0`. +`SELECT hasAny([[1, 2], [3, 4]], ['a', 'c'])` raises a `NO_COMMON_TYPE` exception. `SELECT hasAll([[1, 2], [3, 4]], [[1, 2], [1, 2]])` returns `1`. @@ -318,6 +322,8 @@ For Example: - `1`, if `array1` contains `array2`. - `0`, otherwise. +Raises an exception `NO_COMMON_TYPE` if the array1 and array2 elements do not share a common supertype. + **Peculiar properties** - The function will return `1` if `array2` is empty. @@ -339,6 +345,9 @@ For Example: `SELECT hasSubstr(['a', 'b' , 'c'], ['a', 'c'])` returns 0. `SELECT hasSubstr([[1, 2], [3, 4], [5, 6]], [[1, 2], [3, 4]])` returns 1. +i +`SELECT hasSubstr([1, 2, NULL, 3, 4], ['a'])` raises a `NO_COMMON_TYPE` exception. + ## indexOf(arr, x) diff --git a/docs/en/sql-reference/functions/functions-for-nulls.md b/docs/en/sql-reference/functions/functions-for-nulls.md index 6f82fedaab7..d57b799e94c 100644 --- a/docs/en/sql-reference/functions/functions-for-nulls.md +++ b/docs/en/sql-reference/functions/functions-for-nulls.md @@ -8,7 +8,7 @@ sidebar_label: Nullable ## isNull -Returns whether the argument is [NULL](../../sql-reference/syntax.md#null-literal). +Returns whether the argument is [NULL](../../sql-reference/syntax.md#null). ``` sql isNull(x) diff --git a/docs/en/sql-reference/functions/tuple-functions.md b/docs/en/sql-reference/functions/tuple-functions.md index 1739920c9f0..7ed2deaeda6 100644 --- a/docs/en/sql-reference/functions/tuple-functions.md +++ b/docs/en/sql-reference/functions/tuple-functions.md @@ -22,14 +22,15 @@ tuple(x, y, …) A function that allows getting a column from a tuple. -If the second argument is a number `n`, it is the column index, starting from 1. If the second argument is a string `s`, it represents the name of the element. Besides, we can provide the third optional argument, such that when index out of bounds or element for such name does not exist, the default value returned instead of throw exception. The second and third arguments if provided are always must be constant. There is no cost to execute the function. +If the second argument is a number `index`, it is the column index, starting from 1. If the second argument is a string `name`, it represents the name of the element. Besides, we can provide the third optional argument, such that when index out of bounds or no element exist for the name, the default value returned instead of throwing an exception. The second and third arguments, if provided, must be constants. There is no cost to execute the function. -The function implements the operator `x.n` and `x.s`. +The function implements operators `x.index` and `x.name`. **Syntax** ``` sql -tupleElement(tuple, n/s [, default_value]) +tupleElement(tuple, index, [, default_value]) +tupleElement(tuple, name, [, default_value]) ``` ## untuple diff --git a/docs/en/sql-reference/statements/show.md b/docs/en/sql-reference/statements/show.md index f96eb55aa45..336b93db9d5 100644 --- a/docs/en/sql-reference/statements/show.md +++ b/docs/en/sql-reference/statements/show.md @@ -283,7 +283,7 @@ The optional keyword `EXTENDED` currently has no effect, it only exists for MySQ `SHOW INDEX` produces a result table with the following structure: - table - The name of the table (String) -- non_unique - 0 if the index can contain duplicates, 1 otherwise (UInt8) +- non_unique - 0 if the index cannot contain duplicates, 1 otherwise (UInt8) - key_name - The name of the index, `PRIMARY` if the index is a primary key index (String) - seq_in_index - Currently unused - column_name - Currently unused diff --git a/docs/ru/sql-reference/aggregate-functions/combinators.md b/docs/ru/sql-reference/aggregate-functions/combinators.md index 3a7ff571f99..99d5f11442c 100644 --- a/docs/ru/sql-reference/aggregate-functions/combinators.md +++ b/docs/ru/sql-reference/aggregate-functions/combinators.md @@ -66,6 +66,10 @@ WITH anySimpleState(number) AS c SELECT toTypeName(c), c FROM numbers(1); В случае применения этого комбинатора, агрегатная функция возвращает не готовое значение (например, в случае функции [uniq](reference/uniq.md#agg_function-uniq) — количество уникальных значений), а промежуточное состояние агрегации (например, в случае функции `uniq` — хэш-таблицу для расчёта количества уникальных значений), которое имеет тип `AggregateFunction(...)` и может использоваться для дальнейшей обработки или может быть сохранено в таблицу для последующей доагрегации. +:::note +Промежуточное состояние для -MapState не является инвариантом для одних и тех же исходных данные т.к. порядок данных может меняться. Это не влияет, тем не менее, на загрузку таких данных. +::: + Для работы с промежуточными состояниями предназначены: - Движок таблиц [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md). diff --git a/docs/zh/sql-reference/functions/functions-for-nulls.md b/docs/zh/sql-reference/functions/functions-for-nulls.md index 4dd30970923..b3dca3ac549 100644 --- a/docs/zh/sql-reference/functions/functions-for-nulls.md +++ b/docs/zh/sql-reference/functions/functions-for-nulls.md @@ -192,7 +192,7 @@ SELECT coalesce(mail, phone, CAST(icq,'Nullable(String)')) FROM aBook **返回值** - 如果`x`不为`NULL`,返回非`Nullable`类型的原始值。 -- 如果`x`为`NULL`,返回对应非`Nullable`类型的默认值。 +- 如果`x`为`NULL`,则返回任意值。 **示例** diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 14516bfa939..19b601b9a7b 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -790,7 +790,7 @@ bool Client::processWithFuzzing(const String & full_query) WriteBufferFromOStream cerr_buf(std::cerr, 4096); fuzz_base->dumpTree(cerr_buf); - cerr_buf.next(); + cerr_buf.finalize(); fmt::print( stderr, @@ -928,7 +928,7 @@ bool Client::processWithFuzzing(const String & full_query) std::cout << std::endl; WriteBufferFromOStream ast_buf(std::cout, 4096); formatAST(*query, ast_buf, false /*highlight*/); - ast_buf.next(); + ast_buf.finalize(); if (const auto * insert = query->as()) { /// For inserts with data it's really useful to have the data itself available in the logs, as formatAST doesn't print it diff --git a/programs/format/Format.cpp b/programs/format/Format.cpp index 392ecefac0f..43c66a32302 100644 --- a/programs/format/Format.cpp +++ b/programs/format/Format.cpp @@ -151,6 +151,7 @@ int mainEntryClickHouseFormat(int argc, char ** argv) WriteBufferFromFileDescriptor out(STDOUT_FILENO); obfuscateQueries(query, out, obfuscated_words_map, used_nouns, hash_func, is_known_identifier); + out.finalize(); } else { @@ -175,7 +176,7 @@ int mainEntryClickHouseFormat(int argc, char ** argv) { WriteBufferFromOStream res_buf(std::cout, 4096); formatAST(*res, res_buf, hilite, oneline); - res_buf.next(); + res_buf.finalize(); if (multiple) std::cout << "\n;\n"; std::cout << std::endl; @@ -199,7 +200,7 @@ int mainEntryClickHouseFormat(int argc, char ** argv) res_cout.write(*s_pos++); } - res_cout.next(); + res_cout.finalize(); if (multiple) std::cout << " \\\n;\n"; std::cout << std::endl; diff --git a/programs/keeper-converter/KeeperConverter.cpp b/programs/keeper-converter/KeeperConverter.cpp index 7d25c1d5017..a049e6bc2b3 100644 --- a/programs/keeper-converter/KeeperConverter.cpp +++ b/programs/keeper-converter/KeeperConverter.cpp @@ -9,6 +9,7 @@ #include #include #include +#include int mainEntryClickHouseKeeperConverter(int argc, char ** argv) @@ -39,8 +40,9 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv) try { - auto keeper_context = std::make_shared(); - keeper_context->digest_enabled = true; + auto keeper_context = std::make_shared(true); + keeper_context->setDigestEnabled(true); + keeper_context->setSnapshotDisk(std::make_shared("Keeper-snapshots", options["output-dir"].as(), 0)); DB::KeeperStorage storage(/* tick_time_ms */ 500, /* superdigest */ "", keeper_context, /* initialize_system_nodes */ false); @@ -51,10 +53,10 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv) DB::SnapshotMetadataPtr snapshot_meta = std::make_shared(storage.getZXID(), 1, std::make_shared()); DB::KeeperStorageSnapshot snapshot(&storage, snapshot_meta); - DB::KeeperSnapshotManager manager(options["output-dir"].as(), 1, keeper_context); + DB::KeeperSnapshotManager manager(1, keeper_context); auto snp = manager.serializeSnapshotToBuffer(snapshot); - auto path = manager.serializeSnapshotBufferToDisk(*snp, storage.getZXID()); - std::cout << "Snapshot serialized to path:" << path << std::endl; + auto file_info = manager.serializeSnapshotBufferToDisk(*snp, storage.getZXID()); + std::cout << "Snapshot serialized to path:" << fs::path(file_info.disk->getPath()) / file_info.path << std::endl; } catch (...) { diff --git a/programs/keeper/CMakeLists.txt b/programs/keeper/CMakeLists.txt index 8bde1db019c..18bdc8f317c 100644 --- a/programs/keeper/CMakeLists.txt +++ b/programs/keeper/CMakeLists.txt @@ -43,13 +43,15 @@ if (BUILD_STANDALONE_KEEPER) ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperDispatcher.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperLogStore.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperServer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperContext.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperFeatureFlags.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperSnapshotManager.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperSnapshotManagerS3.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperStateMachine.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperContext.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperStateManager.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperStorage.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperAsynchronousMetrics.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/TinyContext.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/pathUtils.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/SessionExpiryQueue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/SummingStateMachine.cpp @@ -58,10 +60,14 @@ if (BUILD_STANDALONE_KEEPER) ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Core/SettingsFields.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Core/BaseSettings.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Core/ServerSettings.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Core/Field.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Core/SettingsEnums.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Core/ServerUUID.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Core/UUID.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Core/BackgroundSchedulePool.cpp + + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/IO/ReadBuffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/KeeperTCPHandler.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/TCPServer.cpp @@ -93,6 +99,10 @@ if (BUILD_STANDALONE_KEEPER) ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/ICompressionCodec.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/LZ4_decompress_faster.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Common/CurrentThread.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Common/NamedCollections/NamedCollections.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Common/NamedCollections/NamedCollectionConfiguration.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Common/ZooKeeper/IKeeper.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Common/ZooKeeper/TestKeeper.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Common/ZooKeeper/ZooKeeperCommon.cpp @@ -103,11 +113,58 @@ if (BUILD_STANDALONE_KEEPER) ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Common/ZooKeeper/ZooKeeperLock.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Common/ZooKeeper/ZooKeeperNodeCache.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/registerDisks.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IDisk.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/DiskFactory.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/DiskSelector.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/DiskLocal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/DiskLocalCheckThread.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/LocalDirectorySyncGuard.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/TemporaryFileOnDisk.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/loadLocalDiskConfig.cpp + + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/IObjectStorage.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/MetadataFromDiskTransactionState.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/DiskObjectStorage.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/DiskObjectStorageCommon.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/ObjectStorageIterator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/StoredObject.cpp + + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/S3/registerDiskS3.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/S3/S3Capabilities.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/S3/diskSettings.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/S3/ProxyListConfiguration.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/S3/ProxyResolverConfiguration.cpp + + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/createReadBufferFromFileBase.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/ReadBufferFromRemoteFSGather.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/IOUringReader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/WriteBufferFromTemporaryFile.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/WriteBufferWithFinalizeCallback.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/AsynchronousBoundedReadBuffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/getThreadPoolReader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/ThreadPoolRemoteFSReader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/ThreadPoolReader.cpp + + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Storages/StorageS3Settings.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Daemon/BaseDaemon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Daemon/SentryWriter.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Daemon/GraphiteWriter.cpp ${CMAKE_CURRENT_BINARY_DIR}/../../src/Daemon/GitHash.generated.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/Standalone/Context.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/Standalone/Settings.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/Standalone/ThreadStatusExt.cpp + Keeper.cpp clickhouse-keeper.cpp ) @@ -130,10 +187,6 @@ if (BUILD_STANDALONE_KEEPER) target_compile_definitions (clickhouse-keeper PRIVATE -DCLICKHOUSE_PROGRAM_STANDALONE_BUILD) target_compile_definitions (clickhouse-keeper PUBLIC -DWITHOUT_TEXT_LOG) - target_include_directories(clickhouse-keeper PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/../../src") # uses includes from src directory - target_include_directories(clickhouse-keeper PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/../../src/Core/include") # uses some includes from core - target_include_directories(clickhouse-keeper PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/../../src") # uses some includes from common - if (ENABLE_CLICKHOUSE_KEEPER_CLIENT AND TARGET ch_rust::skim) target_link_libraries(clickhouse-keeper PRIVATE ch_rust::skim) endif() diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index a1825665188..c0574853251 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -24,6 +24,8 @@ #include #include +#include + #include #include @@ -45,6 +47,8 @@ #include #include +#include + int mainEntryClickHouseKeeper(int argc, char ** argv) { @@ -201,9 +205,12 @@ void Keeper::defineOptions(Poco::Util::OptionSet & options) BaseDaemon::defineOptions(options); } -struct Keeper::KeeperHTTPContext : public IHTTPContext +namespace { - explicit KeeperHTTPContext(TinyContextPtr context_) + +struct KeeperHTTPContext : public IHTTPContext +{ + explicit KeeperHTTPContext(ContextPtr context_) : context(std::move(context_)) {} @@ -247,12 +254,14 @@ struct Keeper::KeeperHTTPContext : public IHTTPContext return {context->getConfigRef().getInt64("keeper_server.http_send_timeout", DBMS_DEFAULT_SEND_TIMEOUT_SEC), 0}; } - TinyContextPtr context; + ContextPtr context; }; -HTTPContextPtr Keeper::httpContext() +HTTPContextPtr httpContext() { - return std::make_shared(tiny_context); + return std::make_shared(Context::getGlobalContextInstance()); +} + } int Keeper::main(const std::vector & /*args*/) @@ -316,10 +325,21 @@ try std::mutex servers_lock; auto servers = std::make_shared>(); - tiny_context = std::make_shared(); + auto shared_context = Context::createShared(); + auto global_context = Context::createGlobal(shared_context.get()); + + global_context->makeGlobalContext(); + global_context->setPath(path); + global_context->setRemoteHostFilter(config()); + + if (config().has("macros")) + global_context->setMacros(std::make_unique(config(), "macros", log)); + + registerDisks(/*global_skip_access_check=*/false); + /// This object will periodically calculate some metrics. KeeperAsynchronousMetrics async_metrics( - tiny_context, + global_context, config().getUInt("asynchronous_metrics_update_period_s", 1), [&]() -> std::vector { @@ -344,12 +364,12 @@ try } /// Initialize keeper RAFT. Do nothing if no keeper_server in config. - tiny_context->initializeKeeperDispatcher(/* start_async = */ true); - FourLetterCommandFactory::registerCommands(*tiny_context->getKeeperDispatcher()); + global_context->initializeKeeperDispatcher(/* start_async = */ true); + FourLetterCommandFactory::registerCommands(*global_context->getKeeperDispatcher()); - auto config_getter = [this] () -> const Poco::Util::AbstractConfiguration & + auto config_getter = [&] () -> const Poco::Util::AbstractConfiguration & { - return tiny_context->getConfigRef(); + return global_context->getConfigRef(); }; auto tcp_receive_timeout = config().getInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC); @@ -371,7 +391,7 @@ try "Keeper (tcp): " + address.toString(), std::make_unique( new KeeperTCPHandlerFactory( - config_getter, tiny_context->getKeeperDispatcher(), + config_getter, global_context->getKeeperDispatcher(), tcp_receive_timeout, tcp_send_timeout, false), server_pool, socket)); }); @@ -389,7 +409,7 @@ try "Keeper with secure protocol (tcp_secure): " + address.toString(), std::make_unique( new KeeperTCPHandlerFactory( - config_getter, tiny_context->getKeeperDispatcher(), + config_getter, global_context->getKeeperDispatcher(), tcp_receive_timeout, tcp_send_timeout, true), server_pool, socket)); #else UNUSED(port); @@ -441,7 +461,7 @@ try [&](ConfigurationPtr config, bool /* initial_loading */) { if (config->has("keeper_server")) - tiny_context->updateKeeperConfiguration(*config); + global_context->updateKeeperConfiguration(*config); }, /* already_loaded = */ false); /// Reload it right now (initial loading) @@ -472,7 +492,7 @@ try else LOG_INFO(log, "Closed connections to Keeper."); - tiny_context->shutdownKeeperDispatcher(); + global_context->shutdownKeeperDispatcher(); /// Wait server pool to avoid use-after-free of destroyed context in the handlers server_pool.joinAll(); diff --git a/programs/keeper/Keeper.h b/programs/keeper/Keeper.h index 8a7724acb85..f889ffa595b 100644 --- a/programs/keeper/Keeper.h +++ b/programs/keeper/Keeper.h @@ -1,9 +1,7 @@ #pragma once #include -#include #include -#include namespace Poco { @@ -68,11 +66,6 @@ protected: std::string getDefaultConfigFileName() const override; private: - TinyContextPtr tiny_context; - - struct KeeperHTTPContext; - HTTPContextPtr httpContext(); - Poco::Net::SocketAddress socketBindListen(Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port, [[maybe_unused]] bool secure = false) const; using CreateServerFunc = std::function; diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index caca7cfb50d..2afcd48dafb 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -8,7 +8,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -50,6 +52,8 @@ #include #include +#include "config.h" + #if defined(FUZZING_MODE) #include #endif @@ -170,6 +174,13 @@ static DatabasePtr createMemoryDatabaseIfNotExists(ContextPtr context, const Str return system_database; } +static DatabasePtr createClickHouseLocalDatabaseOverlay(const String & name_, ContextPtr context_) +{ + auto databaseCombiner = std::make_shared(name_, context_); + databaseCombiner->registerNextDatabase(std::make_shared(name_, "", context_)); + databaseCombiner->registerNextDatabase(std::make_shared(name_, context_)); + return databaseCombiner; +} /// If path is specified and not empty, will try to setup server environment and load existing metadata void LocalServer::tryInitPath() @@ -669,7 +680,7 @@ void LocalServer::processConfig() * if such tables will not be dropped, clickhouse-server will not be able to load them due to security reasons. */ std::string default_database = config().getString("default_database", "_local"); - DatabaseCatalog::instance().attachDatabase(default_database, std::make_shared(default_database, global_context)); + DatabaseCatalog::instance().attachDatabase(default_database, createClickHouseLocalDatabaseOverlay(default_database, global_context)); global_context->setCurrentDatabase(default_database); applyCmdOptions(global_context); diff --git a/programs/server/dashboard.html b/programs/server/dashboard.html index 951b7db3aa3..ea818e05e31 100644 --- a/programs/server/dashboard.html +++ b/programs/server/dashboard.html @@ -12,7 +12,8 @@ --chart-background: white; --shadow-color: rgba(0, 0, 0, 0.25); --input-shadow-color: rgba(0, 255, 0, 1); - --error-color: white; + --error-color: red; + --auth-error-color: white; --legend-background: rgba(255, 255, 255, 0.75); --title-color: #666; --text-color: black; @@ -258,7 +259,7 @@ width: 60%; padding: .5rem; - color: var(--error-color); + color: var(--auth-error-color); display: flex; flex-flow: row nowrap; @@ -906,9 +907,9 @@ async function draw(idx, chart, url_params, query) { if (error) { const errorMatch = errorMessages.find(({ regex }) => error.match(regex)) - if (errorMatch) { - const match = error.match(errorMatch.regex) - const message = errorMatch.messageFunc(match) + const match = error.match(errorMatch.regex) + const message = errorMatch.messageFunc(match) + if (message) { const authError = new Error(message) throw authError } @@ -930,7 +931,7 @@ async function draw(idx, chart, url_params, query) { let title_div = chart.querySelector('.title'); if (error) { error_div.firstChild.data = error; - title_div.style.display = 'none'; + title_div.style.display = 'none'; error_div.style.display = 'block'; return false; } else { @@ -1019,13 +1020,15 @@ async function drawAll() { firstLoad = false; } else { enableReloadButton(); + enableRunButton(); } - if (!results.includes(false)) { + if (results.includes(true)) { const element = document.querySelector('.inputs'); element.classList.remove('unconnected'); const add = document.querySelector('#add'); add.style.display = 'block'; - } else { + } + else { const charts = document.querySelector('#charts') charts.style.height = '0px'; } @@ -1050,6 +1053,13 @@ function disableReloadButton() { reloadButton.classList.add('disabled') } +function disableRunButton() { + const runButton = document.getElementById('run') + runButton.value = 'Reloading...' + runButton.disabled = true + runButton.classList.add('disabled') +} + function enableReloadButton() { const reloadButton = document.getElementById('reload') reloadButton.value = 'Reload' @@ -1057,11 +1067,19 @@ function enableReloadButton() { reloadButton.classList.remove('disabled') } +function enableRunButton() { + const runButton = document.getElementById('run') + runButton.value = 'Ok' + runButton.disabled = false + runButton.classList.remove('disabled') +} + function reloadAll() { updateParams(); drawAll(); saveState(); - disableReloadButton() + disableReloadButton(); + disableRunButton(); } document.getElementById('params').onsubmit = function(event) { diff --git a/rust/skim/Cargo.lock b/rust/skim/Cargo.lock index badd22dad07..9f948ee1c38 100644 --- a/rust/skim/Cargo.lock +++ b/rust/skim/Cargo.lock @@ -14,13 +14,19 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "0.7.20" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" +checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" dependencies = [ "memchr", ] +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -32,9 +38,9 @@ dependencies = [ [[package]] name = "arrayvec" -version = "0.7.2" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" +checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "atty" @@ -42,7 +48,7 @@ version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" dependencies = [ - "hermit-abi", + "hermit-abi 0.1.19", "libc", "winapi", ] @@ -67,15 +73,15 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bumpalo" -version = "3.11.1" +version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba" +checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" [[package]] name = "cc" -version = "1.0.77" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9f73505338f7d905b19d18738976aae232eb46b8efc15554ffc56deb5d9ebe4" +checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" [[package]] name = "cfg-if" @@ -85,13 +91,13 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.23" +version = "0.4.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f" +checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5" dependencies = [ + "android-tzdata", "iana-time-zone", "js-sys", - "num-integer", "num-traits", "time 0.1.45", "wasm-bindgen", @@ -100,9 +106,9 @@ dependencies = [ [[package]] name = "clap" -version = "3.2.23" +version = "3.2.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71655c45cb9845d3270c9d6df84ebe72b4dad3c2ba3f7023ad47c144e4e473a5" +checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123" dependencies = [ "atty", "bitflags", @@ -135,9 +141,9 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" +checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" [[package]] name = "crossbeam" @@ -155,9 +161,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.6" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" +checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" dependencies = [ "cfg-if", "crossbeam-utils", @@ -165,9 +171,9 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" dependencies = [ "cfg-if", "crossbeam-epoch", @@ -176,14 +182,14 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.13" +version = "0.9.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01a9af1f4c2ef74bb8aa1f7e19706bc72d03598c8a570bb5de72243c7a9d9d5a" +checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", - "memoffset 0.7.1", + "memoffset 0.9.0", "scopeguard", ] @@ -199,18 +205,18 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.14" +version = "0.8.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f" +checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" dependencies = [ "cfg-if", ] [[package]] name = "cxx" -version = "1.0.83" +version = "1.0.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdf07d07d6531bfcdbe9b8b739b104610c6508dcc4d63b410585faf338241daf" +checksum = "e88abab2f5abbe4c56e8f1fb431b784d710b709888f35755a160e62e33fe38e8" dependencies = [ "cc", "cxxbridge-flags", @@ -220,9 +226,9 @@ dependencies = [ [[package]] name = "cxx-build" -version = "1.0.83" +version = "1.0.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2eb5b96ecdc99f72657332953d4d9c50135af1bac34277801cc3937906ebd39" +checksum = "5c0c11acd0e63bae27dcd2afced407063312771212b7a823b4fd72d633be30fb" dependencies = [ "cc", "codespan-reporting", @@ -230,31 +236,31 @@ dependencies = [ "proc-macro2", "quote", "scratch", - "syn", + "syn 2.0.23", ] [[package]] name = "cxxbridge-flags" -version = "1.0.83" +version = "1.0.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac040a39517fd1674e0f32177648334b0f4074625b5588a64519804ba0553b12" +checksum = "8d3816ed957c008ccd4728485511e3d9aaf7db419aa321e3d2c5a2f3411e36c8" [[package]] name = "cxxbridge-macro" -version = "1.0.83" +version = "1.0.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1362b0ddcfc4eb0a1f57b68bd77dd99f0e826958a96abd0ae9bd092e114ffed6" +checksum = "a26acccf6f445af85ea056362561a24ef56cdc15fcc685f03aec50b9c702cb6d" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.23", ] [[package]] name = "darling" -version = "0.14.2" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0dd3cd20dc6b5a876612a6e5accfe7f3dd883db6d07acfbf14c128f61550dfa" +checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" dependencies = [ "darling_core", "darling_macro", @@ -262,27 +268,27 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.14.2" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a784d2ccaf7c98501746bf0be29b2022ba41fd62a2e622af997a03e9f972859f" +checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" dependencies = [ "fnv", "ident_case", "proc-macro2", "quote", "strsim", - "syn", + "syn 1.0.109", ] [[package]] name = "darling_macro" -version = "0.14.2" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7618812407e9402654622dd402b0a89dff9ba93badd6540781526117b92aab7e" +checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" dependencies = [ "darling_core", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -313,7 +319,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -323,7 +329,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f0314b72bed045f3a68671b3c86328386762c93f82d98c65c3cb5e5f573dd68" dependencies = [ "derive_builder_core", - "syn", + "syn 1.0.109", ] [[package]] @@ -349,9 +355,9 @@ dependencies = [ [[package]] name = "either" -version = "1.8.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797" +checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" [[package]] name = "env_logger" @@ -383,9 +389,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.8" +version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" dependencies = [ "cfg-if", "libc", @@ -407,6 +413,12 @@ dependencies = [ "libc", ] +[[package]] +name = "hermit-abi" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" + [[package]] name = "humantime" version = "2.1.0" @@ -415,26 +427,25 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "iana-time-zone" -version = "0.1.53" +version = "0.1.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765" +checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" dependencies = [ "android_system_properties", "core-foundation-sys", "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "winapi", + "windows", ] [[package]] name = "iana-time-zone-haiku" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" dependencies = [ - "cxx", - "cxx-build", + "cc", ] [[package]] @@ -445,9 +456,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "indexmap" -version = "1.9.2" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885e79c1fc4b10f0e172c475f458b7f7b93061064d98c3293e98c5ba0c8b399" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" dependencies = [ "autocfg", "hashbrown", @@ -455,9 +466,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.60" +version = "0.3.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49409df3e3bf0856b916e2ceaca09ee28e6871cf7d9ce97a692cacfdb2a25a47" +checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" dependencies = [ "wasm-bindgen", ] @@ -470,27 +481,24 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.138" +version = "0.2.147" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db6d7e329c562c5dfab7a46a2afabc8b987ab9a4834c9d1ca04dc54c1546cef8" +checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" [[package]] name = "link-cplusplus" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9272ab7b96c9046fbc5bc56c06c117cb639fe2d509df0c421cad82d2915cf369" +checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5" dependencies = [ "cc", ] [[package]] name = "log" -version = "0.4.17" +version = "0.4.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" -dependencies = [ - "cfg-if", -] +checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" [[package]] name = "memchr" @@ -509,9 +517,9 @@ dependencies = [ [[package]] name = "memoffset" -version = "0.7.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" dependencies = [ "autocfg", ] @@ -541,16 +549,6 @@ dependencies = [ "pin-utils", ] -[[package]] -name = "num-integer" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" -dependencies = [ - "autocfg", - "num-traits", -] - [[package]] name = "num-traits" version = "0.2.15" @@ -562,25 +560,25 @@ dependencies = [ [[package]] name = "num_cpus" -version = "1.14.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6058e64324c71e02bc2b150e4f3bc8286db6c83092132ffa3f6b1eab0f9def5" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi", + "hermit-abi 0.3.1", "libc", ] [[package]] name = "once_cell" -version = "1.16.0" +version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86f0b0d4bf799edbc74508c1e8bf170ff5f41238e5f8225603ca7caaae2b7860" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "os_str_bytes" -version = "6.4.1" +version = "6.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee" +checksum = "4d5d9eb14b174ee9aa2ef96dc2b94637a2d4b6e7cb873c7e171f0c20c6cf3eac" [[package]] name = "pin-utils" @@ -590,27 +588,27 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "proc-macro2" -version = "1.0.47" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725" +checksum = "7b368fba921b0dce7e60f5e04ec15e565b3303972b42bcfde1d0713b881959eb" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.21" +version = "1.0.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" +checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105" dependencies = [ "proc-macro2", ] [[package]] name = "rayon" -version = "1.6.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db3a213adf02b3bcfd2d3846bb41cb22857d131789e01df434fb7e7bc0759b7" +checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" dependencies = [ "either", "rayon-core", @@ -618,9 +616,9 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.10.1" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cac410af5d00ab6884528b4ab69d1e8e146e8d471201800fa1b4524126de6ad3" +checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" dependencies = [ "crossbeam-channel", "crossbeam-deque", @@ -650,9 +648,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.7.0" +version = "1.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e076559ef8e241f2ae3479e36f97bd5741c0330689e217ad51ce2c76808b868a" +checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f" dependencies = [ "aho-corasick", "memchr", @@ -661,15 +659,15 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.28" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" +checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" [[package]] name = "rustversion" -version = "1.0.9" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97477e48b4cf8603ad5f7aaf897467cf42ab4218a38ef76fb14c2d6773a6d6a8" +checksum = "4f3208ce4d8448b3f3e7d168a73f5e0c43a61e32930de3bceeccedb388b6bf06" [[package]] name = "scopeguard" @@ -679,15 +677,15 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "scratch" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8132065adcfd6e02db789d9285a0deb2f3fcb04002865ab67d5fb103533898" +checksum = "1792db035ce95be60c3f8853017b3999209281c24e2ba5bc8e59bf97a0c590c1" [[package]] name = "serde" -version = "1.0.149" +version = "1.0.164" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "256b9932320c590e707b94576e3cc1f7c9024d0ee6612dfbcf1cb106cbe8e055" +checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d" [[package]] name = "shlex" @@ -697,9 +695,9 @@ checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" [[package]] name = "skim" -version = "0.10.2" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cebed5f897cd6c0d80fbe30adb36c0abf7400e93043a63ae56458495642b3485" +checksum = "e5d28de0a6cb2cdd83a076f1de9d965b973ae08b244df1aa70b432946dda0f32" dependencies = [ "atty", "beef", @@ -717,7 +715,7 @@ dependencies = [ "rayon", "regex", "shlex", - "time 0.3.17", + "time 0.3.22", "timer", "tuikit", "unicode-width", @@ -732,9 +730,20 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "syn" -version = "1.0.105" +version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b9b43d45702de4c839cb9b51d9f529c5dd26a4aff255b42b1ebc03e88ee908" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59fb7d6d8281a51045d62b8eb3a7d1ce347b76f312af50cd3dc0af39c87c1737" dependencies = [ "proc-macro2", "quote", @@ -754,9 +763,9 @@ dependencies = [ [[package]] name = "termcolor" -version = "1.1.3" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" +checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" dependencies = [ "winapi-util", ] @@ -769,30 +778,31 @@ checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "thiserror" -version = "1.0.37" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10deb33631e3c9018b9baf9dcbbc4f737320d2b576bac10f6aefa048fa407e3e" +checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.37" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "982d17546b47146b28f7c22e3d08465f6b8903d0ea13c1660d9d84a6e7adcdbb" +checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.23", ] [[package]] name = "thread_local" -version = "1.1.4" +version = "1.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5516c27b78311c50bf42c071425c560ac799b11c30b31f87e3081965fe5e0180" +checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" dependencies = [ + "cfg-if", "once_cell", ] @@ -809,9 +819,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.17" +version = "0.3.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a561bf4617eebd33bca6434b988f39ed798e527f51a1e797d0ee4f61c0a38376" +checksum = "ea9e1b3cf1243ae005d9e74085d4d542f3125458f3a81af210d901dcd7411efd" dependencies = [ "serde", "time-core", @@ -819,9 +829,9 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" +checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" [[package]] name = "timer" @@ -848,9 +858,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.5" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" +checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" [[package]] name = "unicode-width" @@ -860,15 +870,15 @@ checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" [[package]] name = "utf8parse" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "936e4b492acfd135421d8dca4b1aa80a7bfc26e702ef3af710e0752684df5372" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "vte" -version = "0.11.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aae21c12ad2ec2d168c236f369c38ff332bc1134f7246350dca641437365045" +checksum = "f5022b5fbf9407086c180e9557be968742d839e68346af7792b8592489732197" dependencies = [ "arrayvec", "utf8parse", @@ -899,9 +909,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.83" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eaf9f5aceeec8be17c128b2e93e031fb8a4d469bb9c4ae2d7dc1888b26887268" +checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -909,24 +919,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.83" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8ffb332579b0557b52d268b91feab8df3615f265d5270fec2a8c95b17c1142" +checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn", + "syn 2.0.23", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.83" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "052be0f94026e6cbc75cdefc9bae13fd6052cdcaf532fa6c45e7ae33a1e6c810" +checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -934,22 +944,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.83" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c" +checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.23", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.83" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f" +checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" [[package]] name = "winapi" @@ -981,3 +991,69 @@ name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" diff --git a/src/Access/ContextAccess.cpp b/src/Access/ContextAccess.cpp index 0d28edc0a10..9c57853679f 100644 --- a/src/Access/ContextAccess.cpp +++ b/src/Access/ContextAccess.cpp @@ -228,6 +228,12 @@ ContextAccess::ContextAccess(const AccessControl & access_control_, const Params } +ContextAccess::ContextAccess(FullAccess) + : is_full_access(true), access(std::make_shared(AccessRights::getFullAccess())), access_with_implicit(access) +{ +} + + ContextAccess::~ContextAccess() { enabled_settings.reset(); @@ -413,14 +419,8 @@ std::optional ContextAccess::getQuotaUsage() const std::shared_ptr ContextAccess::getFullAccess() { - static const std::shared_ptr res = [] - { - auto full_access = std::make_shared(); - full_access->is_full_access = true; - full_access->access = std::make_shared(AccessRights::getFullAccess()); - full_access->access_with_implicit = full_access->access; - return full_access; - }(); + static const std::shared_ptr res = + [] { return std::shared_ptr(new ContextAccess{kFullAccess}); }(); return res; } diff --git a/src/Access/ContextAccess.h b/src/Access/ContextAccess.h index 63604a03b4e..60bad0118fc 100644 --- a/src/Access/ContextAccess.h +++ b/src/Access/ContextAccess.h @@ -69,7 +69,6 @@ public: using Params = ContextAccessParams; const Params & getParams() const { return params; } - ContextAccess() { } /// NOLINT ContextAccess(const AccessControl & access_control_, const Params & params_); /// Returns the current user. Throws if user is nullptr. @@ -171,10 +170,17 @@ public: private: friend class AccessControl; + struct FullAccess {}; + static const FullAccess kFullAccess; + + /// Makes an instance of ContextAccess which provides full access to everything + /// without any limitations. This is used for the global context. + explicit ContextAccess(FullAccess); + void initialize(); - void setUser(const UserPtr & user_) const; - void setRolesInfo(const std::shared_ptr & roles_info_) const; - void calculateAccessRights() const; + void setUser(const UserPtr & user_) const TSA_REQUIRES(mutex); + void setRolesInfo(const std::shared_ptr & roles_info_) const TSA_REQUIRES(mutex); + void calculateAccessRights() const TSA_REQUIRES(mutex); template bool checkAccessImpl(const AccessFlags & flags) const; @@ -217,20 +223,23 @@ private: const AccessControl * access_control = nullptr; const Params params; - bool is_full_access = false; - mutable Poco::Logger * trace_log = nullptr; - mutable UserPtr user; - mutable String user_name; - mutable bool user_was_dropped = false; - mutable scope_guard subscription_for_user_change; - mutable std::shared_ptr enabled_roles; - mutable scope_guard subscription_for_roles_changes; - mutable std::shared_ptr roles_info; - mutable std::shared_ptr access; - mutable std::shared_ptr access_with_implicit; - mutable std::shared_ptr enabled_row_policies; - mutable std::shared_ptr enabled_quota; - mutable std::shared_ptr enabled_settings; + const bool is_full_access = false; + + mutable std::atomic user_was_dropped = false; + mutable std::atomic trace_log = nullptr; + + mutable UserPtr user TSA_GUARDED_BY(mutex); + mutable String user_name TSA_GUARDED_BY(mutex); + mutable scope_guard subscription_for_user_change TSA_GUARDED_BY(mutex); + mutable std::shared_ptr enabled_roles TSA_GUARDED_BY(mutex); + mutable scope_guard subscription_for_roles_changes TSA_GUARDED_BY(mutex); + mutable std::shared_ptr roles_info TSA_GUARDED_BY(mutex); + mutable std::shared_ptr access TSA_GUARDED_BY(mutex); + mutable std::shared_ptr access_with_implicit TSA_GUARDED_BY(mutex); + mutable std::shared_ptr enabled_row_policies TSA_GUARDED_BY(mutex); + mutable std::shared_ptr enabled_quota TSA_GUARDED_BY(mutex); + mutable std::shared_ptr enabled_settings TSA_GUARDED_BY(mutex); + mutable std::mutex mutex; }; diff --git a/src/AggregateFunctions/AggregateFunctionGroupArray.cpp b/src/AggregateFunctions/AggregateFunctionGroupArray.cpp index 15f500b8bb6..bb1368b9ff8 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupArray.cpp +++ b/src/AggregateFunctions/AggregateFunctionGroupArray.cpp @@ -25,6 +25,7 @@ IAggregateFunction * createWithNumericOrTimeType(const IDataType & argument_type WhichDataType which(argument_type); if (which.idx == TypeIndex::Date) return new AggregateFunctionTemplate(std::forward(args)...); if (which.idx == TypeIndex::DateTime) return new AggregateFunctionTemplate(std::forward(args)...); + if (which.idx == TypeIndex::IPv4) return new AggregateFunctionTemplate(std::forward(args)...); return createWithNumericType(argument_type, std::forward(args)...); } diff --git a/src/AggregateFunctions/AggregateFunctionGroupUniqArray.cpp b/src/AggregateFunctions/AggregateFunctionGroupUniqArray.cpp index 16f2feb71bf..9e8060d44cc 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupUniqArray.cpp +++ b/src/AggregateFunctions/AggregateFunctionGroupUniqArray.cpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB @@ -39,12 +40,22 @@ public: static DataTypePtr createResultType() { return std::make_shared(std::make_shared()); } }; +template +class AggregateFunctionGroupUniqArrayIPv4 : public AggregateFunctionGroupUniqArray +{ +public: + explicit AggregateFunctionGroupUniqArrayIPv4(const DataTypePtr & argument_type, const Array & parameters_, UInt64 max_elems_ = std::numeric_limits::max()) + : AggregateFunctionGroupUniqArray(argument_type, parameters_, createResultType(), max_elems_) {} + static DataTypePtr createResultType() { return std::make_shared(std::make_shared()); } +}; + template IAggregateFunction * createWithExtraTypes(const DataTypePtr & argument_type, TArgs && ... args) { WhichDataType which(argument_type); if (which.idx == TypeIndex::Date) return new AggregateFunctionGroupUniqArrayDate(argument_type, std::forward(args)...); else if (which.idx == TypeIndex::DateTime) return new AggregateFunctionGroupUniqArrayDateTime(argument_type, std::forward(args)...); + else if (which.idx == TypeIndex::IPv4) return new AggregateFunctionGroupUniqArrayIPv4(argument_type, std::forward(args)...); else { /// Check that we can use plain version of AggregateFunctionGroupUniqArrayGeneric diff --git a/src/AggregateFunctions/AggregateFunctionMap.cpp b/src/AggregateFunctions/AggregateFunctionMap.cpp index f6100602f3f..b957b541fe1 100644 --- a/src/AggregateFunctions/AggregateFunctionMap.cpp +++ b/src/AggregateFunctions/AggregateFunctionMap.cpp @@ -100,6 +100,10 @@ public: return std::make_shared>(nested_function, arguments); case TypeIndex::UUID: return std::make_shared>(nested_function, arguments); + case TypeIndex::IPv4: + return std::make_shared>(nested_function, arguments); + case TypeIndex::IPv6: + return std::make_shared>(nested_function, arguments); case TypeIndex::FixedString: case TypeIndex::String: return std::make_shared>(nested_function, arguments); diff --git a/src/AggregateFunctions/AggregateFunctionMap.h b/src/AggregateFunctions/AggregateFunctionMap.h index 55f6611974e..7b9bb088d8f 100644 --- a/src/AggregateFunctions/AggregateFunctionMap.h +++ b/src/AggregateFunctions/AggregateFunctionMap.h @@ -19,7 +19,9 @@ #include #include #include "DataTypes/Serializations/ISerialization.h" +#include #include "base/types.h" +#include #include #include "AggregateFunctions/AggregateFunctionFactory.h" @@ -69,6 +71,31 @@ struct AggregateFunctionMapCombinatorData } }; +/// Specialization for IPv6 - for historical reasons it should be stored as FixedString(16) +template <> +struct AggregateFunctionMapCombinatorData +{ + struct IPv6Hash + { + using hash_type = std::hash; + using is_transparent = void; + + size_t operator()(const IPv6 & ip) const { return hash_type{}(ip); } + }; + + using SearchType = IPv6; + std::unordered_map> merged_maps; + + static void writeKey(const IPv6 & key, WriteBuffer & buf) + { + writeIPv6Binary(key, buf); + } + static void readKey(IPv6 & key, ReadBuffer & buf) + { + readIPv6Binary(key, buf); + } +}; + template class AggregateFunctionMap final : public IAggregateFunctionDataHelper, AggregateFunctionMap> @@ -147,6 +174,8 @@ public: StringRef key_ref; if (key_type->getTypeId() == TypeIndex::FixedString) key_ref = assert_cast(key_column).getDataAt(offset + i); + else if (key_type->getTypeId() == TypeIndex::IPv6) + key_ref = assert_cast(key_column).getDataAt(offset + i); else key_ref = assert_cast(key_column).getDataAt(offset + i); diff --git a/src/AggregateFunctions/AggregateFunctionTopK.cpp b/src/AggregateFunctions/AggregateFunctionTopK.cpp index e568694df02..8f6652223cc 100644 --- a/src/AggregateFunctions/AggregateFunctionTopK.cpp +++ b/src/AggregateFunctions/AggregateFunctionTopK.cpp @@ -5,6 +5,7 @@ #include #include #include +#include static inline constexpr UInt64 TOP_K_MAX_SIZE = 0xFFFFFF; @@ -60,6 +61,22 @@ public: {} }; +template +class AggregateFunctionTopKIPv4 : public AggregateFunctionTopK +{ +public: + using AggregateFunctionTopK::AggregateFunctionTopK; + + AggregateFunctionTopKIPv4(UInt64 threshold_, UInt64 load_factor, const DataTypes & argument_types_, const Array & params) + : AggregateFunctionTopK( + threshold_, + load_factor, + argument_types_, + params, + std::make_shared(std::make_shared())) + {} +}; + template IAggregateFunction * createWithExtraTypes(const DataTypes & argument_types, UInt64 threshold, UInt64 load_factor, const Array & params) @@ -72,6 +89,8 @@ IAggregateFunction * createWithExtraTypes(const DataTypes & argument_types, UInt return new AggregateFunctionTopKDate(threshold, load_factor, argument_types, params); if (which.idx == TypeIndex::DateTime) return new AggregateFunctionTopKDateTime(threshold, load_factor, argument_types, params); + if (which.idx == TypeIndex::IPv4) + return new AggregateFunctionTopKIPv4(threshold, load_factor, argument_types, params); /// Check that we can use plain version of AggregateFunctionTopKGeneric if (argument_types[0]->isValueUnambiguouslyRepresentedInContiguousMemoryRegion()) diff --git a/src/AggregateFunctions/AggregateFunctionUniq.cpp b/src/AggregateFunctions/AggregateFunctionUniq.cpp index bf998c5ee9e..748a232641e 100644 --- a/src/AggregateFunctions/AggregateFunctionUniq.cpp +++ b/src/AggregateFunctions/AggregateFunctionUniq.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -60,6 +61,10 @@ createAggregateFunctionUniq(const std::string & name, const DataTypes & argument return std::make_shared>(argument_types); else if (which.isUUID()) return std::make_shared>(argument_types); + else if (which.isIPv4()) + return std::make_shared>(argument_types); + else if (which.isIPv6()) + return std::make_shared>(argument_types); else if (which.isTuple()) { if (use_exact_hash_function) @@ -109,6 +114,10 @@ createAggregateFunctionUniq(const std::string & name, const DataTypes & argument return std::make_shared>>(argument_types); else if (which.isUUID()) return std::make_shared>>(argument_types); + else if (which.isIPv4()) + return std::make_shared>>(argument_types); + else if (which.isIPv6()) + return std::make_shared>>(argument_types); else if (which.isTuple()) { if (use_exact_hash_function) diff --git a/src/AggregateFunctions/AggregateFunctionUniq.h b/src/AggregateFunctions/AggregateFunctionUniq.h index c782b9314fd..de68e9076a0 100644 --- a/src/AggregateFunctions/AggregateFunctionUniq.h +++ b/src/AggregateFunctions/AggregateFunctionUniq.h @@ -101,6 +101,18 @@ struct AggregateFunctionUniqHLL12Data static String getName() { return "uniqHLL12"; } }; +template <> +struct AggregateFunctionUniqHLL12Data +{ + using Set = HyperLogLogWithSmallSetOptimization; + Set set; + + constexpr static bool is_able_to_parallelize_merge = false; + constexpr static bool is_variadic = false; + + static String getName() { return "uniqHLL12"; } +}; + template struct AggregateFunctionUniqHLL12DataForVariadic { @@ -155,6 +167,25 @@ struct AggregateFunctionUniqExactData static String getName() { return "uniqExact"; } }; +/// For historical reasons IPv6 is treated as FixedString(16) +template +struct AggregateFunctionUniqExactData +{ + using Key = UInt128; + + /// When creating, the hash table must be small. + using SingleLevelSet = HashSet, HashTableAllocatorWithStackMemory>; + using TwoLevelSet = TwoLevelHashSet; + using Set = UniqExactSet; + + Set set; + + constexpr static bool is_able_to_parallelize_merge = is_able_to_parallelize_merge_; + constexpr static bool is_variadic = false; + + static String getName() { return "uniqExact"; } +}; + template struct AggregateFunctionUniqExactDataForVariadic : AggregateFunctionUniqExactData { @@ -248,27 +279,22 @@ struct Adder AggregateFunctionUniqUniquesHashSetData> || std::is_same_v>) { const auto & column = *columns[0]; - if constexpr (!std::is_same_v) + if constexpr (std::is_same_v || std::is_same_v) + { + StringRef value = column.getDataAt(row_num); + data.set.insert(CityHash_v1_0_2::CityHash64(value.data, value.size)); + } + else { using ValueType = typename decltype(data.set)::value_type; const auto & value = assert_cast &>(column).getElement(row_num); data.set.insert(static_cast(AggregateFunctionUniqTraits::hash(value))); } - else - { - StringRef value = column.getDataAt(row_num); - data.set.insert(CityHash_v1_0_2::CityHash64(value.data, value.size)); - } } else if constexpr (std::is_same_v>) { const auto & column = *columns[0]; - if constexpr (!std::is_same_v) - { - data.set.template insert( - assert_cast &>(column).getData()[row_num]); - } - else + if constexpr (std::is_same_v || std::is_same_v) { StringRef value = column.getDataAt(row_num); @@ -279,6 +305,11 @@ struct Adder data.set.template insert(key); } + else + { + data.set.template insert( + assert_cast &>(column).getData()[row_num]); + } } #if USE_DATASKETCHES else if constexpr (std::is_same_v) diff --git a/src/AggregateFunctions/AggregateFunctionUniqCombined.cpp b/src/AggregateFunctions/AggregateFunctionUniqCombined.cpp index 1c59da59e83..8c2cb6ea0de 100644 --- a/src/AggregateFunctions/AggregateFunctionUniqCombined.cpp +++ b/src/AggregateFunctions/AggregateFunctionUniqCombined.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -60,6 +61,10 @@ namespace return std::make_shared::template AggregateFunction>(argument_types, params); else if (which.isUUID()) return std::make_shared::template AggregateFunction>(argument_types, params); + else if (which.isIPv4()) + return std::make_shared::template AggregateFunction>(argument_types, params); + else if (which.isIPv6()) + return std::make_shared::template AggregateFunction>(argument_types, params); else if (which.isTuple()) { if (use_exact_hash_function) diff --git a/src/AggregateFunctions/AggregateFunctionUniqCombined.h b/src/AggregateFunctions/AggregateFunctionUniqCombined.h index d879e3b3dde..5e8fa69f9de 100644 --- a/src/AggregateFunctions/AggregateFunctionUniqCombined.h +++ b/src/AggregateFunctions/AggregateFunctionUniqCombined.h @@ -119,6 +119,10 @@ struct AggregateFunctionUniqCombinedData : public Aggr { }; +template +struct AggregateFunctionUniqCombinedData : public AggregateFunctionUniqCombinedDataWithKey +{ +}; template class AggregateFunctionUniqCombined final @@ -141,16 +145,16 @@ public: void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { - if constexpr (!std::is_same_v) - { - const auto & value = assert_cast &>(*columns[0]).getElement(row_num); - this->data(place).set.insert(detail::AggregateFunctionUniqCombinedTraits::hash(value)); - } - else + if constexpr (std::is_same_v || std::is_same_v) { StringRef value = columns[0]->getDataAt(row_num); this->data(place).set.insert(CityHash_v1_0_2::CityHash64(value.data, value.size)); } + else + { + const auto & value = assert_cast &>(*columns[0]).getElement(row_num); + this->data(place).set.insert(detail::AggregateFunctionUniqCombinedTraits::hash(value)); + } } void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override diff --git a/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.cpp b/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.cpp index 284e7740a58..fcb8ebd1f22 100644 --- a/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.cpp +++ b/src/BridgeHelper/ExternalDictionaryLibraryBridgeHelper.cpp @@ -239,6 +239,7 @@ QueryPipeline ExternalDictionaryLibraryBridgeHelper::loadKeys(const Block & requ WriteBufferFromOStream out_buffer(os); auto output_format = getContext()->getOutputFormat(ExternalDictionaryLibraryBridgeHelper::DEFAULT_FORMAT, out_buffer, requested_block.cloneEmpty()); formatBlock(output_format, requested_block); + out_buffer.finalize(); }; return QueryPipeline(loadBase(uri, out_stream_callback)); } diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 34b3b1e228a..9befe734298 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -362,7 +362,7 @@ ASTPtr ClientBase::parseQuery(const char *& pos, const char * end, bool allow_mu std::cout << std::endl; WriteBufferFromOStream res_buf(std::cout, 4096); formatAST(*res, res_buf); - res_buf.next(); + res_buf.finalize(); std::cout << std::endl << std::endl; } @@ -2297,7 +2297,9 @@ void ClientBase::runInteractive() catch (const ErrnoException & e) { if (e.getErrno() != EEXIST) - throw; + { + std::cerr << getCurrentExceptionMessage(false) << '\n'; + } } } diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index be226bfd9dd..cd102f46ffe 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -1175,16 +1175,12 @@ ProfileInfo Connection::receiveProfileInfo() const ParallelReadRequest Connection::receiveParallelReadRequest() const { - ParallelReadRequest request; - request.deserialize(*in); - return request; + return ParallelReadRequest::deserialize(*in); } InitialAllRangesAnnouncement Connection::receiveInitialParallelReadAnnounecement() const { - InitialAllRangesAnnouncement announcement; - announcement.deserialize(*in); - return announcement; + return InitialAllRangesAnnouncement::deserialize(*in); } diff --git a/src/Client/ConnectionParameters.cpp b/src/Client/ConnectionParameters.cpp index c47d217d432..f6630a06939 100644 --- a/src/Client/ConnectionParameters.cpp +++ b/src/Client/ConnectionParameters.cpp @@ -60,7 +60,15 @@ ConnectionParameters::ConnectionParameters(const Poco::Util::AbstractConfigurati quota_key = config.getString("quota_key", ""); /// By default compression is disabled if address looks like localhost. - compression = config.getBool("compression", !isLocalAddress(DNSResolver::instance().resolveHost(host))) + + /// Avoid DNS request if the host is "localhost". + /// If ClickHouse is run under QEMU-user with a binary for a different architecture, + /// and there are all listed startup dependency shared libraries available, but not the runtime dependencies of glibc, + /// the glibc cannot open "plugins" for DNS resolving, and the DNS resolution does not work. + /// At the same time, I want clickhouse-local to always work, regardless. + /// TODO: get rid of glibc, or replace getaddrinfo to c-ares. + + compression = config.getBool("compression", host != "localhost" && !isLocalAddress(DNSResolver::instance().resolveHost(host))) ? Protocol::Compression::Enable : Protocol::Compression::Disable; timeouts = ConnectionTimeouts( diff --git a/src/Client/IServerConnection.h b/src/Client/IServerConnection.h index 52382ff9d45..a0c029c79fb 100644 --- a/src/Client/IServerConnection.h +++ b/src/Client/IServerConnection.h @@ -16,6 +16,10 @@ #include +#include +#include +#include +#include namespace DB { @@ -34,9 +38,9 @@ struct Packet ProfileInfo profile_info; std::vector part_uuids; - InitialAllRangesAnnouncement announcement; - ParallelReadRequest request; - ParallelReadResponse response; + /// The part of parallel replicas protocol + std::optional announcement; + std::optional request; std::string server_timezone; diff --git a/src/Client/QueryFuzzer.cpp b/src/Client/QueryFuzzer.cpp index bfcfe659982..39ca7a5ed88 100644 --- a/src/Client/QueryFuzzer.cpp +++ b/src/Client/QueryFuzzer.cpp @@ -1244,7 +1244,7 @@ void QueryFuzzer::fuzzMain(ASTPtr & ast) std::cout << std::endl; WriteBufferFromOStream ast_buf(std::cout, 4096); formatAST(*ast, ast_buf, false /*highlight*/); - ast_buf.next(); + ast_buf.finalize(); std::cout << std::endl << std::endl; } diff --git a/src/Client/Suggest.cpp b/src/Client/Suggest.cpp index 6e989e10f76..1723f85dc16 100644 --- a/src/Client/Suggest.cpp +++ b/src/Client/Suggest.cpp @@ -101,9 +101,8 @@ static String getLoadSuggestionQuery(Int32 suggestion_limit, bool basic_suggesti add_column("name", "columns", true, suggestion_limit); } - /// FIXME: Forbid this query using new analyzer because of bug https://github.com/ClickHouse/ClickHouse/issues/50669 - /// We should remove this restriction after resolving this bug. - query = "SELECT DISTINCT arrayJoin(extractAll(name, '[\\\\w_]{2,}')) AS res FROM (" + query + ") WHERE notEmpty(res) SETTINGS allow_experimental_analyzer=0"; + /// FIXME: This query does not work with the new analyzer because of bug https://github.com/ClickHouse/ClickHouse/issues/50669 + query = "SELECT DISTINCT arrayJoin(extractAll(name, '[\\\\w_]{2,}')) AS res FROM (" + query + ") WHERE notEmpty(res)"; return query; } diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index f2ddb7a84c0..105a7c0548f 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -188,6 +188,7 @@ M(CacheDetachedFileSegments, "Number of existing detached cache file segments") \ M(FilesystemCacheSize, "Filesystem cache size in bytes") \ M(FilesystemCacheElements, "Filesystem cache elements (file segments)") \ + M(FilesystemCacheDownloadQueueElements, "Filesystem cache elements in download queue") \ M(AsyncInsertCacheSize, "Number of async insert hash id in cache") \ M(S3Requests, "S3 requests") \ M(KeeperAliveConnections, "Number of alive connections") \ diff --git a/src/Common/Exception.cpp b/src/Common/Exception.cpp index 20206b76225..ee268be45f6 100644 --- a/src/Common/Exception.cpp +++ b/src/Common/Exception.cpp @@ -1,5 +1,6 @@ #include "Exception.h" +#include #include #include #include @@ -83,6 +84,7 @@ Exception::Exception(const MessageMasked & msg_masked, int code, bool remote_) : Poco::Exception(msg_masked.msg, code) , remote(remote_) { + capture_thread_frame_pointers = thread_frame_pointers; handle_error_code(msg_masked.msg, code, remote, getStackFramePointers()); } @@ -90,12 +92,14 @@ Exception::Exception(MessageMasked && msg_masked, int code, bool remote_) : Poco::Exception(msg_masked.msg, code) , remote(remote_) { + capture_thread_frame_pointers = thread_frame_pointers; handle_error_code(message(), code, remote, getStackFramePointers()); } Exception::Exception(CreateFromPocoTag, const Poco::Exception & exc) : Poco::Exception(exc.displayText(), ErrorCodes::POCO_EXCEPTION) { + capture_thread_frame_pointers = thread_frame_pointers; #ifdef STD_EXCEPTION_HAS_STACK_TRACE auto * stack_trace_frames = exc.get_stack_trace_frames(); auto stack_trace_size = exc.get_stack_trace_size(); @@ -107,6 +111,7 @@ Exception::Exception(CreateFromPocoTag, const Poco::Exception & exc) Exception::Exception(CreateFromSTDTag, const std::exception & exc) : Poco::Exception(demangle(typeid(exc).name()) + ": " + String(exc.what()), ErrorCodes::STD_EXCEPTION) { + capture_thread_frame_pointers = thread_frame_pointers; #ifdef STD_EXCEPTION_HAS_STACK_TRACE auto * stack_trace_frames = exc.get_stack_trace_frames(); auto stack_trace_size = exc.get_stack_trace_size(); @@ -153,7 +158,17 @@ std::string Exception::getStackTraceString() const auto * stack_trace_frames = get_stack_trace_frames(); auto stack_trace_size = get_stack_trace_size(); __msan_unpoison(stack_trace_frames, stack_trace_size * sizeof(stack_trace_frames[0])); - return StackTrace::toString(stack_trace_frames, 0, stack_trace_size); + String thread_stack_trace; + std::for_each(capture_thread_frame_pointers.rbegin(), capture_thread_frame_pointers.rend(), + [&thread_stack_trace](StackTrace::FramePointers & frame_pointers) + { + thread_stack_trace += + "\nJob's origin stack trace:\n" + + StackTrace::toString(frame_pointers.data(), 0, std::ranges::find(frame_pointers, nullptr) - frame_pointers.begin()); + } + ); + + return StackTrace::toString(stack_trace_frames, 0, stack_trace_size) + thread_stack_trace; #else return trace.toString(); #endif @@ -185,6 +200,9 @@ Exception::FramePointers Exception::getStackFramePointers() const return frame_pointers; } +thread_local bool Exception::enable_job_stack_trace = false; +thread_local std::vector Exception::thread_frame_pointers = {}; + void throwFromErrno(const std::string & s, int code, int the_errno) { diff --git a/src/Common/Exception.h b/src/Common/Exception.h index 170e0d32b3c..f80dfe7f0a2 100644 --- a/src/Common/Exception.h +++ b/src/Common/Exception.h @@ -25,18 +25,27 @@ class Exception : public Poco::Exception public: using FramePointers = std::vector; - Exception() = default; + Exception() + { + capture_thread_frame_pointers = thread_frame_pointers; + } Exception(const PreformattedMessage & msg, int code): Exception(msg.text, code) { + capture_thread_frame_pointers = thread_frame_pointers; message_format_string = msg.format_string; } Exception(PreformattedMessage && msg, int code): Exception(std::move(msg.text), code) { + capture_thread_frame_pointers = thread_frame_pointers; message_format_string = msg.format_string; } + /// Collect call stacks of all previous jobs' schedulings leading to this thread job's execution + static thread_local bool enable_job_stack_trace; + static thread_local std::vector thread_frame_pointers; + protected: // used to remove the sensitive information from exceptions if query_masking_rules is configured struct MessageMasked @@ -66,6 +75,7 @@ public: Exception(int code, T && message) : Exception(message, code) { + capture_thread_frame_pointers = thread_frame_pointers; message_format_string = tryGetStaticFormatString(message); } @@ -80,6 +90,7 @@ public: Exception(int code, FormatStringHelper fmt, Args &&... args) : Exception(fmt::format(fmt.fmt_str, std::forward(args)...), code) { + capture_thread_frame_pointers = thread_frame_pointers; message_format_string = fmt.message_format_string; } @@ -131,6 +142,8 @@ private: protected: std::string_view message_format_string; + /// Local copy of static per-thread thread_frame_pointers, should be mutable to be unpoisoned on printout + mutable std::vector capture_thread_frame_pointers; }; diff --git a/src/Common/OptimizedRegularExpression.cpp b/src/Common/OptimizedRegularExpression.cpp index f2fe922ef19..c542945c78d 100644 --- a/src/Common/OptimizedRegularExpression.cpp +++ b/src/Common/OptimizedRegularExpression.cpp @@ -540,7 +540,7 @@ bool OptimizedRegularExpressionImpl::match(const char * subject, si } } - return re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, nullptr, 0); + return re2->Match({subject, subject_size}, 0, subject_size, RegexType::UNANCHORED, nullptr, 0); } } @@ -585,9 +585,9 @@ bool OptimizedRegularExpressionImpl::match(const char * subject, si return false; } - StringPieceType piece; + std::string_view piece; - if (!RegexType::PartialMatch(StringPieceType(subject, subject_size), *re2, &piece)) + if (!RegexType::PartialMatch({subject, subject_size}, *re2, &piece)) return false; else { @@ -652,10 +652,10 @@ unsigned OptimizedRegularExpressionImpl::match(const char * subject return 0; } - DB::PODArrayWithStackMemory pieces(limit); + DB::PODArrayWithStackMemory pieces(limit); if (!re2->Match( - StringPieceType(subject, subject_size), + {subject, subject_size}, 0, subject_size, RegexType::UNANCHORED, @@ -669,16 +669,16 @@ unsigned OptimizedRegularExpressionImpl::match(const char * subject matches.resize(limit); for (size_t i = 0; i < limit; ++i) { - if (pieces[i] != nullptr) - { - matches[i].offset = pieces[i].data() - subject; - matches[i].length = pieces[i].length(); - } - else + if (pieces[i].empty()) { matches[i].offset = std::string::npos; matches[i].length = 0; } + else + { + matches[i].offset = pieces[i].data() - subject; + matches[i].length = pieces[i].length(); + } } return limit; } diff --git a/src/Common/OptimizedRegularExpression.h b/src/Common/OptimizedRegularExpression.h index f6b59f0a465..51f1bc200e4 100644 --- a/src/Common/OptimizedRegularExpression.h +++ b/src/Common/OptimizedRegularExpression.h @@ -52,7 +52,6 @@ public: using MatchVec = std::vector; using RegexType = std::conditional_t; - using StringPieceType = std::conditional_t; OptimizedRegularExpressionImpl(const std::string & regexp_, int options = 0); /// NOLINT /// StringSearcher store pointers to required_substring, it must be updated on move. diff --git a/src/Common/SensitiveDataMasker.cpp b/src/Common/SensitiveDataMasker.cpp index 34db78d00fb..b59a4758822 100644 --- a/src/Common/SensitiveDataMasker.cpp +++ b/src/Common/SensitiveDataMasker.cpp @@ -5,7 +5,6 @@ #include #include -#include #include @@ -44,7 +43,7 @@ private: const std::string regexp_string; const RE2 regexp; - const re2::StringPiece replacement; + const std::string_view replacement; #ifndef NDEBUG mutable std::atomic matches_count = 0; diff --git a/src/Common/StackTrace.cpp b/src/Common/StackTrace.cpp index e6b7839c4e2..aea0f854fe1 100644 --- a/src/Common/StackTrace.cpp +++ b/src/Common/StackTrace.cpp @@ -412,6 +412,21 @@ void StackTrace::toStringEveryLine(std::function callbac toStringEveryLineImpl(true, {frame_pointers, offset, size}, std::move(callback)); } +void StackTrace::toStringEveryLine(const FramePointers & frame_pointers, std::function callback) +{ + toStringEveryLineImpl(true, {frame_pointers, 0, static_cast(std::ranges::find(frame_pointers, nullptr) - frame_pointers.begin())}, std::move(callback)); +} + +void StackTrace::toStringEveryLine(void ** frame_pointers_raw, size_t offset, size_t size, std::function callback) +{ + __msan_unpoison(frame_pointers_raw, size * sizeof(*frame_pointers_raw)); + + StackTrace::FramePointers frame_pointers{}; + std::copy_n(frame_pointers_raw, size, frame_pointers.begin()); + + toStringEveryLineImpl(true, {frame_pointers, offset, size}, std::move(callback)); +} + using StackTraceCache = std::map>; static StackTraceCache & cacheInstance() diff --git a/src/Common/StackTrace.h b/src/Common/StackTrace.h index 3940c880c5b..656f543d837 100644 --- a/src/Common/StackTrace.h +++ b/src/Common/StackTrace.h @@ -65,6 +65,8 @@ public: static void symbolize(const FramePointers & frame_pointers, size_t offset, size_t size, StackTrace::Frames & frames); void toStringEveryLine(std::function callback) const; + static void toStringEveryLine(const FramePointers & frame_pointers, std::function callback); + static void toStringEveryLine(void ** frame_pointers_raw, size_t offset, size_t size, std::function callback); /// Displaying the addresses can be disabled for security reasons. /// If you turn off addresses, it will be more secure, but we will be unable to help you with debugging. diff --git a/src/Common/ThreadPool.cpp b/src/Common/ThreadPool.cpp index f91849ead66..cc88594d84f 100644 --- a/src/Common/ThreadPool.cpp +++ b/src/Common/ThreadPool.cpp @@ -189,7 +189,9 @@ ReturnType ThreadPoolImpl::scheduleImpl(Job job, Priority priority, std: jobs.emplace(std::move(job), priority, /// Tracing context on this thread is used as parent context for the sub-thread that runs the job - propagate_opentelemetry_tracing_context ? DB::OpenTelemetry::CurrentContext() : DB::OpenTelemetry::TracingContextOnThread()); + propagate_opentelemetry_tracing_context ? DB::OpenTelemetry::CurrentContext() : DB::OpenTelemetry::TracingContextOnThread(), + /// capture_frame_pointers + DB::Exception::enable_job_stack_trace); ++scheduled_jobs; } @@ -348,6 +350,8 @@ void ThreadPoolImpl::worker(typename std::list::iterator thread_ /// A copy of parent trace context DB::OpenTelemetry::TracingContextOnThread parent_thread_trace_context; + std::vector thread_frame_pointers; + /// Get a job from the queue. Job job; @@ -393,6 +397,9 @@ void ThreadPoolImpl::worker(typename std::list::iterator thread_ /// to prevent us from modifying its priority. We have to use const_cast to force move semantics on JobWithPriority::job. job = std::move(const_cast(jobs.top().job)); parent_thread_trace_context = std::move(const_cast(jobs.top().thread_trace_context)); + DB::Exception::enable_job_stack_trace = jobs.top().enable_job_stack_trace; + if (DB::Exception::enable_job_stack_trace) + thread_frame_pointers = std::move(const_cast &>(jobs.top().frame_pointers)); jobs.pop(); /// We don't run jobs after `shutdown` is set, but we have to properly dequeue all jobs and finish them. @@ -411,6 +418,10 @@ void ThreadPoolImpl::worker(typename std::list::iterator thread_ /// Run the job. try { + if (DB::Exception::enable_job_stack_trace) + DB::Exception::thread_frame_pointers = std::move(thread_frame_pointers); + + CurrentMetrics::Increment metric_active_pool_threads(metric_active_threads); job(); diff --git a/src/Common/ThreadPool.h b/src/Common/ThreadPool.h index 70053ff7558..f5721146e09 100644 --- a/src/Common/ThreadPool.h +++ b/src/Common/ThreadPool.h @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include /** Very simple thread pool similar to boost::threadpool. @@ -127,8 +129,19 @@ private: Priority priority; DB::OpenTelemetry::TracingContextOnThread thread_trace_context; - JobWithPriority(Job job_, Priority priority_, const DB::OpenTelemetry::TracingContextOnThread & thread_trace_context_) - : job(job_), priority(priority_), thread_trace_context(thread_trace_context_) {} + /// Call stacks of all jobs' schedulings leading to this one + std::vector frame_pointers; + bool enable_job_stack_trace = false; + + JobWithPriority(Job job_, Priority priority_, const DB::OpenTelemetry::TracingContextOnThread & thread_trace_context_, bool capture_frame_pointers = false) + : job(job_), priority(priority_), thread_trace_context(thread_trace_context_), enable_job_stack_trace(capture_frame_pointers) + { + if (!capture_frame_pointers) + return; + /// Save all previous jobs call stacks and append with current + frame_pointers = DB::Exception::thread_frame_pointers; + frame_pointers.push_back(StackTrace().getFramePointers()); + } bool operator<(const JobWithPriority & rhs) const { diff --git a/src/Common/ThreadStatus.cpp b/src/Common/ThreadStatus.cpp index 9b0743d89c3..7a602afe7e7 100644 --- a/src/Common/ThreadStatus.cpp +++ b/src/Common/ThreadStatus.cpp @@ -67,8 +67,8 @@ ThreadGroup::ThreadGroup() : master_thread_id(CurrentThread::get().thread_id) {} -ThreadStatus::ThreadStatus() - : thread_id{getThreadId()} +ThreadStatus::ThreadStatus(bool check_current_thread_on_destruction_) + : thread_id{getThreadId()}, check_current_thread_on_destruction(check_current_thread_on_destruction_) { last_rusage = std::make_unique(); @@ -201,8 +201,11 @@ ThreadStatus::~ThreadStatus() /// Only change current_thread if it's currently being used by this ThreadStatus /// For example, PushingToViews chain creates and deletes ThreadStatus instances while running in the main query thread - if (current_thread == this) + if (check_current_thread_on_destruction) + { + assert(current_thread == this); current_thread = nullptr; + } } void ThreadStatus::updatePerformanceCounters() diff --git a/src/Common/ThreadStatus.h b/src/Common/ThreadStatus.h index 061959d9f1f..7c8dbdb68bd 100644 --- a/src/Common/ThreadStatus.h +++ b/src/Common/ThreadStatus.h @@ -224,8 +224,10 @@ private: Poco::Logger * log = nullptr; + bool check_current_thread_on_destruction; + public: - ThreadStatus(); + explicit ThreadStatus(bool check_current_thread_on_destruction_ = true); ~ThreadStatus(); ThreadGroupPtr getThreadGroup() const; @@ -290,6 +292,7 @@ public: void flushUntrackedMemory(); private: + void applyGlobalSettings(); void applyQuerySettings(); void initPerformanceCounters(); diff --git a/src/Common/ZooKeeper/CMakeLists.txt b/src/Common/ZooKeeper/CMakeLists.txt index a9a335d1461..3f7e87ff4a7 100644 --- a/src/Common/ZooKeeper/CMakeLists.txt +++ b/src/Common/ZooKeeper/CMakeLists.txt @@ -2,6 +2,8 @@ include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake") add_headers_and_sources(clickhouse_common_zookeeper .) +list(APPEND clickhouse_common_zookeeper_sources ${CMAKE_CURRENT_SOURCE_DIR}/../../../src/Coordination/KeeperFeatureFlags.cpp) + # for clickhouse server add_library(clickhouse_common_zookeeper ${clickhouse_common_zookeeper_headers} ${clickhouse_common_zookeeper_sources}) target_compile_definitions (clickhouse_common_zookeeper PRIVATE -DZOOKEEPER_LOG) diff --git a/src/Common/ZooKeeper/IKeeper.h b/src/Common/ZooKeeper/IKeeper.h index 3eb5819df90..2703c1079c0 100644 --- a/src/Common/ZooKeeper/IKeeper.h +++ b/src/Common/ZooKeeper/IKeeper.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include @@ -530,7 +530,9 @@ public: const Requests & requests, MultiCallback callback) = 0; - virtual DB::KeeperApiVersion getApiVersion() const = 0; + virtual bool isFeatureEnabled(DB::KeeperFeatureFlag feature_flag) const = 0; + + virtual const DB::KeeperFeatureFlags * getKeeperFeatureFlags() const { return nullptr; } /// Expire session and finish all pending requests virtual void finalize(const String & reason) = 0; diff --git a/src/Common/ZooKeeper/TestKeeper.h b/src/Common/ZooKeeper/TestKeeper.h index 4bffa4e1d4f..9bbd018cfb1 100644 --- a/src/Common/ZooKeeper/TestKeeper.h +++ b/src/Common/ZooKeeper/TestKeeper.h @@ -11,6 +11,7 @@ #include #include #include +#include namespace Coordination @@ -92,9 +93,9 @@ public: void finalize(const String & reason) override; - DB::KeeperApiVersion getApiVersion() const override + bool isFeatureEnabled(DB::KeeperFeatureFlag) const override { - return KeeperApiVersion::ZOOKEEPER_COMPATIBLE; + return false; } struct Node diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index 62807fe2433..5dd7948276d 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -865,9 +865,9 @@ bool ZooKeeper::expired() return impl->isExpired(); } -DB::KeeperApiVersion ZooKeeper::getApiVersion() const +bool ZooKeeper::isFeatureEnabled(DB::KeeperFeatureFlag feature_flag) const { - return impl->getApiVersion(); + return impl->isFeatureEnabled(feature_flag); } Int64 ZooKeeper::getClientID() diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index f5e00b5f283..1fcb048add2 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -215,7 +216,7 @@ public: /// Returns true, if the session has expired. bool expired(); - DB::KeeperApiVersion getApiVersion() const; + bool isFeatureEnabled(DB::KeeperFeatureFlag feature_flag) const; /// Create a znode. /// Throw an exception if something went wrong. @@ -528,6 +529,8 @@ public: size_t getConnectedZooKeeperIndex() const { return connected_zk_index; } UInt64 getConnectedTime() const { return connected_time; } + const DB::KeeperFeatureFlags * getKeeperFeatureFlags() const { return impl->getKeeperFeatureFlags(); } + private: void init(ZooKeeperArgs args_); @@ -554,7 +557,7 @@ private: template MultiReadResponses multiRead(TIter start, TIter end, RequestFactory request_factory, AsyncFunction async_fun) { - if (getApiVersion() >= DB::KeeperApiVersion::WITH_MULTI_READ) + if (isFeatureEnabled(DB::KeeperFeatureFlag::MULTI_READ)) { Coordination::Requests requests; for (auto it = start; it != end; ++it) @@ -687,7 +690,7 @@ String getZooKeeperConfigName(const Poco::Util::AbstractConfiguration & config); template void addCheckNotExistsRequest(Coordination::Requests & requests, const Client & client, const std::string & path) { - if (client.getApiVersion() >= DB::KeeperApiVersion::WITH_CHECK_NOT_EXISTS) + if (client.isFeatureEnabled(DB::KeeperFeatureFlag::CHECK_NOT_EXISTS)) { auto request = std::make_shared(); request->path = path; diff --git a/src/Common/ZooKeeper/ZooKeeperArgs.cpp b/src/Common/ZooKeeper/ZooKeeperArgs.cpp index 0ebc7f667cb..e99285b0056 100644 --- a/src/Common/ZooKeeper/ZooKeeperArgs.cpp +++ b/src/Common/ZooKeeper/ZooKeeperArgs.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include namespace DB diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 14b31c6a411..0f27d078234 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -354,7 +354,8 @@ ZooKeeper::ZooKeeper( send_thread = ThreadFromGlobalPool([this] { sendThread(); }); receive_thread = ThreadFromGlobalPool([this] { receiveThread(); }); - initApiVersion(); + initFeatureFlags(); + keeper_feature_flags.logFlags(log); ProfileEvents::increment(ProfileEvents::ZooKeeperInit); } @@ -362,6 +363,16 @@ ZooKeeper::ZooKeeper( { tryLogCurrentException(log, "Failed to connect to ZooKeeper"); + try + { + requests_queue.finish(); + socket.shutdown(); + } + catch (...) + { + tryLogCurrentException(log); + } + send_thread.join(); receive_thread.join(); @@ -1089,44 +1100,64 @@ void ZooKeeper::pushRequest(RequestInfo && info) ProfileEvents::increment(ProfileEvents::ZooKeeperTransactions); } -KeeperApiVersion ZooKeeper::getApiVersion() const +bool ZooKeeper::isFeatureEnabled(KeeperFeatureFlag feature_flag) const { - return keeper_api_version; + return keeper_feature_flags.isEnabled(feature_flag); } -void ZooKeeper::initApiVersion() +void ZooKeeper::initFeatureFlags() { - auto promise = std::make_shared>(); - auto future = promise->get_future(); - - auto callback = [promise](const Coordination::GetResponse & response) mutable + const auto try_get = [&](const std::string & path, const std::string & description) -> std::optional { - promise->set_value(response); + auto promise = std::make_shared>(); + auto future = promise->get_future(); + + auto callback = [promise](const Coordination::GetResponse & response) mutable + { + promise->set_value(response); + }; + + get(path, std::move(callback), {}); + if (future.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready) + throw Exception(Error::ZOPERATIONTIMEOUT, "Failed to get {}: timeout", description); + + auto response = future.get(); + + if (response.error == Coordination::Error::ZNONODE) + { + LOG_TRACE(log, "Failed to get {}", description); + return std::nullopt; + } + else if (response.error != Coordination::Error::ZOK) + { + throw Exception(response.error, "Failed to get {}", description); + } + + return std::move(response.data); }; - get(keeper_api_version_path, std::move(callback), {}); - if (future.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready) + if (auto feature_flags = try_get(keeper_api_feature_flags_path, "feature flags"); feature_flags.has_value()) { - throw Exception(Error::ZOPERATIONTIMEOUT, "Failed to get API version: timeout"); + keeper_feature_flags.setFeatureFlags(std::move(*feature_flags)); + return; } - auto response = future.get(); + auto keeper_api_version_string = try_get(keeper_api_version_path, "API version"); - if (response.error == Coordination::Error::ZNONODE) + DB::KeeperApiVersion keeper_api_version{DB::KeeperApiVersion::ZOOKEEPER_COMPATIBLE}; + + if (!keeper_api_version_string.has_value()) { LOG_TRACE(log, "API version not found, assuming {}", keeper_api_version); return; } - else if (response.error != Coordination::Error::ZOK) - { - throw Exception(response.error, "Failed to get API version"); - } + DB::ReadBufferFromOwnString buf(*keeper_api_version_string); uint8_t keeper_version{0}; - DB::ReadBufferFromOwnString buf(response.data); DB::readIntText(keeper_version, buf); keeper_api_version = static_cast(keeper_version); LOG_TRACE(log, "Detected server's API version: {}", keeper_api_version); + keeper_feature_flags.fromApiVersion(keeper_api_version); } @@ -1246,7 +1277,7 @@ void ZooKeeper::list( WatchCallback watch) { std::shared_ptr request{nullptr}; - if (keeper_api_version < Coordination::KeeperApiVersion::WITH_FILTERED_LIST) + if (!isFeatureEnabled(KeeperFeatureFlag::FILTERED_LIST)) { if (list_request_type != ListRequestType::ALL) throw Exception(Error::ZBADARGUMENTS, "Filtered list request type cannot be used because it's not supported by the server"); @@ -1311,7 +1342,7 @@ void ZooKeeper::multi( { ZooKeeperMultiRequest request(requests, default_acls); - if (request.getOpNum() == OpNum::MultiRead && keeper_api_version < Coordination::KeeperApiVersion::WITH_MULTI_READ) + if (request.getOpNum() == OpNum::MultiRead && !isFeatureEnabled(KeeperFeatureFlag::MULTI_READ)) throw Exception(Error::ZBADARGUMENTS, "MultiRead request type cannot be used because it's not supported by the server"); RequestInfo request_info; diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.h b/src/Common/ZooKeeper/ZooKeeperImpl.h index 944c5032fac..44ea993947e 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.h +++ b/src/Common/ZooKeeper/ZooKeeperImpl.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -181,7 +182,7 @@ public: const Requests & requests, MultiCallback callback) override; - DB::KeeperApiVersion getApiVersion() const override; + bool isFeatureEnabled(KeeperFeatureFlag feature_flag) const override; /// Without forcefully invalidating (finalizing) ZooKeeper session before /// establishing a new one, there was a possibility that server is using @@ -201,6 +202,8 @@ public: void setServerCompletelyStarted(); + const KeeperFeatureFlags * getKeeperFeatureFlags() const override { return &keeper_feature_flags; } + private: ACLs default_acls; Poco::Net::SocketAddress connected_zk_address; @@ -312,12 +315,12 @@ private: void logOperationIfNeeded(const ZooKeeperRequestPtr & request, const ZooKeeperResponsePtr & response = nullptr, bool finalize = false, UInt64 elapsed_ms = 0); - void initApiVersion(); + void initFeatureFlags(); CurrentMetrics::Increment active_session_metric_increment{CurrentMetrics::ZooKeeperSession}; std::shared_ptr zk_log; - DB::KeeperApiVersion keeper_api_version{DB::KeeperApiVersion::ZOOKEEPER_COMPATIBLE}; + DB::KeeperFeatureFlags keeper_feature_flags; }; } diff --git a/src/Common/ZooKeeper/ZooKeeperWithFaultInjection.h b/src/Common/ZooKeeper/ZooKeeperWithFaultInjection.h index bf99cb76798..9d02d674010 100644 --- a/src/Common/ZooKeeper/ZooKeeperWithFaultInjection.h +++ b/src/Common/ZooKeeper/ZooKeeperWithFaultInjection.h @@ -402,9 +402,9 @@ public: ephemeral_nodes.clear(); } - KeeperApiVersion getApiVersion() const + bool isFeatureEnabled(KeeperFeatureFlag feature_flag) const { - return keeper->getApiVersion(); + return keeper->isFeatureEnabled(feature_flag); } private: diff --git a/src/Common/assert_cast.h b/src/Common/assert_cast.h index 604cfaed6e2..0b73ba1cc12 100644 --- a/src/Common/assert_cast.h +++ b/src/Common/assert_cast.h @@ -23,7 +23,7 @@ namespace DB * The exact match of the type is checked. That is, cast to the ancestor will be unsuccessful. */ template -To assert_cast(From && from) +inline To assert_cast(From && from) { #ifndef NDEBUG try diff --git a/src/Common/checkStackSize.cpp b/src/Common/checkStackSize.cpp index 67d163938b4..8847d37df3a 100644 --- a/src/Common/checkStackSize.cpp +++ b/src/Common/checkStackSize.cpp @@ -27,7 +27,7 @@ static thread_local size_t max_stack_size = 0; * @param out_address - if not nullptr, here the address of the stack will be written. * @return stack size */ -size_t getStackSize(void ** out_address) +static size_t getStackSize(void ** out_address) { using namespace DB; @@ -54,7 +54,15 @@ size_t getStackSize(void ** out_address) throwFromErrno("Cannot pthread_attr_get_np", ErrorCodes::CANNOT_PTHREAD_ATTR); # else if (0 != pthread_getattr_np(pthread_self(), &attr)) - throwFromErrno("Cannot pthread_getattr_np", ErrorCodes::CANNOT_PTHREAD_ATTR); + { + if (errno == ENOENT) + { + /// Most likely procfs is not mounted. + return 0; + } + else + throwFromErrno("Cannot pthread_getattr_np", ErrorCodes::CANNOT_PTHREAD_ATTR); + } # endif SCOPE_EXIT({ pthread_attr_destroy(&attr); }); @@ -83,6 +91,10 @@ __attribute__((__weak__)) void checkStackSize() if (!stack_address) max_stack_size = getStackSize(&stack_address); + /// The check is impossible. + if (!max_stack_size) + return; + const void * frame_address = __builtin_frame_address(0); uintptr_t int_frame_address = reinterpret_cast(frame_address); uintptr_t int_stack_address = reinterpret_cast(stack_address); diff --git a/src/Common/getHashOfLoadedBinary.cpp b/src/Common/getHashOfLoadedBinary.cpp index cc0ad0d2143..b81300b8536 100644 --- a/src/Common/getHashOfLoadedBinary.cpp +++ b/src/Common/getHashOfLoadedBinary.cpp @@ -37,7 +37,7 @@ SipHash getHashOfLoadedBinary() std::string getHashOfLoadedBinaryHex() { SipHash hash = getHashOfLoadedBinary(); - std::array checksum; + UInt128 checksum; hash.get128(checksum); return getHexUIntUppercase(checksum); } diff --git a/src/Common/getNumberOfPhysicalCPUCores.cpp b/src/Common/getNumberOfPhysicalCPUCores.cpp index ed82c59140d..8fbb32e911f 100644 --- a/src/Common/getNumberOfPhysicalCPUCores.cpp +++ b/src/Common/getNumberOfPhysicalCPUCores.cpp @@ -1,4 +1,5 @@ #include "getNumberOfPhysicalCPUCores.h" +#include #include "config.h" #if defined(OS_LINUX) @@ -7,6 +8,8 @@ #endif #include +#include +#include #include #include @@ -15,7 +18,7 @@ namespace { #if defined(OS_LINUX) -int32_t readFrom(const char * filename, int default_value) +int32_t readFrom(const std::filesystem::path & filename, int default_value) { std::ifstream infile(filename); if (!infile.is_open()) @@ -31,10 +34,87 @@ int32_t readFrom(const char * filename, int default_value) uint32_t getCGroupLimitedCPUCores(unsigned default_cpu_count) { uint32_t quota_count = default_cpu_count; + std::filesystem::path prefix = "/sys/fs/cgroup"; + /// cgroupsv2 + std::ifstream contr_file(prefix / "cgroup.controllers"); + if (contr_file.is_open()) + { + /// First, we identify the cgroup the process belongs + std::ifstream cgroup_name_file("/proc/self/cgroup"); + if (!cgroup_name_file.is_open()) + return default_cpu_count; + + // cgroup_name_file always starts with '0::/' for v2 + cgroup_name_file.ignore(4); + std::string cgroup_name; + cgroup_name_file >> cgroup_name; + + std::filesystem::path current_cgroup; + if (cgroup_name.empty()) + current_cgroup = prefix; + else + current_cgroup = prefix / cgroup_name; + + // Looking for cpu.max in directories from the current cgroup to the top level + // It does not stop on the first time since the child could have a greater value than parent + while (current_cgroup != prefix.parent_path()) + { + std::ifstream cpu_max_file(current_cgroup / "cpu.max"); + current_cgroup = current_cgroup.parent_path(); + if (cpu_max_file.is_open()) + { + std::string cpu_limit_str; + float cpu_period; + cpu_max_file >> cpu_limit_str >> cpu_period; + if (cpu_limit_str != "max" && cpu_period != 0) + { + float cpu_limit = std::stof(cpu_limit_str); + quota_count = std::min(static_cast(ceil(cpu_limit / cpu_period)), quota_count); + } + } + } + current_cgroup = prefix / cgroup_name; + // Looking for cpuset.cpus.effective in directories from the current cgroup to the top level + while (current_cgroup != prefix.parent_path()) + { + std::ifstream cpuset_cpus_file(current_cgroup / "cpuset.cpus.effective"); + current_cgroup = current_cgroup.parent_path(); + if (cpuset_cpus_file.is_open()) + { + // The line in the file is "0,2-4,6,9-14" cpu numbers + // It's always grouped and ordered + std::vector cpu_ranges; + std::string cpuset_line; + cpuset_cpus_file >> cpuset_line; + if (cpuset_line.empty()) + continue; + boost::split(cpu_ranges, cpuset_line, boost::is_any_of(",")); + uint32_t cpus_count = 0; + for (const std::string& cpu_number_or_range : cpu_ranges) + { + std::vector cpu_range; + boost::split(cpu_range, cpu_number_or_range, boost::is_any_of("-")); + + if (cpu_range.size() == 2) + { + int start = std::stoi(cpu_range[0]); + int end = std::stoi(cpu_range[1]); + cpus_count += (end - start) + 1; + } + else + cpus_count++; + } + quota_count = std::min(cpus_count, quota_count); + break; + } + } + return quota_count; + } + /// cgroupsv1 /// Return the number of milliseconds per period process is guaranteed to run. /// -1 for no quota - int cgroup_quota = readFrom("/sys/fs/cgroup/cpu/cpu.cfs_quota_us", -1); - int cgroup_period = readFrom("/sys/fs/cgroup/cpu/cpu.cfs_period_us", -1); + int cgroup_quota = readFrom(prefix / "cpu/cpu.cfs_quota_us", -1); + int cgroup_period = readFrom(prefix / "cpu/cpu.cfs_period_us", -1); if (cgroup_quota > -1 && cgroup_period > 0) quota_count = static_cast(ceil(static_cast(cgroup_quota) / static_cast(cgroup_period))); diff --git a/src/Common/parseGlobs.cpp b/src/Common/parseGlobs.cpp index f8d331c2b76..33747f6eece 100644 --- a/src/Common/parseGlobs.cpp +++ b/src/Common/parseGlobs.cpp @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -33,14 +32,14 @@ std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_glob std::string escaped_with_globs = buf_for_escaping.str(); static const re2::RE2 enum_or_range(R"({([\d]+\.\.[\d]+|[^{}*,]+,[^{}*]*[^{}*,])})"); /// regexp for {expr1,expr2,expr3} or {M..N}, where M and N - non-negative integers, expr's should be without "{", "}", "*" and "," - re2::StringPiece input(escaped_with_globs); - re2::StringPiece matched; + std::string_view input(escaped_with_globs); + std::string_view matched; std::ostringstream oss_for_replacing; // STYLE_CHECK_ALLOW_STD_STRING_STREAM oss_for_replacing.exceptions(std::ios::failbit); size_t current_index = 0; while (RE2::FindAndConsume(&input, enum_or_range, &matched)) { - std::string buffer = matched.ToString(); + std::string buffer(matched); oss_for_replacing << escaped_with_globs.substr(current_index, matched.data() - escaped_with_globs.data() - current_index - 1) << '('; if (buffer.find(',') == std::string::npos) diff --git a/src/Compression/CompressedReadBufferBase.cpp b/src/Compression/CompressedReadBufferBase.cpp index 278210d770a..dd19955d010 100644 --- a/src/Compression/CompressedReadBufferBase.cpp +++ b/src/Compression/CompressedReadBufferBase.cpp @@ -49,8 +49,8 @@ static void validateChecksum(char * data, size_t size, const Checksum expected_c /// TODO mess up of endianness in error message. message << "Checksum doesn't match: corrupted data." - " Reference: " + getHexUIntLowercase(expected_checksum.first) + getHexUIntLowercase(expected_checksum.second) - + ". Actual: " + getHexUIntLowercase(calculated_checksum.first) + getHexUIntLowercase(calculated_checksum.second) + " Reference: " + getHexUIntLowercase(expected_checksum) + + ". Actual: " + getHexUIntLowercase(calculated_checksum) + ". Size of compressed block: " + toString(size); const char * message_hardware_failure = "This is most likely due to hardware failure. " @@ -95,8 +95,8 @@ static void validateChecksum(char * data, size_t size, const Checksum expected_c } /// Check if the difference caused by single bit flip in stored checksum. - size_t difference = std::popcount(expected_checksum.first ^ calculated_checksum.first) - + std::popcount(expected_checksum.second ^ calculated_checksum.second); + size_t difference = std::popcount(expected_checksum.low64 ^ calculated_checksum.low64) + + std::popcount(expected_checksum.high64 ^ calculated_checksum.high64); if (difference == 1) { @@ -194,8 +194,8 @@ size_t CompressedReadBufferBase::readCompressedData(size_t & size_decompressed, { Checksum checksum; ReadBufferFromMemory checksum_in(own_compressed_buffer.data(), sizeof(checksum)); - readBinaryLittleEndian(checksum.first, checksum_in); - readBinaryLittleEndian(checksum.second, checksum_in); + readBinaryLittleEndian(checksum.low64, checksum_in); + readBinaryLittleEndian(checksum.high64, checksum_in); validateChecksum(compressed_buffer, size_compressed_without_checksum, checksum); } @@ -238,8 +238,8 @@ size_t CompressedReadBufferBase::readCompressedDataBlockForAsynchronous(size_t & { Checksum checksum; ReadBufferFromMemory checksum_in(own_compressed_buffer.data(), sizeof(checksum)); - readBinaryLittleEndian(checksum.first, checksum_in); - readBinaryLittleEndian(checksum.second, checksum_in); + readBinaryLittleEndian(checksum.low64, checksum_in); + readBinaryLittleEndian(checksum.high64, checksum_in); validateChecksum(compressed_buffer, size_compressed_without_checksum, checksum); } diff --git a/src/Compression/CompressedWriteBuffer.cpp b/src/Compression/CompressedWriteBuffer.cpp index cb2ee1140d0..f16330332ab 100644 --- a/src/Compression/CompressedWriteBuffer.cpp +++ b/src/Compression/CompressedWriteBuffer.cpp @@ -38,8 +38,8 @@ void CompressedWriteBuffer::nextImpl() CityHash_v1_0_2::uint128 checksum = CityHash_v1_0_2::CityHash128(out_compressed_ptr, compressed_size); - writeBinaryLittleEndian(checksum.first, out); - writeBinaryLittleEndian(checksum.second, out); + writeBinaryLittleEndian(checksum.low64, out); + writeBinaryLittleEndian(checksum.high64, out); out.position() += compressed_size; } @@ -50,8 +50,8 @@ void CompressedWriteBuffer::nextImpl() CityHash_v1_0_2::uint128 checksum = CityHash_v1_0_2::CityHash128(compressed_buffer.data(), compressed_size); - writeBinaryLittleEndian(checksum.first, out); - writeBinaryLittleEndian(checksum.second, out); + writeBinaryLittleEndian(checksum.low64, out); + writeBinaryLittleEndian(checksum.high64, out); out.write(compressed_buffer.data(), compressed_size); } diff --git a/src/Compression/CompressionCodecLZ4.cpp b/src/Compression/CompressionCodecLZ4.cpp index a39052f80b7..3dbb6be9a99 100644 --- a/src/Compression/CompressionCodecLZ4.cpp +++ b/src/Compression/CompressionCodecLZ4.cpp @@ -42,7 +42,6 @@ private: UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override; mutable LZ4::PerformanceStatistics lz4_stat; - ASTPtr codec_desc; }; diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index c0dfbc2cbc3..94062140bac 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -1,18 +1,19 @@ #include #include +#include #include #include +#include #include #include +#include #include #include #include -#include #include #include +#include #include -#include -#include #include @@ -24,20 +25,41 @@ namespace ErrorCodes extern const int CHECKSUM_DOESNT_MATCH; extern const int CORRUPTED_DATA; extern const int UNKNOWN_FORMAT_VERSION; + extern const int NOT_IMPLEMENTED; + extern const int BAD_ARGUMENTS; extern const int LOGICAL_ERROR; } namespace { +constexpr std::string_view tmp_prefix = "tmp_"; + +void moveFileBetweenDisks(DiskPtr disk_from, ChangelogFileDescriptionPtr description, DiskPtr disk_to, const std::string & path_to) +{ + /// we use empty file with prefix tmp_ to detect incomplete copies + /// if a copy is complete we don't care from which disk we use the same file + /// so it's okay if a failure happens after removing of tmp file but before we remove + /// the changelog from the source disk + auto from_path = fs::path(description->path); + auto tmp_changelog_name = from_path.parent_path() / (std::string{tmp_prefix} + from_path.filename().string()); + { + auto buf = disk_to->writeFile(tmp_changelog_name); + buf->finalize(); + } + disk_from->copyFile(from_path, *disk_to, path_to, {}); + disk_to->removeFile(tmp_changelog_name); + disk_from->removeFile(description->path); + description->path = path_to; + description->disk = disk_to; +} + constexpr auto DEFAULT_PREFIX = "changelog"; -std::string formatChangelogPath( - const std::string & prefix, const std::string & name_prefix, uint64_t from_index, uint64_t to_index, const std::string & extension) +inline std::string +formatChangelogPath(const std::string & name_prefix, uint64_t from_index, uint64_t to_index, const std::string & extension) { - std::filesystem::path path(prefix); - path /= std::filesystem::path(fmt::format("{}_{}_{}.{}", name_prefix, from_index, to_index, extension)); - return path; + return fmt::format("{}_{}_{}.{}", name_prefix, from_index, to_index, extension); } ChangelogFileDescriptionPtr getChangelogFileDescription(const std::filesystem::path & path) @@ -89,17 +111,19 @@ class ChangelogWriter public: ChangelogWriter( std::map & existing_changelogs_, - const std::filesystem::path & changelogs_dir_, + KeeperContextPtr keeper_context_, LogFileSettings log_file_settings_) : existing_changelogs(existing_changelogs_) , log_file_settings(log_file_settings_) - , changelogs_dir(changelogs_dir_) + , keeper_context(std::move(keeper_context_)) , log(&Poco::Logger::get("Changelog")) { } void setFile(ChangelogFileDescriptionPtr file_description, WriteMode mode) { + auto disk = getDisk(); + try { if (mode == WriteMode::Append && file_description->expectedEntriesCountInLog() != log_file_settings.rotate_interval) @@ -110,7 +134,7 @@ public: file_description->expectedEntriesCountInLog()); // we have a file we need to finalize first - if (tryGetFileBuffer() && prealloc_done) + if (tryGetFileBaseBuffer() && prealloc_done) { finalizeCurrentFile(); @@ -118,27 +142,55 @@ public: // if we wrote at least 1 log in the log file we can rename the file to reflect correctly the // contained logs // file can be deleted from disk earlier by compaction - if (!current_file_description->deleted && last_index_written - && *last_index_written != current_file_description->to_log_index) + if (!current_file_description->deleted) { - auto new_path = formatChangelogPath( - changelogs_dir, - current_file_description->prefix, - current_file_description->from_log_index, - *last_index_written, - current_file_description->extension); - std::filesystem::rename(current_file_description->path, new_path); - current_file_description->path = std::move(new_path); + auto log_disk = current_file_description->disk; + const auto & path = current_file_description->path; + std::string new_path = path; + if (last_index_written && *last_index_written != current_file_description->to_log_index) + { + new_path = formatChangelogPath( + current_file_description->prefix, + current_file_description->from_log_index, + *last_index_written, + current_file_description->extension); + } + + if (disk == log_disk) + { + if (path != new_path) + { + try + { + disk->moveFile(path, new_path); + } + catch (...) + { + tryLogCurrentException(log, fmt::format("File rename failed on disk {}", disk->getName())); + } + current_file_description->path = std::move(new_path); + } + } + else + { + moveFileBetweenDisks(log_disk, current_file_description, disk, new_path); + } } } - file_buf = std::make_unique( - file_description->path, DBMS_DEFAULT_BUFFER_SIZE, mode == WriteMode::Rewrite ? -1 : (O_APPEND | O_CREAT | O_WRONLY)); + auto latest_log_disk = getLatestLogDisk(); + assert(file_description->disk == latest_log_disk); + file_buf = latest_log_disk->writeFile(file_description->path, DBMS_DEFAULT_BUFFER_SIZE, mode); + assert(file_buf); last_index_written.reset(); current_file_description = std::move(file_description); if (log_file_settings.compress_logs) - compressed_buffer = std::make_unique(std::move(file_buf), /* compression level = */ 3, /* append_to_existing_file_ = */ mode == WriteMode::Append); + compressed_buffer = std::make_unique( + std::move(file_buf), + /* compressi)on level = */ 3, + /* append_to_existing_file_ = */ mode == WriteMode::Append, + [latest_log_disk, path = current_file_description->path] { return latest_log_disk->readFile(path); }); prealloc_done = false; } @@ -149,12 +201,12 @@ public: } } - bool isFileSet() const { return tryGetFileBuffer() != nullptr; } - + /// There is bug when compressed_buffer has value, file_buf's ownership transfer to compressed_buffer + bool isFileSet() const { return compressed_buffer != nullptr || file_buf != nullptr; } bool appendRecord(ChangelogRecord && record) { - const auto * file_buffer = tryGetFileBuffer(); + const auto * file_buffer = tryGetFileBaseBuffer(); assert(file_buffer && current_file_description); assert(record.header.index - getStartIndex() <= current_file_description->expectedEntriesCountInLog()); @@ -211,7 +263,7 @@ public: void flush() { - auto * file_buffer = tryGetFileBuffer(); + auto * file_buffer = tryGetFileBaseBuffer(); if (file_buffer) { /// Fsync file system if needed @@ -236,12 +288,12 @@ public: new_description->from_log_index = new_start_log_index; new_description->to_log_index = new_start_log_index + log_file_settings.rotate_interval - 1; new_description->extension = "bin"; + new_description->disk = getLatestLogDisk(); if (log_file_settings.compress_logs) new_description->extension += "." + toContentEncodingName(CompressionMethod::Zstd); new_description->path = formatChangelogPath( - changelogs_dir, new_description->prefix, new_start_log_index, new_start_log_index + log_file_settings.rotate_interval - 1, @@ -260,17 +312,15 @@ public: } private: - void finalizeCurrentFile() { - const auto * file_buffer = tryGetFileBuffer(); - assert(file_buffer && prealloc_done); + assert(prealloc_done); assert(current_file_description); // compact can delete the file and we don't need to do anything if (current_file_description->deleted) { - LOG_WARNING(log, "Log {} is already deleted", file_buffer->getFileName()); + LOG_WARNING(log, "Log {} is already deleted", current_file_description->path); return; } @@ -279,27 +329,36 @@ private: flush(); - if (log_file_settings.max_size != 0) + const auto * file_buffer = tryGetFileBuffer(); + + if (log_file_settings.max_size != 0 && file_buffer) { int res = -1; do { res = ftruncate(file_buffer->getFD(), initial_file_size + file_buffer->count()); - } - while (res < 0 && errno == EINTR); + } while (res < 0 && errno == EINTR); if (res != 0) LOG_WARNING(log, "Could not ftruncate file. Error: {}, errno: {}", errnoToString(), errno); } if (log_file_settings.compress_logs) + { compressed_buffer.reset(); + } else + { + chassert(file_buf); + file_buf->finalize(); file_buf.reset(); + } } WriteBuffer & getBuffer() { + /// TODO: unify compressed_buffer and file_buf, + /// compressed_buffer can use its NestedBuffer directly if compress_logs=false if (compressed_buffer) return *compressed_buffer; @@ -319,38 +378,42 @@ private: return *file_buffer; } - const WriteBufferFromFile * tryGetFileBuffer() const - { - return const_cast(this)->tryGetFileBuffer(); - } + const WriteBufferFromFile * tryGetFileBuffer() const { return const_cast(this)->tryGetFileBuffer(); } WriteBufferFromFile * tryGetFileBuffer() { if (compressed_buffer) return dynamic_cast(compressed_buffer->getNestedBuffer()); - if (file_buf) - return file_buf.get(); + return dynamic_cast(file_buf.get()); + } - return nullptr; + WriteBufferFromFileBase * tryGetFileBaseBuffer() + { + if (compressed_buffer) + return dynamic_cast(compressed_buffer->getNestedBuffer()); + + return file_buf.get(); } void tryPreallocateForFile() { - if (log_file_settings.max_size == 0) + const auto * file_buffer = tryGetFileBuffer(); + + if (log_file_settings.max_size == 0 || !file_buffer) { initial_file_size = 0; prealloc_done = true; return; } - const auto & file_buffer = getFileBuffer(); #ifdef OS_LINUX { int res = -1; do { - res = fallocate(file_buffer.getFD(), FALLOC_FL_KEEP_SIZE, 0, log_file_settings.max_size + log_file_settings.overallocate_size); + res = fallocate( + file_buffer->getFD(), FALLOC_FL_KEEP_SIZE, 0, log_file_settings.max_size + log_file_settings.overallocate_size); } while (res < 0 && errno == EINTR); if (res != 0) @@ -365,15 +428,21 @@ private: } } #endif - initial_file_size = getSizeFromFileDescriptor(file_buffer.getFD()); + initial_file_size = getSizeFromFileDescriptor(file_buffer->getFD()); prealloc_done = true; } + DiskPtr getLatestLogDisk() const { return keeper_context->getLatestLogDisk(); } + + DiskPtr getDisk() const { return keeper_context->getLogDisk(); } + + bool isLocalDisk() const { return dynamic_cast(getDisk().get()) != nullptr; } + std::map & existing_changelogs; ChangelogFileDescriptionPtr current_file_description{nullptr}; - std::unique_ptr file_buf; + std::unique_ptr file_buf; std::optional last_index_written; size_t initial_file_size{0}; @@ -383,7 +452,7 @@ private: LogFileSettings log_file_settings; - const std::filesystem::path changelogs_dir; + KeeperContextPtr keeper_context; Poco::Logger * const log; }; @@ -413,10 +482,10 @@ struct ChangelogReadResult class ChangelogReader { public: - explicit ChangelogReader(const std::string & filepath_) : filepath(filepath_) + explicit ChangelogReader(DiskPtr disk_, const std::string & filepath_) : disk(disk_), filepath(filepath_) { auto compression_method = chooseCompressionMethod(filepath, ""); - auto read_buffer_from_file = std::make_unique(filepath); + auto read_buffer_from_file = disk->readFile(filepath); read_buf = wrapReadBufferWithCompressionMethod(std::move(read_buffer_from_file), compression_method); } @@ -512,37 +581,103 @@ public: } private: + DiskPtr disk; std::string filepath; std::unique_ptr read_buf; }; -Changelog::Changelog( - const std::string & changelogs_dir_, - Poco::Logger * log_, - LogFileSettings log_file_settings) - : changelogs_dir(changelogs_dir_) - , changelogs_detached_dir(changelogs_dir / "detached") +Changelog::Changelog(Poco::Logger * log_, LogFileSettings log_file_settings, KeeperContextPtr keeper_context_) + : changelogs_detached_dir("detached") , rotate_interval(log_file_settings.rotate_interval) , log(log_) , write_operations(std::numeric_limits::max()) , append_completion_queue(std::numeric_limits::max()) + , keeper_context(std::move(keeper_context_)) { - /// Load all files in changelog directory - namespace fs = std::filesystem; - if (!fs::exists(changelogs_dir)) - fs::create_directories(changelogs_dir); - - for (const auto & p : fs::directory_iterator(changelogs_dir)) + if (auto latest_log_disk = getLatestLogDisk(); + log_file_settings.force_sync && dynamic_cast(latest_log_disk.get()) == nullptr) { - if (p == changelogs_detached_dir) - continue; - - auto file_description = getChangelogFileDescription(p.path()); - existing_changelogs[file_description->from_log_index] = std::move(file_description); + throw DB::Exception( + DB::ErrorCodes::BAD_ARGUMENTS, + "force_sync is set to true for logs but disk '{}' cannot satisfy such guarantee because it's not of type DiskLocal.\n" + "If you want to use force_sync and same disk for all logs, please set keeper_server.log_storage_disk to a local disk.\n" + "If you want to use force_sync and different disk only for old logs, please set 'keeper_server.log_storage_disk' to any " + "supported disk and 'keeper_server.latest_log_storage_disk' to a local disk.\n" + "Otherwise, disable force_sync", + latest_log_disk->getName()); } + /// Load all files on changelog disks + + const auto load_from_disk = [&](const auto & disk) + { + LOG_TRACE(log, "Reading from disk {}", disk->getName()); + std::unordered_map incomplete_files; + + const auto clean_incomplete_file = [&](const auto & file_path) + { + if (auto incomplete_it = incomplete_files.find(fs::path(file_path).filename()); incomplete_it != incomplete_files.end()) + { + LOG_TRACE(log, "Removing {} from {}", file_path, disk->getName()); + disk->removeFile(file_path); + disk->removeFile(incomplete_it->second); + incomplete_files.erase(incomplete_it); + return true; + } + + return false; + }; + + std::vector changelog_files; + for (auto it = disk->iterateDirectory(""); it->isValid(); it->next()) + { + if (it->name() == changelogs_detached_dir) + continue; + + if (it->name().starts_with(tmp_prefix)) + { + incomplete_files.emplace(it->name().substr(tmp_prefix.size()), it->path()); + continue; + } + + if (clean_incomplete_file(it->path())) + continue; + + changelog_files.push_back(it->path()); + } + + for (const auto & changelog_file : changelog_files) + { + if (clean_incomplete_file(fs::path(changelog_file).filename())) + continue; + + auto file_description = getChangelogFileDescription(changelog_file); + file_description->disk = disk; + + LOG_TRACE(log, "Found {} on {}", changelog_file, disk->getName()); + auto [changelog_it, inserted] = existing_changelogs.insert_or_assign(file_description->from_log_index, std::move(file_description)); + + if (!inserted) + LOG_WARNING(log, "Found duplicate entries for {}, will use the entry from {}", changelog_it->second->path, disk->getName()); + } + + for (const auto & [name, path] : incomplete_files) + disk->removeFile(path); + }; + + /// Load all files from old disks + for (const auto & disk : keeper_context->getOldLogDisks()) + load_from_disk(disk); + + auto disk = getDisk(); + load_from_disk(disk); + + auto latest_log_disk = getLatestLogDisk(); + if (disk != latest_log_disk) + load_from_disk(latest_log_disk); + if (existing_changelogs.empty()) - LOG_WARNING(log, "No logs exists in {}. It's Ok if it's the first run of clickhouse-keeper.", changelogs_dir.generic_string()); + LOG_WARNING(log, "No logs exists in {}. It's Ok if it's the first run of clickhouse-keeper.", disk->getPath()); clean_log_thread = ThreadFromGlobalPool([this] { cleanLogThread(); }); @@ -550,8 +685,7 @@ Changelog::Changelog( append_completion_thread = ThreadFromGlobalPool([this] { appendCompletionThread(); }); - current_writer = std::make_unique( - existing_changelogs, changelogs_dir, log_file_settings); + current_writer = std::make_unique(existing_changelogs, keeper_context, log_file_settings); } void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uint64_t logs_to_keep) @@ -623,7 +757,7 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin break; } - ChangelogReader reader(changelog_description.path); + ChangelogReader reader(changelog_description.disk, changelog_description.path); last_log_read_result = reader.readChangelog(logs, start_to_read_from, log); last_log_read_result->log_start_index = changelog_description.from_log_index; @@ -684,13 +818,13 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin assert(existing_changelogs.find(last_log_read_result->log_start_index) != existing_changelogs.end()); assert(existing_changelogs.find(last_log_read_result->log_start_index)->first == existing_changelogs.rbegin()->first); - /// Continue to write into incomplete existing log if it doesn't finished with error + /// Continue to write into incomplete existing log if it didn't finish with error const auto & description = existing_changelogs[last_log_read_result->log_start_index]; if (last_log_read_result->last_read_index == 0 || last_log_read_result->error) /// If it's broken log then remove it { LOG_INFO(log, "Removing chagelog {} because it's empty or read finished with error", description->path); - std::filesystem::remove(description->path); + description->disk->removeFile(description->path); existing_changelogs.erase(last_log_read_result->log_start_index); std::erase_if(logs, [last_log_read_result](const auto & item) { return item.first >= last_log_read_result->log_start_index; }); } @@ -699,55 +833,124 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin initWriter(description); } } + else if (last_log_read_result.has_value()) + { + /// check if we need to move completed log to another disk + auto latest_log_disk = getLatestLogDisk(); + auto disk = getDisk(); + + auto & description = existing_changelogs.at(last_log_read_result->log_start_index); + if (latest_log_disk != disk && latest_log_disk == description->disk) + moveFileBetweenDisks(latest_log_disk, description, disk, description->path); + } /// Start new log if we don't initialize writer from previous log. All logs can be "complete". if (!current_writer->isFileSet()) current_writer->rotate(max_log_id + 1); + /// Move files to correct disks + auto latest_start_index = current_writer->getStartIndex(); + auto latest_log_disk = getLatestLogDisk(); + auto disk = getDisk(); + for (const auto & [start_index, description] : existing_changelogs) + { + /// latest log should already be on latest_log_disk + if (start_index == latest_start_index) + { + chassert(description->disk == latest_log_disk); + continue; + } + + if (description->disk != disk) + moveFileBetweenDisks(description->disk, description, disk, description->path); + } + + initialized = true; } void Changelog::initWriter(ChangelogFileDescriptionPtr description) { + if (description->expectedEntriesCountInLog() != rotate_interval) + LOG_TRACE( + log, + "Looks like rotate_logs_interval was changed, current {}, expected entries in last log {}", + rotate_interval, + description->expectedEntriesCountInLog()); + LOG_TRACE(log, "Continue to write into {}", description->path); + + auto log_disk = description->disk; + auto latest_log_disk = getLatestLogDisk(); + if (log_disk != latest_log_disk) + moveFileBetweenDisks(log_disk, description, latest_log_disk, description->path); + current_writer->setFile(std::move(description), WriteMode::Append); } namespace { -std::string getCurrentTimestampFolder() -{ - const auto timestamp = LocalDateTime{std::time(nullptr)}; - return fmt::format( - "{:02}{:02}{:02}T{:02}{:02}{:02}", - timestamp.year(), - timestamp.month(), - timestamp.day(), - timestamp.hour(), - timestamp.minute(), - timestamp.second()); + std::string getCurrentTimestampFolder() + { + const auto timestamp = LocalDateTime{std::time(nullptr)}; + return fmt::format( + "{:02}{:02}{:02}T{:02}{:02}{:02}", + timestamp.year(), + timestamp.month(), + timestamp.day(), + timestamp.hour(), + timestamp.minute(), + timestamp.second()); + } + } +DiskPtr Changelog::getDisk() const +{ + return keeper_context->getLogDisk(); +} + +DiskPtr Changelog::getLatestLogDisk() const +{ + return keeper_context->getLatestLogDisk(); } void Changelog::removeExistingLogs(ChangelogIter begin, ChangelogIter end) { - const auto timestamp_folder = changelogs_detached_dir / getCurrentTimestampFolder(); + auto disk = getDisk(); + + const auto timestamp_folder = (fs::path(changelogs_detached_dir) / getCurrentTimestampFolder()).generic_string(); for (auto itr = begin; itr != end;) { - if (!std::filesystem::exists(timestamp_folder)) + if (!disk->exists(timestamp_folder)) { - LOG_WARNING(log, "Moving broken logs to {}", timestamp_folder.generic_string()); - std::filesystem::create_directories(timestamp_folder); + LOG_WARNING(log, "Moving broken logs to {}", timestamp_folder); + disk->createDirectories(timestamp_folder); } LOG_WARNING(log, "Removing changelog {}", itr->second->path); const std::filesystem::path & path = itr->second->path; const auto new_path = timestamp_folder / path.filename(); - std::filesystem::rename(path, new_path); + + auto changelog_disk = itr->second->disk; + if (changelog_disk == disk) + { + try + { + disk->moveFile(path.generic_string(), new_path.generic_string()); + } + catch (const DB::Exception & e) + { + if (e.code() == DB::ErrorCodes::NOT_IMPLEMENTED) + moveFileBetweenDisks(changelog_disk, itr->second, disk, new_path); + } + } + else + moveFileBetweenDisks(changelog_disk, itr->second, disk, new_path); + itr = existing_changelogs.erase(itr); } } @@ -882,7 +1085,6 @@ void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry) throw Exception(ErrorCodes::LOGICAL_ERROR, "Changelog must be initialized before writing records"); { - std::lock_guard lock(writer_mutex); /// This write_at require to overwrite everything in this file and also in previous file(s) const bool go_to_previous_file = index < current_writer->getStartIndex(); @@ -898,13 +1100,18 @@ void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry) else description = std::prev(index_changelog)->second; + auto log_disk = description->disk; + auto latest_log_disk = getLatestLogDisk(); + if (log_disk != latest_log_disk) + moveFileBetweenDisks(log_disk, description, latest_log_disk, description->path); + current_writer->setFile(std::move(description), WriteMode::Append); /// Remove all subsequent files if overwritten something in previous one auto to_remove_itr = existing_changelogs.upper_bound(index); for (auto itr = to_remove_itr; itr != existing_changelogs.end();) { - std::filesystem::remove(itr->second->path); + itr->second->disk->removeFile(itr->second->path); itr = existing_changelogs.erase(itr); } } @@ -954,14 +1161,22 @@ void Changelog::compact(uint64_t up_to_log_index) LOG_INFO(log, "Removing changelog {} because of compaction", changelog_description.path); /// If failed to push to queue for background removing, then we will remove it now - if (!log_files_to_delete_queue.tryPush(changelog_description.path, 1)) + if (!log_files_to_delete_queue.tryPush({changelog_description.path, changelog_description.disk}, 1)) { - std::error_code ec; - std::filesystem::remove(changelog_description.path, ec); - if (ec) - LOG_WARNING(log, "Failed to remove changelog {} in compaction, error message: {}", changelog_description.path, ec.message()); - else - LOG_INFO(log, "Removed changelog {} because of compaction", changelog_description.path); + try + { + changelog_description.disk->removeFile(changelog_description.path); + LOG_INFO(log, "Removed changelog {} because of compaction.", changelog_description.path); + } + catch (Exception & e) + { + LOG_WARNING( + log, "Failed to remove changelog {} in compaction, error message: {}", changelog_description.path, e.message()); + } + catch (...) + { + tryLogCurrentException(log); + } } changelog_description.deleted = true; @@ -1151,14 +1366,23 @@ Changelog::~Changelog() void Changelog::cleanLogThread() { - std::string path; - while (log_files_to_delete_queue.pop(path)) + std::pair path_with_disk; + while (log_files_to_delete_queue.pop(path_with_disk)) { - std::error_code ec; - if (std::filesystem::remove(path, ec)) + const auto & [path, disk] = path_with_disk; + try + { + disk->removeFile(path); LOG_INFO(log, "Removed changelog {} because of compaction.", path); - else - LOG_WARNING(log, "Failed to remove changelog {} in compaction, error message: {}", path, ec.message()); + } + catch (Exception & e) + { + LOG_WARNING(log, "Failed to remove changelog {} in compaction, error message: {}", path, e.message()); + } + catch (...) + { + tryLogCurrentException(log); + } } } diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 3c09370182d..9789a6b03dd 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -11,6 +11,7 @@ #include #include #include +#include namespace DB { @@ -59,6 +60,7 @@ struct ChangelogFileDescription uint64_t to_log_index; std::string extension; + DiskPtr disk; std::string path; bool deleted = false; @@ -87,9 +89,9 @@ class Changelog { public: Changelog( - const std::string & changelogs_dir_, Poco::Logger * log_, - LogFileSettings log_file_settings); + LogFileSettings log_file_settings, + KeeperContextPtr keeper_context_); Changelog(Changelog &&) = delete; @@ -152,6 +154,9 @@ private: /// Pack log_entry into changelog record static ChangelogRecord buildRecord(uint64_t index, const LogEntryPtr & log_entry); + DiskPtr getDisk() const; + DiskPtr getLatestLogDisk() const; + /// Currently existing changelogs std::map existing_changelogs; @@ -169,8 +174,7 @@ private: /// Clean useless log files in a background thread void cleanLogThread(); - const std::filesystem::path changelogs_dir; - const std::filesystem::path changelogs_detached_dir; + const String changelogs_detached_dir; const uint64_t rotate_interval; Poco::Logger * log; @@ -185,7 +189,7 @@ private: uint64_t max_log_id = 0; /// For compaction, queue of delete not used logs /// 128 is enough, even if log is not removed, it's not a problem - ConcurrentBoundedQueue log_files_to_delete_queue{128}; + ConcurrentBoundedQueue> log_files_to_delete_queue{128}; ThreadFromGlobalPool clean_log_thread; struct AppendLog @@ -223,6 +227,8 @@ private: nuraft::wptr raft_server; + KeeperContextPtr keeper_context; + bool initialized = false; }; diff --git a/src/Coordination/CoordinationSettings.cpp b/src/Coordination/CoordinationSettings.cpp index 7a66134f43f..2b2709baf8e 100644 --- a/src/Coordination/CoordinationSettings.cpp +++ b/src/Coordination/CoordinationSettings.cpp @@ -36,7 +36,7 @@ void CoordinationSettings::loadFromConfig(const String & config_elem, const Poco } -const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr,apiv,csnp,lgif,rqld,rclc,clrs"; +const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr,apiv,csnp,lgif,rqld,rclc,clrs,ftfl"; KeeperConfigurationAndSettings::KeeperConfigurationAndSettings() : server_id(NOT_EXIST) @@ -85,14 +85,6 @@ void KeeperConfigurationAndSettings::dump(WriteBufferFromOwnString & buf) const writeText(four_letter_word_allow_list, buf); buf.write('\n'); - writeText("log_storage_path=", buf); - writeText(log_storage_path, buf); - buf.write('\n'); - - writeText("snapshot_storage_path=", buf); - writeText(snapshot_storage_path, buf); - buf.write('\n'); - /// coordination_settings writeText("max_requests_batch_size=", buf); @@ -188,61 +180,9 @@ KeeperConfigurationAndSettings::loadFromConfig(const Poco::Util::AbstractConfigu DEFAULT_FOUR_LETTER_WORD_CMD)); - ret->log_storage_path = getLogsPathFromConfig(config, standalone_keeper_); - ret->snapshot_storage_path = getSnapshotsPathFromConfig(config, standalone_keeper_); - - ret->state_file_path = getStateFilePathFromConfig(config, standalone_keeper_); - ret->coordination_settings->loadFromConfig("keeper_server.coordination_settings", config); return ret; } -String KeeperConfigurationAndSettings::getLogsPathFromConfig(const Poco::Util::AbstractConfiguration & config, bool standalone_keeper_) -{ - /// the most specialized path - if (config.has("keeper_server.log_storage_path")) - return config.getString("keeper_server.log_storage_path"); - - if (config.has("keeper_server.storage_path")) - return std::filesystem::path{config.getString("keeper_server.storage_path")} / "logs"; - - if (standalone_keeper_) - return std::filesystem::path{config.getString("path", KEEPER_DEFAULT_PATH)} / "logs"; - else - return std::filesystem::path{config.getString("path", DBMS_DEFAULT_PATH)} / "coordination/logs"; -} - -String KeeperConfigurationAndSettings::getSnapshotsPathFromConfig(const Poco::Util::AbstractConfiguration & config, bool standalone_keeper_) -{ - /// the most specialized path - if (config.has("keeper_server.snapshot_storage_path")) - return config.getString("keeper_server.snapshot_storage_path"); - - if (config.has("keeper_server.storage_path")) - return std::filesystem::path{config.getString("keeper_server.storage_path")} / "snapshots"; - - if (standalone_keeper_) - return std::filesystem::path{config.getString("path", KEEPER_DEFAULT_PATH)} / "snapshots"; - else - return std::filesystem::path{config.getString("path", DBMS_DEFAULT_PATH)} / "coordination/snapshots"; -} - -String KeeperConfigurationAndSettings::getStateFilePathFromConfig(const Poco::Util::AbstractConfiguration & config, bool standalone_keeper_) -{ - if (config.has("keeper_server.storage_path")) - return std::filesystem::path{config.getString("keeper_server.storage_path")} / "state"; - - if (config.has("keeper_server.snapshot_storage_path")) - return std::filesystem::path(config.getString("keeper_server.snapshot_storage_path")).parent_path() / "state"; - - if (config.has("keeper_server.log_storage_path")) - return std::filesystem::path(config.getString("keeper_server.log_storage_path")).parent_path() / "state"; - - if (standalone_keeper_) - return std::filesystem::path{config.getString("path", KEEPER_DEFAULT_PATH)} / "state"; - else - return std::filesystem::path{config.getString("path", DBMS_DEFAULT_PATH)} / "coordination/state"; -} - } diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h index 81be3c3eaa4..220265b6cb0 100644 --- a/src/Coordination/CoordinationSettings.h +++ b/src/Coordination/CoordinationSettings.h @@ -82,17 +82,8 @@ struct KeeperConfigurationAndSettings bool standalone_keeper; CoordinationSettingsPtr coordination_settings; - String log_storage_path; - String snapshot_storage_path; - String state_file_path; - void dump(WriteBufferFromOwnString & buf) const; static std::shared_ptr loadFromConfig(const Poco::Util::AbstractConfiguration & config, bool standalone_keeper_); - -private: - static String getLogsPathFromConfig(const Poco::Util::AbstractConfiguration & config, bool standalone_keeper_); - static String getSnapshotsPathFromConfig(const Poco::Util::AbstractConfiguration & config, bool standalone_keeper_); - static String getStateFilePathFromConfig(const Poco::Util::AbstractConfiguration & config, bool standalone_keeper_); }; using KeeperConfigurationAndSettingsPtr = std::shared_ptr; diff --git a/src/Coordination/FourLetterCommand.cpp b/src/Coordination/FourLetterCommand.cpp index 7077e792fd8..0490213d296 100644 --- a/src/Coordination/FourLetterCommand.cpp +++ b/src/Coordination/FourLetterCommand.cpp @@ -9,9 +9,11 @@ #include #include #include +#include "Coordination/KeeperFeatureFlags.h" #include #include #include +#include #include #include @@ -153,6 +155,9 @@ void FourLetterCommandFactory::registerCommands(KeeperDispatcher & keeper_dispat FourLetterCommandPtr clean_resources_command = std::make_shared(keeper_dispatcher); factory.registerCommand(clean_resources_command); + FourLetterCommandPtr feature_flags_command = std::make_shared(keeper_dispatcher); + factory.registerCommand(feature_flags_command); + factory.initializeAllowList(keeper_dispatcher); factory.setInitialize(true); } @@ -292,6 +297,7 @@ String ConfCommand::run() StringBuffer buf; keeper_dispatcher.getKeeperConfigurationAndSettings()->dump(buf); + keeper_dispatcher.getKeeperContext()->dumpConfiguration(buf); return buf.str(); } @@ -486,7 +492,7 @@ String RecoveryCommand::run() String ApiVersionCommand::run() { - return toString(static_cast(Coordination::current_keeper_api_version)); + return toString(static_cast(KeeperApiVersion::WITH_MULTI_READ)); } String CreateSnapshotCommand::run() @@ -535,4 +541,29 @@ String CleanResourcesCommand::run() return "ok"; } +String FeatureFlagsCommand::run() +{ + const auto & feature_flags = keeper_dispatcher.getKeeperContext()->getFeatureFlags(); + + StringBuffer ret; + + auto append = [&ret] (const String & key, uint8_t value) -> void + { + writeText(key, ret); + writeText('\t', ret); + writeText(std::to_string(value), ret); + writeText('\n', ret); + }; + + for (const auto & [feature_flag, name] : magic_enum::enum_entries()) + { + std::string feature_flag_string(name); + boost::to_lower(feature_flag_string); + append(feature_flag_string, feature_flags.isEnabled(feature_flag)); + } + + return ret.str(); + +} + } diff --git a/src/Coordination/FourLetterCommand.h b/src/Coordination/FourLetterCommand.h index c1a91303c05..eb2cf9419ae 100644 --- a/src/Coordination/FourLetterCommand.h +++ b/src/Coordination/FourLetterCommand.h @@ -401,4 +401,16 @@ struct CleanResourcesCommand : public IFourLetterCommand ~CleanResourcesCommand() override = default; }; +struct FeatureFlagsCommand : public IFourLetterCommand +{ + explicit FeatureFlagsCommand(KeeperDispatcher & keeper_dispatcher_) + : IFourLetterCommand(keeper_dispatcher_) + { + } + + String name() override { return "ftfl"; } + String run() override; + ~FeatureFlagsCommand() override = default; +}; + } diff --git a/src/Coordination/KeeperAsynchronousMetrics.cpp b/src/Coordination/KeeperAsynchronousMetrics.cpp index 2d523a26dcc..890079e98f7 100644 --- a/src/Coordination/KeeperAsynchronousMetrics.cpp +++ b/src/Coordination/KeeperAsynchronousMetrics.cpp @@ -28,8 +28,8 @@ void updateKeeperInformation(KeeperDispatcher & keeper_dispatcher, AsynchronousM size_t zxid = 0; size_t session_with_watches = 0; size_t paths_watched = 0; - size_t snapshot_dir_size = 0; - size_t log_dir_size = 0; + //size_t snapshot_dir_size = 0; + //size_t log_dir_size = 0; if (keeper_dispatcher.isServerActive()) { @@ -49,8 +49,8 @@ void updateKeeperInformation(KeeperDispatcher & keeper_dispatcher, AsynchronousM latest_snapshot_size = state_machine.getLatestSnapshotBufSize(); session_with_watches = state_machine.getSessionsWithWatchesCount(); paths_watched = state_machine.getWatchedPathsCount(); - snapshot_dir_size = keeper_dispatcher.getSnapDirSize(); - log_dir_size = keeper_dispatcher.getLogDirSize(); + //snapshot_dir_size = keeper_dispatcher.getSnapDirSize(); + //log_dir_size = keeper_dispatcher.getLogDirSize(); # if defined(__linux__) || defined(__APPLE__) open_file_descriptor_count = getCurrentProcessFDCount(); @@ -85,8 +85,8 @@ void updateKeeperInformation(KeeperDispatcher & keeper_dispatcher, AsynchronousM new_values["KeeperZxid"] = { zxid, "The current transaction id number (zxid) in ClickHouse Keeper." }; new_values["KeeperSessionWithWatches"] = { session_with_watches, "The number of client sessions of ClickHouse Keeper having watches." }; new_values["KeeperPathsWatched"] = { paths_watched, "The number of different paths watched by the clients of ClickHouse Keeper." }; - new_values["KeeperSnapshotDirSize"] = { snapshot_dir_size, "The size of the snapshots directory of ClickHouse Keeper, in bytes." }; - new_values["KeeperLogDirSize"] = { log_dir_size, "The size of the logs directory of ClickHouse Keeper, in bytes." }; + //new_values["KeeperSnapshotDirSize"] = { snapshot_dir_size, "The size of the snapshots directory of ClickHouse Keeper, in bytes." }; + //new_values["KeeperLogDirSize"] = { log_dir_size, "The size of the logs directory of ClickHouse Keeper, in bytes." }; auto keeper_log_info = keeper_dispatcher.getKeeperLogInfo(); @@ -108,8 +108,8 @@ void updateKeeperInformation(KeeperDispatcher & keeper_dispatcher, AsynchronousM } KeeperAsynchronousMetrics::KeeperAsynchronousMetrics( - TinyContextPtr tiny_context_, int update_period_seconds, const ProtocolServerMetricsFunc & protocol_server_metrics_func_) - : AsynchronousMetrics(update_period_seconds, protocol_server_metrics_func_), tiny_context(std::move(tiny_context_)) + ContextPtr context_, int update_period_seconds, const ProtocolServerMetricsFunc & protocol_server_metrics_func_) + : AsynchronousMetrics(update_period_seconds, protocol_server_metrics_func_), context(std::move(context_)) { } @@ -117,7 +117,7 @@ void KeeperAsynchronousMetrics::updateImpl(AsynchronousMetricValues & new_values { #if USE_NURAFT { - auto keeper_dispatcher = tiny_context->tryGetKeeperDispatcher(); + auto keeper_dispatcher = context->tryGetKeeperDispatcher(); if (keeper_dispatcher) updateKeeperInformation(*keeper_dispatcher, new_values); } diff --git a/src/Coordination/KeeperAsynchronousMetrics.h b/src/Coordination/KeeperAsynchronousMetrics.h index 8fa27336bc5..14092c11c15 100644 --- a/src/Coordination/KeeperAsynchronousMetrics.h +++ b/src/Coordination/KeeperAsynchronousMetrics.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include namespace DB @@ -13,10 +13,10 @@ class KeeperAsynchronousMetrics : public AsynchronousMetrics { public: KeeperAsynchronousMetrics( - TinyContextPtr tiny_context_, int update_period_seconds, const ProtocolServerMetricsFunc & protocol_server_metrics_func_); + ContextPtr context_, int update_period_seconds, const ProtocolServerMetricsFunc & protocol_server_metrics_func_); private: - TinyContextPtr tiny_context; + ContextPtr context; void updateImpl(AsynchronousMetricValues & new_values, TimePoint update_time, TimePoint current_time) override; }; diff --git a/src/Coordination/KeeperConstants.h b/src/Coordination/KeeperConstants.h index 4b5a5b54be0..84cbb0ab7c5 100644 --- a/src/Coordination/KeeperConstants.h +++ b/src/Coordination/KeeperConstants.h @@ -5,6 +5,7 @@ namespace DB { +/// left for backwards compatibility enum class KeeperApiVersion : uint8_t { ZOOKEEPER_COMPATIBLE = 0, @@ -13,15 +14,8 @@ enum class KeeperApiVersion : uint8_t WITH_CHECK_NOT_EXISTS, }; -inline constexpr auto current_keeper_api_version = KeeperApiVersion::WITH_CHECK_NOT_EXISTS; - const std::string keeper_system_path = "/keeper"; const std::string keeper_api_version_path = keeper_system_path + "/api_version"; - -using PathWithData = std::pair; -const std::vector child_system_paths_with_data -{ - {keeper_api_version_path, toString(static_cast(current_keeper_api_version))} -}; +const std::string keeper_api_feature_flags_path = keeper_system_path + "/feature_flags"; } diff --git a/src/Coordination/KeeperContext.cpp b/src/Coordination/KeeperContext.cpp new file mode 100644 index 00000000000..3c3c0500540 --- /dev/null +++ b/src/Coordination/KeeperContext.cpp @@ -0,0 +1,324 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + +extern const int BAD_ARGUMENTS; + +} + +KeeperContext::KeeperContext(bool standalone_keeper_) + : disk_selector(std::make_shared()) + , standalone_keeper(standalone_keeper_) +{ + /// enable by default some feature flags + feature_flags.enableFeatureFlag(KeeperFeatureFlag::FILTERED_LIST); + feature_flags.enableFeatureFlag(KeeperFeatureFlag::MULTI_READ); + system_nodes_with_data[keeper_api_feature_flags_path] = feature_flags.getFeatureFlags(); + + /// for older clients, the default is equivalent to WITH_MULTI_READ version + system_nodes_with_data[keeper_api_version_path] = toString(static_cast(KeeperApiVersion::WITH_MULTI_READ)); +} + +void KeeperContext::initialize(const Poco::Util::AbstractConfiguration & config) +{ + digest_enabled = config.getBool("keeper_server.digest_enabled", false); + ignore_system_path_on_startup = config.getBool("keeper_server.ignore_system_path_on_startup", false); + + initializeFeatureFlags(config); + initializeDisks(config); +} + +void KeeperContext::initializeDisks(const Poco::Util::AbstractConfiguration & config) +{ + disk_selector->initialize(config, "storage_configuration.disks", Context::getGlobalContextInstance()); + + log_storage = getLogsPathFromConfig(config); + + if (config.has("keeper_server.latest_log_storage_disk")) + latest_log_storage = config.getString("keeper_server.latest_log_storage_disk"); + else + latest_log_storage = log_storage; + + const auto collect_old_disk_names = [&](const std::string_view key_prefix, std::vector & disk_names) + { + Poco::Util::AbstractConfiguration::Keys disk_name_keys; + config.keys("keeper_server", disk_name_keys); + for (const auto & key : disk_name_keys) + { + if (key.starts_with(key_prefix)) + disk_names.push_back(config.getString(fmt::format("keeper_server.{}", key))); + } + }; + + collect_old_disk_names("old_log_storage_disk", old_log_disk_names); + collect_old_disk_names("old_snapshot_storage_disk", old_snapshot_disk_names); + + snapshot_storage = getSnapshotsPathFromConfig(config); + + if (config.has("keeper_server.latest_snapshot_storage_disk")) + latest_snapshot_storage = config.getString("keeper_server.latest_snapshot_storage_disk"); + else + latest_snapshot_storage = snapshot_storage; + + state_file_storage = getStatePathFromConfig(config); +} + +KeeperContext::Phase KeeperContext::getServerState() const +{ + return server_state; +} + +void KeeperContext::setServerState(KeeperContext::Phase server_state_) +{ + server_state = server_state_; +} + +bool KeeperContext::ignoreSystemPathOnStartup() const +{ + return ignore_system_path_on_startup; +} + +bool KeeperContext::digestEnabled() const +{ + return digest_enabled; +} + +void KeeperContext::setDigestEnabled(bool digest_enabled_) +{ + digest_enabled = digest_enabled_; +} + +DiskPtr KeeperContext::getDisk(const Storage & storage) const +{ + if (const auto * storage_disk = std::get_if(&storage)) + return *storage_disk; + + const auto & disk_name = std::get(storage); + return disk_selector->get(disk_name); +} + +DiskPtr KeeperContext::getLogDisk() const +{ + return getDisk(log_storage); +} + +std::vector KeeperContext::getOldLogDisks() const +{ + std::vector old_log_disks; + old_log_disks.reserve(old_log_disk_names.size()); + + for (const auto & disk_name : old_log_disk_names) + old_log_disks.push_back(disk_selector->get(disk_name)); + + return old_log_disks; +} + +DiskPtr KeeperContext::getLatestLogDisk() const +{ + return getDisk(latest_log_storage); +} + +void KeeperContext::setLogDisk(DiskPtr disk) +{ + log_storage = disk; + latest_log_storage = std::move(disk); +} + +DiskPtr KeeperContext::getLatestSnapshotDisk() const +{ + return getDisk(latest_snapshot_storage); +} + +DiskPtr KeeperContext::getSnapshotDisk() const +{ + return getDisk(snapshot_storage); +} + +std::vector KeeperContext::getOldSnapshotDisks() const +{ + std::vector old_snapshot_disks; + old_snapshot_disks.reserve(old_snapshot_disk_names.size()); + + for (const auto & disk_name : old_snapshot_disk_names) + old_snapshot_disks.push_back(disk_selector->get(disk_name)); + + return old_snapshot_disks; +} + +void KeeperContext::setSnapshotDisk(DiskPtr disk) +{ + snapshot_storage = std::move(disk); + latest_snapshot_storage = snapshot_storage; +} + +DiskPtr KeeperContext::getStateFileDisk() const +{ + return getDisk(state_file_storage); +} + +void KeeperContext::setStateFileDisk(DiskPtr disk) +{ + state_file_storage = std::move(disk); +} + +const std::unordered_map & KeeperContext::getSystemNodesWithData() const +{ + return system_nodes_with_data; +} + +const KeeperFeatureFlags & KeeperContext::getFeatureFlags() const +{ + return feature_flags; +} + +void KeeperContext::dumpConfiguration(WriteBufferFromOwnString & buf) const +{ + auto dump_disk_info = [&](const std::string_view prefix, const IDisk & disk) + { + writeText(fmt::format("{}_path=", prefix), buf); + writeText(disk.getPath(), buf); + buf.write('\n'); + + writeText(fmt::format("{}_disk=", prefix), buf); + writeText(disk.getName(), buf); + buf.write('\n'); + + }; + + { + auto log_disk = getDisk(log_storage); + dump_disk_info("log_storage", *log_disk); + + auto latest_log_disk = getDisk(latest_log_storage); + if (log_disk != latest_log_disk) + dump_disk_info("latest_log_storage", *latest_log_disk); + } + + { + auto snapshot_disk = getDisk(snapshot_storage); + dump_disk_info("snapshot_storage", *snapshot_disk); + } +} + +KeeperContext::Storage KeeperContext::getLogsPathFromConfig(const Poco::Util::AbstractConfiguration & config) const +{ + const auto create_local_disk = [](const auto & path) + { + if (!fs::exists(path)) + fs::create_directories(path); + + return std::make_shared("LocalLogDisk", path, 0); + }; + + /// the most specialized path + if (config.has("keeper_server.log_storage_path")) + return create_local_disk(config.getString("keeper_server.log_storage_path")); + + if (config.has("keeper_server.log_storage_disk")) + return config.getString("keeper_server.log_storage_disk"); + + if (config.has("keeper_server.storage_path")) + return create_local_disk(std::filesystem::path{config.getString("keeper_server.storage_path")} / "logs"); + + if (standalone_keeper) + return create_local_disk(std::filesystem::path{config.getString("path", KEEPER_DEFAULT_PATH)} / "logs"); + else + return create_local_disk(std::filesystem::path{config.getString("path", DBMS_DEFAULT_PATH)} / "coordination/logs"); +} + +KeeperContext::Storage KeeperContext::getSnapshotsPathFromConfig(const Poco::Util::AbstractConfiguration & config) const +{ + const auto create_local_disk = [](const auto & path) + { + if (!fs::exists(path)) + fs::create_directories(path); + + return std::make_shared("LocalSnapshotDisk", path, 0); + }; + + /// the most specialized path + if (config.has("keeper_server.snapshot_storage_path")) + return create_local_disk(config.getString("keeper_server.snapshot_storage_path")); + + if (config.has("keeper_server.snapshot_storage_disk")) + return config.getString("keeper_server.snapshot_storage_disk"); + + if (config.has("keeper_server.storage_path")) + return create_local_disk(std::filesystem::path{config.getString("keeper_server.storage_path")} / "snapshots"); + + if (standalone_keeper) + return create_local_disk(std::filesystem::path{config.getString("path", KEEPER_DEFAULT_PATH)} / "snapshots"); + else + return create_local_disk(std::filesystem::path{config.getString("path", DBMS_DEFAULT_PATH)} / "coordination/snapshots"); +} + +KeeperContext::Storage KeeperContext::getStatePathFromConfig(const Poco::Util::AbstractConfiguration & config) const +{ + const auto create_local_disk = [](const auto & path) + { + if (!fs::exists(path)) + fs::create_directories(path); + + return std::make_shared("LocalStateFileDisk", path, 0); + }; + + if (config.has("keeper_server.state_storage_disk")) + return config.getString("keeper_server.state_storage_disk"); + + if (config.has("keeper_server.storage_path")) + return create_local_disk(std::filesystem::path{config.getString("keeper_server.storage_path")}); + + if (config.has("keeper_server.snapshot_storage_path")) + return create_local_disk(std::filesystem::path(config.getString("keeper_server.snapshot_storage_path")).parent_path()); + + if (config.has("keeper_server.log_storage_path")) + return create_local_disk(std::filesystem::path(config.getString("keeper_server.log_storage_path")).parent_path()); + + if (standalone_keeper) + return create_local_disk(std::filesystem::path{config.getString("path", KEEPER_DEFAULT_PATH)}); + else + return create_local_disk(std::filesystem::path{config.getString("path", DBMS_DEFAULT_PATH)} / "coordination"); +} + +void KeeperContext::initializeFeatureFlags(const Poco::Util::AbstractConfiguration & config) +{ + static const std::string feature_flags_key = "keeper_server.feature_flags"; + if (config.has(feature_flags_key)) + { + Poco::Util::AbstractConfiguration::Keys keys; + config.keys(feature_flags_key, keys); + for (const auto & key : keys) + { + auto feature_flag_string = boost::to_upper_copy(key); + auto feature_flag = magic_enum::enum_cast(feature_flag_string); + + if (!feature_flag.has_value()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid feature flag defined in config for Keeper: {}", key); + + auto is_enabled = config.getBool(feature_flags_key + "." + key); + if (is_enabled) + feature_flags.enableFeatureFlag(feature_flag.value()); + else + feature_flags.disableFeatureFlag(feature_flag.value()); + } + + system_nodes_with_data[keeper_api_feature_flags_path] = feature_flags.getFeatureFlags(); + } + + feature_flags.logFlags(&Poco::Logger::get("KeeperContext")); +} + +} diff --git a/src/Coordination/KeeperContext.h b/src/Coordination/KeeperContext.h index 64fa8cea6ec..229dbd51ab2 100644 --- a/src/Coordination/KeeperContext.h +++ b/src/Coordination/KeeperContext.h @@ -1,10 +1,22 @@ #pragma once +#include + +#include +#include +#include + +#include +#include + namespace DB { -struct KeeperContext +class KeeperContext { +public: + explicit KeeperContext(bool standalone_keeper_); + enum class Phase : uint8_t { INIT, @@ -12,10 +24,67 @@ struct KeeperContext SHUTDOWN }; + void initialize(const Poco::Util::AbstractConfiguration & config); + + Phase getServerState() const; + void setServerState(Phase server_state_); + + bool ignoreSystemPathOnStartup() const; + + bool digestEnabled() const; + void setDigestEnabled(bool digest_enabled_); + + DiskPtr getLatestLogDisk() const; + DiskPtr getLogDisk() const; + std::vector getOldLogDisks() const; + void setLogDisk(DiskPtr disk); + + DiskPtr getLatestSnapshotDisk() const; + DiskPtr getSnapshotDisk() const; + std::vector getOldSnapshotDisks() const; + void setSnapshotDisk(DiskPtr disk); + + DiskPtr getStateFileDisk() const; + void setStateFileDisk(DiskPtr disk); + + const std::unordered_map & getSystemNodesWithData() const; + const KeeperFeatureFlags & getFeatureFlags() const; + + void dumpConfiguration(WriteBufferFromOwnString & buf) const; +private: + /// local disk defined using path or disk name + using Storage = std::variant; + + void initializeFeatureFlags(const Poco::Util::AbstractConfiguration & config); + void initializeDisks(const Poco::Util::AbstractConfiguration & config); + + Storage getLogsPathFromConfig(const Poco::Util::AbstractConfiguration & config) const; + Storage getSnapshotsPathFromConfig(const Poco::Util::AbstractConfiguration & config) const; + Storage getStatePathFromConfig(const Poco::Util::AbstractConfiguration & config) const; + + DiskPtr getDisk(const Storage & storage) const; + Phase server_state{Phase::INIT}; bool ignore_system_path_on_startup{false}; bool digest_enabled{true}; + + std::shared_ptr disk_selector; + + Storage log_storage; + Storage latest_log_storage; + Storage snapshot_storage; + Storage latest_snapshot_storage; + Storage state_file_storage; + + std::vector old_log_disk_names; + std::vector old_snapshot_disk_names; + + bool standalone_keeper; + + std::unordered_map system_nodes_with_data; + + KeeperFeatureFlags feature_flags; }; using KeeperContextPtr = std::shared_ptr; diff --git a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index d64134f3024..9d9df5c7f30 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ b/src/Coordination/KeeperDispatcher.cpp @@ -38,8 +38,6 @@ namespace ProfileEvents extern const Event MemoryAllocatorPurgeTimeMicroseconds; } -namespace fs = std::filesystem; - namespace DB { @@ -238,13 +236,13 @@ void KeeperDispatcher::snapshotThread() try { - auto snapshot_path = task.create_snapshot(std::move(task.snapshot)); + auto snapshot_file_info = task.create_snapshot(std::move(task.snapshot)); - if (snapshot_path.empty()) + if (snapshot_file_info.path.empty()) continue; if (isLeader()) - snapshot_s3.uploadSnapshot(snapshot_path); + snapshot_s3.uploadSnapshot(snapshot_file_info); } catch (...) { @@ -336,28 +334,39 @@ void KeeperDispatcher::initialize(const Poco::Util::AbstractConfiguration & conf snapshot_s3.startup(config, macros); - server = std::make_unique(configuration_and_settings, config, responses_queue, snapshots_queue, snapshot_s3, [this](const KeeperStorage::RequestForSession & request_for_session) - { - /// check if we have queue of read requests depending on this request to be committed - std::lock_guard lock(read_request_queue_mutex); - if (auto it = read_request_queue.find(request_for_session.session_id); it != read_request_queue.end()) + keeper_context = std::make_shared(standalone_keeper); + keeper_context->initialize(config); + + server = std::make_unique( + configuration_and_settings, + config, + responses_queue, + snapshots_queue, + keeper_context, + snapshot_s3, + [this](const KeeperStorage::RequestForSession & request_for_session) { - auto & xid_to_request_queue = it->second; - - if (auto request_queue_it = xid_to_request_queue.find(request_for_session.request->xid); request_queue_it != xid_to_request_queue.end()) + /// check if we have queue of read requests depending on this request to be committed + std::lock_guard lock(read_request_queue_mutex); + if (auto it = read_request_queue.find(request_for_session.session_id); it != read_request_queue.end()) { - for (const auto & read_request : request_queue_it->second) - { - if (server->isLeaderAlive()) - server->putLocalReadRequest(read_request); - else - addErrorResponses({read_request}, Coordination::Error::ZCONNECTIONLOSS); - } + auto & xid_to_request_queue = it->second; - xid_to_request_queue.erase(request_queue_it); + if (auto request_queue_it = xid_to_request_queue.find(request_for_session.request->xid); + request_queue_it != xid_to_request_queue.end()) + { + for (const auto & read_request : request_queue_it->second) + { + if (server->isLeaderAlive()) + server->putLocalReadRequest(read_request); + else + addErrorResponses({read_request}, Coordination::Error::ZCONNECTIONLOSS); + } + + xid_to_request_queue.erase(request_queue_it); + } } - } - }); + }); try { @@ -766,35 +775,37 @@ void KeeperDispatcher::updateKeeperStatLatency(uint64_t process_time_ms) keeper_stats.updateLatency(process_time_ms); } -static uint64_t getDirSize(const fs::path & dir) +static uint64_t getTotalSize(const DiskPtr & disk, const std::string & path = "") { checkStackSize(); - if (!fs::exists(dir)) - return 0; - fs::directory_iterator it(dir); - fs::directory_iterator end; - - uint64_t size{0}; - while (it != end) + uint64_t size = 0; + for (auto it = disk->iterateDirectory(path); it->isValid(); it->next()) { - if (it->is_regular_file()) - size += fs::file_size(*it); + if (disk->isFile(it->path())) + size += disk->getFileSize(it->path()); else - size += getDirSize(it->path()); - ++it; + size += getTotalSize(disk, it->path()); } + return size; } uint64_t KeeperDispatcher::getLogDirSize() const { - return getDirSize(configuration_and_settings->log_storage_path); + auto log_disk = keeper_context->getLogDisk(); + auto size = getTotalSize(log_disk); + + auto latest_log_disk = keeper_context->getLatestLogDisk(); + if (log_disk != latest_log_disk) + size += getTotalSize(latest_log_disk); + + return size; } uint64_t KeeperDispatcher::getSnapDirSize() const { - return getDirSize(configuration_and_settings->snapshot_storage_path); + return getTotalSize(keeper_context->getSnapshotDisk()); } Keeper4LWInfo KeeperDispatcher::getKeeper4LWInfo() const diff --git a/src/Coordination/KeeperDispatcher.h b/src/Coordination/KeeperDispatcher.h index 4b8b134cf8f..1b44f0f6ced 100644 --- a/src/Coordination/KeeperDispatcher.h +++ b/src/Coordination/KeeperDispatcher.h @@ -81,6 +81,8 @@ private: KeeperSnapshotManagerS3 snapshot_s3; + KeeperContextPtr keeper_context; + /// Thread put requests to raft void requestThread(); /// Thread put responses for subscribed sessions @@ -198,6 +200,11 @@ public: return configuration_and_settings; } + const KeeperContextPtr & getKeeperContext() const + { + return keeper_context; + } + void incrementPacketsSent() { keeper_stats.incrementPacketsSent(); diff --git a/src/Coordination/KeeperFeatureFlags.cpp b/src/Coordination/KeeperFeatureFlags.cpp new file mode 100644 index 00000000000..d0cd1c86b55 --- /dev/null +++ b/src/Coordination/KeeperFeatureFlags.cpp @@ -0,0 +1,92 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace +{ + +std::pair getByteAndBitIndex(size_t num) +{ + size_t byte_idx = num / 8; + auto bit_idx = (7 - num % 8); + return {byte_idx, bit_idx}; +} + +} + +KeeperFeatureFlags::KeeperFeatureFlags() +{ + /// get byte idx of largest value + auto [byte_idx, _] = getByteAndBitIndex(magic_enum::enum_count() - 1); + feature_flags = std::string(byte_idx + 1, 0); +} + +KeeperFeatureFlags::KeeperFeatureFlags(std::string feature_flags_) + : feature_flags(std::move(feature_flags_)) +{} + +void KeeperFeatureFlags::fromApiVersion(KeeperApiVersion keeper_api_version) +{ + if (keeper_api_version == KeeperApiVersion::ZOOKEEPER_COMPATIBLE) + return; + + if (keeper_api_version >= KeeperApiVersion::WITH_FILTERED_LIST) + enableFeatureFlag(KeeperFeatureFlag::FILTERED_LIST); + + if (keeper_api_version >= KeeperApiVersion::WITH_MULTI_READ) + enableFeatureFlag(KeeperFeatureFlag::MULTI_READ); + + if (keeper_api_version >= KeeperApiVersion::WITH_CHECK_NOT_EXISTS) + enableFeatureFlag(KeeperFeatureFlag::CHECK_NOT_EXISTS); +} + +bool KeeperFeatureFlags::isEnabled(KeeperFeatureFlag feature_flag) const +{ + auto [byte_idx, bit_idx] = getByteAndBitIndex(magic_enum::enum_integer(feature_flag)); + + if (byte_idx > feature_flags.size()) + return false; + + return feature_flags[byte_idx] & (1 << bit_idx); +} + +void KeeperFeatureFlags::setFeatureFlags(std::string feature_flags_) +{ + feature_flags = std::move(feature_flags_); +} + +void KeeperFeatureFlags::enableFeatureFlag(KeeperFeatureFlag feature_flag) +{ + auto [byte_idx, bit_idx] = getByteAndBitIndex(magic_enum::enum_integer(feature_flag)); + chassert(byte_idx < feature_flags.size()); + + feature_flags[byte_idx] |= (1 << bit_idx); +} + +void KeeperFeatureFlags::disableFeatureFlag(KeeperFeatureFlag feature_flag) +{ + auto [byte_idx, bit_idx] = getByteAndBitIndex(magic_enum::enum_integer(feature_flag)); + chassert(byte_idx < feature_flags.size()); + + feature_flags[byte_idx] &= ~(1 << bit_idx); +} + +const std::string & KeeperFeatureFlags::getFeatureFlags() const +{ + return feature_flags; +} + +void KeeperFeatureFlags::logFlags(Poco::Logger * log) const +{ + for (const auto & [feature_flag, feature_flag_name] : magic_enum::enum_entries()) + { + auto is_enabled = isEnabled(feature_flag); + LOG_INFO(log, "Keeper feature flag {}: {}", feature_flag_name, is_enabled ? "enabled" : "disabled"); + } +} + +} diff --git a/src/Coordination/KeeperFeatureFlags.h b/src/Coordination/KeeperFeatureFlags.h new file mode 100644 index 00000000000..6c48915f60c --- /dev/null +++ b/src/Coordination/KeeperFeatureFlags.h @@ -0,0 +1,39 @@ +#pragma once + +#include + +namespace DB +{ + +/// these values cannot be reordered or removed, only new values can be added +enum class KeeperFeatureFlag : size_t +{ + FILTERED_LIST = 0, + MULTI_READ, + CHECK_NOT_EXISTS, +}; + +class KeeperFeatureFlags +{ +public: + KeeperFeatureFlags(); + + explicit KeeperFeatureFlags(std::string feature_flags_); + + /// backwards compatibility + void fromApiVersion(KeeperApiVersion keeper_api_version); + + bool isEnabled(KeeperFeatureFlag feature) const; + + void setFeatureFlags(std::string feature_flags_); + const std::string & getFeatureFlags() const; + + void enableFeatureFlag(KeeperFeatureFlag feature); + void disableFeatureFlag(KeeperFeatureFlag feature); + + void logFlags(Poco::Logger * log) const; +private: + std::string feature_flags; +}; + +} diff --git a/src/Coordination/KeeperLogStore.cpp b/src/Coordination/KeeperLogStore.cpp index 10619a44517..a9153475dbc 100644 --- a/src/Coordination/KeeperLogStore.cpp +++ b/src/Coordination/KeeperLogStore.cpp @@ -1,14 +1,14 @@ #include #include +#include #include namespace DB { -KeeperLogStore::KeeperLogStore( - const std::string & changelogs_path, LogFileSettings log_file_settings) +KeeperLogStore::KeeperLogStore(LogFileSettings log_file_settings, KeeperContextPtr keeper_context) : log(&Poco::Logger::get("KeeperLogStore")) - , changelog(changelogs_path, log, log_file_settings) + , changelog(log, log_file_settings, keeper_context) { if (log_file_settings.force_sync) LOG_INFO(log, "force_sync enabled"); diff --git a/src/Coordination/KeeperLogStore.h b/src/Coordination/KeeperLogStore.h index e48e3a32463..6e71d8c55cf 100644 --- a/src/Coordination/KeeperLogStore.h +++ b/src/Coordination/KeeperLogStore.h @@ -4,6 +4,7 @@ #include #include #include +#include #include namespace DB @@ -13,7 +14,7 @@ namespace DB class KeeperLogStore : public nuraft::log_store { public: - KeeperLogStore(const std::string & changelogs_path, LogFileSettings log_file_settings); + KeeperLogStore(LogFileSettings log_file_settings, KeeperContextPtr keeper_context); /// Read log storage from filesystem starting from last_commited_log_index void init(uint64_t last_commited_log_index, uint64_t logs_to_keep); diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index fda76c52684..82c843287c1 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -26,6 +26,7 @@ #include #include #include +#include namespace DB { @@ -108,25 +109,22 @@ KeeperServer::KeeperServer( const Poco::Util::AbstractConfiguration & config, ResponsesQueue & responses_queue_, SnapshotsQueue & snapshots_queue_, + KeeperContextPtr keeper_context_, KeeperSnapshotManagerS3 & snapshot_manager_s3, KeeperStateMachine::CommitCallback commit_callback) : server_id(configuration_and_settings_->server_id) , coordination_settings(configuration_and_settings_->coordination_settings) , log(&Poco::Logger::get("KeeperServer")) , is_recovering(config.getBool("keeper_server.force_recovery", false)) - , keeper_context{std::make_shared()} + , keeper_context{std::move(keeper_context_)} , create_snapshot_on_exit(config.getBool("keeper_server.create_snapshot_on_exit", true)) { if (coordination_settings->quorum_reads) LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower."); - keeper_context->digest_enabled = config.getBool("keeper_server.digest_enabled", false); - keeper_context->ignore_system_path_on_startup = config.getBool("keeper_server.ignore_system_path_on_startup", false); - state_machine = nuraft::cs_new( responses_queue_, snapshots_queue_, - configuration_and_settings_->snapshot_storage_path, coordination_settings, keeper_context, config.getBool("keeper_server.upload_snapshot_on_exit", true) ? &snapshot_manager_s3 : nullptr, @@ -136,10 +134,10 @@ KeeperServer::KeeperServer( state_manager = nuraft::cs_new( server_id, "keeper_server", - configuration_and_settings_->log_storage_path, - configuration_and_settings_->state_file_path, + "state", config, - coordination_settings); + coordination_settings, + keeper_context); } /** @@ -415,7 +413,7 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo launchRaftServer(config, enable_ipv6); - keeper_context->server_state = KeeperContext::Phase::RUNNING; + keeper_context->setServerState(KeeperContext::Phase::RUNNING); } void KeeperServer::shutdownRaftServer() @@ -430,7 +428,7 @@ void KeeperServer::shutdownRaftServer() raft_instance->shutdown(); - keeper_context->server_state = KeeperContext::Phase::SHUTDOWN; + keeper_context->setServerState(KeeperContext::Phase::SHUTDOWN); if (create_snapshot_on_exit) raft_instance->create_snapshot(); diff --git a/src/Coordination/KeeperServer.h b/src/Coordination/KeeperServer.h index db4e9c1962e..8f416b1f48c 100644 --- a/src/Coordination/KeeperServer.h +++ b/src/Coordination/KeeperServer.h @@ -72,6 +72,7 @@ public: const Poco::Util::AbstractConfiguration & config_, ResponsesQueue & responses_queue_, SnapshotsQueue & snapshots_queue_, + KeeperContextPtr keeper_context_, KeeperSnapshotManagerS3 & snapshot_manager_s3, KeeperStateMachine::CommitCallback commit_callback); diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index 8b80db3e520..a216272a9e1 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -9,13 +9,15 @@ #include #include #include -#include #include #include #include #include +#include #include #include +#include "Core/Field.h" +#include namespace DB @@ -30,6 +32,25 @@ namespace ErrorCodes namespace { + constexpr std::string_view tmp_prefix = "tmp_"; + + void moveFileBetweenDisks(DiskPtr disk_from, const std::string & path_from, DiskPtr disk_to, const std::string & path_to) + { + /// we use empty file with prefix tmp_ to detect incomplete copies + /// if a copy is complete we don't care from which disk we use the same file + /// so it's okay if a failure happens after removing of tmp file but before we remove + /// the snapshot from the source disk + auto from_path = fs::path(path_from); + auto tmp_snapshot_name = from_path.parent_path() / (std::string{tmp_prefix} + from_path.filename().string()); + { + auto buf = disk_to->writeFile(tmp_snapshot_name); + buf->finalize(); + } + disk_from->copyFile(from_path, *disk_to, path_to, {}); + disk_to->removeFile(tmp_snapshot_name); + disk_from->removeFile(path_from); + } + uint64_t getSnapshotPathUpToLogIdx(const String & snapshot_path) { std::filesystem::path path(snapshot_path); @@ -41,7 +62,7 @@ namespace std::string getSnapshotFileName(uint64_t up_to_log_idx, bool compress_zstd) { - auto base = std::string{"snapshot_"} + std::to_string(up_to_log_idx) + ".bin"; + auto base = fmt::format("snapshot_{}.bin", up_to_log_idx); if (compress_zstd) base += ".zstd"; return base; @@ -156,7 +177,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr if (snapshot.version >= SnapshotVersion::V5) { writeBinary(snapshot.zxid, out); - if (keeper_context->digest_enabled) + if (keeper_context->digestEnabled()) { writeBinary(static_cast(KeeperStorage::CURRENT_DIGEST_VERSION), out); writeBinary(snapshot.nodes_digest, out); @@ -185,7 +206,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr } /// Serialize data tree - writeBinary(snapshot.snapshot_container_size - child_system_paths_with_data.size(), out); + writeBinary(snapshot.snapshot_container_size - keeper_context->getSystemNodesWithData().size(), out); size_t counter = 0; for (auto it = snapshot.begin; counter < snapshot.snapshot_container_size; ++counter) { @@ -267,7 +288,7 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial deserialization_result.snapshot_meta = deserializeSnapshotMetadata(in); KeeperStorage & storage = *deserialization_result.storage; - bool recalculate_digest = keeper_context->digest_enabled; + bool recalculate_digest = keeper_context->digestEnabled(); if (version >= SnapshotVersion::V5) { readBinary(storage.zxid, in); @@ -349,7 +370,7 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial const std::string error_msg = fmt::format("Cannot read node on path {} from a snapshot because it is used as a system node", path); if (match_result == IS_CHILD) { - if (keeper_context->ignore_system_path_on_startup || keeper_context->server_state != KeeperContext::Phase::INIT) + if (keeper_context->ignoreSystemPathOnStartup() || keeper_context->getServerState() != KeeperContext::Phase::INIT) { LOG_ERROR(&Poco::Logger::get("KeeperSnapshotManager"), "{}. Ignoring it", error_msg); continue; @@ -365,7 +386,7 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial { if (!is_node_empty(node)) { - if (keeper_context->ignore_system_path_on_startup || keeper_context->server_state != KeeperContext::Phase::INIT) + if (keeper_context->ignoreSystemPathOnStartup() || keeper_context->getServerState() != KeeperContext::Phase::INIT) { LOG_ERROR(&Poco::Logger::get("KeeperSnapshotManager"), "{}. Ignoring it", error_msg); node = KeeperStorage::Node{}; @@ -394,9 +415,9 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial { if (itr.key != "/") { - auto parent_path = parentPath(itr.key); + auto parent_path = parentNodePath(itr.key); storage.container.updateValue( - parent_path, [version, path = itr.key](KeeperStorage::Node & value) { value.addChild(getBaseName(path), /*update_size*/ version < SnapshotVersion::V4); }); + parent_path, [version, path = itr.key](KeeperStorage::Node & value) { value.addChild(getBaseNodeName(path), /*update_size*/ version < SnapshotVersion::V4); }); } } @@ -507,70 +528,110 @@ KeeperStorageSnapshot::~KeeperStorageSnapshot() } KeeperSnapshotManager::KeeperSnapshotManager( - const std::string & snapshots_path_, size_t snapshots_to_keep_, const KeeperContextPtr & keeper_context_, bool compress_snapshots_zstd_, const std::string & superdigest_, size_t storage_tick_time_) - : snapshots_path(snapshots_path_) - , snapshots_to_keep(snapshots_to_keep_) + : snapshots_to_keep(snapshots_to_keep_) , compress_snapshots_zstd(compress_snapshots_zstd_) , superdigest(superdigest_) , storage_tick_time(storage_tick_time_) , keeper_context(keeper_context_) { - namespace fs = std::filesystem; - - if (!fs::exists(snapshots_path)) - fs::create_directories(snapshots_path); - - for (const auto & p : fs::directory_iterator(snapshots_path)) + const auto load_snapshot_from_disk = [&](const auto & disk) { - const auto & path = p.path(); + LOG_TRACE(log, "Reading from disk {}", disk->getName()); + std::unordered_map incomplete_files; - if (!path.has_filename()) - continue; - - if (startsWith(path.filename(), "tmp_")) /// Unfinished tmp files + const auto clean_incomplete_file = [&](const auto & file_path) { - std::filesystem::remove(p); - continue; + if (auto incomplete_it = incomplete_files.find(fs::path(file_path).filename()); incomplete_it != incomplete_files.end()) + { + LOG_TRACE(log, "Removing {} from {}", file_path, disk->getName()); + disk->removeFile(file_path); + disk->removeFile(incomplete_it->second); + incomplete_files.erase(incomplete_it); + return true; + } + + return false; + }; + + std::vector snapshot_files; + for (auto it = disk->iterateDirectory(""); it->isValid(); it->next()) + { + if (it->name().starts_with(tmp_prefix)) + { + incomplete_files.emplace(it->name().substr(tmp_prefix.size()), it->path()); + continue; + } + + if (it->name().starts_with("snapshot_") && !clean_incomplete_file(it->path())) + snapshot_files.push_back(it->path()); } - /// Not snapshot file - if (!startsWith(path.filename(), "snapshot_")) + for (const auto & snapshot_file : snapshot_files) { - continue; + if (clean_incomplete_file(fs::path(snapshot_file).filename())) + continue; + + LOG_TRACE(log, "Found {} on {}", snapshot_file, disk->getName()); + size_t snapshot_up_to = getSnapshotPathUpToLogIdx(snapshot_file); + auto [_, inserted] = existing_snapshots.insert_or_assign(snapshot_up_to, SnapshotFileInfo{snapshot_file, disk}); + + if (!inserted) + LOG_WARNING( + &Poco::Logger::get("KeeperSnapshotManager"), + "Found another snapshots with last log idx {}, will use snapshot from disk {}", + snapshot_up_to, + disk->getName()); } - size_t snapshot_up_to = getSnapshotPathUpToLogIdx(p.path()); - existing_snapshots[snapshot_up_to] = p.path(); - } + for (const auto & [name, path] : incomplete_files) + disk->removeFile(path); + }; + + for (const auto & disk : keeper_context->getOldSnapshotDisks()) + load_snapshot_from_disk(disk); + + auto disk = getDisk(); + load_snapshot_from_disk(disk); + + auto latest_snapshot_disk = getLatestSnapshotDisk(); + if (latest_snapshot_disk != disk) + load_snapshot_from_disk(latest_snapshot_disk); removeOutdatedSnapshotsIfNeeded(); + moveSnapshotsIfNeeded(); } - -std::string KeeperSnapshotManager::serializeSnapshotBufferToDisk(nuraft::buffer & buffer, uint64_t up_to_log_idx) +SnapshotFileInfo KeeperSnapshotManager::serializeSnapshotBufferToDisk(nuraft::buffer & buffer, uint64_t up_to_log_idx) { ReadBufferFromNuraftBuffer reader(buffer); auto snapshot_file_name = getSnapshotFileName(up_to_log_idx, compress_snapshots_zstd); auto tmp_snapshot_file_name = "tmp_" + snapshot_file_name; - std::string tmp_snapshot_path = std::filesystem::path{snapshots_path} / tmp_snapshot_file_name; - std::string new_snapshot_path = std::filesystem::path{snapshots_path} / snapshot_file_name; - WriteBufferFromFile plain_buf(tmp_snapshot_path); - copyData(reader, plain_buf); - plain_buf.sync(); + auto disk = getLatestSnapshotDisk(); - std::filesystem::rename(tmp_snapshot_path, new_snapshot_path); + { + auto buf = disk->writeFile(tmp_snapshot_file_name); + buf->finalize(); + } - existing_snapshots.emplace(up_to_log_idx, new_snapshot_path); + auto plain_buf = disk->writeFile(snapshot_file_name); + copyData(reader, *plain_buf); + plain_buf->sync(); + plain_buf->finalize(); + + disk->removeFile(tmp_snapshot_file_name); + + existing_snapshots.emplace(up_to_log_idx, SnapshotFileInfo{snapshot_file_name, disk}); removeOutdatedSnapshotsIfNeeded(); + moveSnapshotsIfNeeded(); - return new_snapshot_path; + return {snapshot_file_name, disk}; } nuraft::ptr KeeperSnapshotManager::deserializeLatestSnapshotBufferFromDisk() @@ -584,7 +645,8 @@ nuraft::ptr KeeperSnapshotManager::deserializeLatestSnapshotBuff } catch (const DB::Exception &) { - std::filesystem::remove(latest_itr->second); + const auto & [path, disk] = latest_itr->second; + disk->removeFile(path); existing_snapshots.erase(latest_itr->first); tryLogCurrentException(__PRETTY_FUNCTION__); } @@ -595,10 +657,10 @@ nuraft::ptr KeeperSnapshotManager::deserializeLatestSnapshotBuff nuraft::ptr KeeperSnapshotManager::deserializeSnapshotBufferFromDisk(uint64_t up_to_log_idx) const { - const std::string & snapshot_path = existing_snapshots.at(up_to_log_idx); + const auto & [snapshot_path, snapshot_disk] = existing_snapshots.at(up_to_log_idx); WriteBufferFromNuraftBuffer writer; - ReadBufferFromFile reader(snapshot_path); - copyData(reader, writer); + auto reader = snapshot_disk->readFile(snapshot_path); + copyData(*reader, writer); return writer.getBuffer(); } @@ -659,30 +721,75 @@ SnapshotDeserializationResult KeeperSnapshotManager::restoreFromLatestSnapshot() return deserializeSnapshotFromBuffer(buffer); } +DiskPtr KeeperSnapshotManager::getDisk() const +{ + return keeper_context->getSnapshotDisk(); +} + +DiskPtr KeeperSnapshotManager::getLatestSnapshotDisk() const +{ + return keeper_context->getLatestSnapshotDisk(); +} + void KeeperSnapshotManager::removeOutdatedSnapshotsIfNeeded() { while (existing_snapshots.size() > snapshots_to_keep) removeSnapshot(existing_snapshots.begin()->first); } +void KeeperSnapshotManager::moveSnapshotsIfNeeded() +{ + /// move snapshots to correct disks + + auto disk = getDisk(); + auto latest_snapshot_disk = getLatestSnapshotDisk(); + auto latest_snapshot_idx = getLatestSnapshotIndex(); + + for (auto & [idx, file_info] : existing_snapshots) + { + if (idx == latest_snapshot_idx) + { + if (file_info.disk != latest_snapshot_disk) + { + moveFileBetweenDisks(file_info.disk, file_info.path, latest_snapshot_disk, file_info.path); + file_info.disk = latest_snapshot_disk; + } + } + else + { + if (file_info.disk != disk) + { + moveFileBetweenDisks(file_info.disk, file_info.path, disk, file_info.path); + file_info.disk = disk; + } + } + } + +} + void KeeperSnapshotManager::removeSnapshot(uint64_t log_idx) { auto itr = existing_snapshots.find(log_idx); if (itr == existing_snapshots.end()) throw Exception(ErrorCodes::UNKNOWN_SNAPSHOT, "Unknown snapshot with log index {}", log_idx); - std::filesystem::remove(itr->second); + const auto & [path, disk] = itr->second; + disk->removeFile(path); existing_snapshots.erase(itr); } -std::pair KeeperSnapshotManager::serializeSnapshotToDisk(const KeeperStorageSnapshot & snapshot) +SnapshotFileInfo KeeperSnapshotManager::serializeSnapshotToDisk(const KeeperStorageSnapshot & snapshot) { auto up_to_log_idx = snapshot.snapshot_meta->get_last_log_idx(); auto snapshot_file_name = getSnapshotFileName(up_to_log_idx, compress_snapshots_zstd); auto tmp_snapshot_file_name = "tmp_" + snapshot_file_name; - std::string tmp_snapshot_path = std::filesystem::path{snapshots_path} / tmp_snapshot_file_name; - std::string new_snapshot_path = std::filesystem::path{snapshots_path} / snapshot_file_name; - auto writer = std::make_unique(tmp_snapshot_path, O_WRONLY | O_TRUNC | O_CREAT | O_CLOEXEC | O_APPEND); + auto disk = getLatestSnapshotDisk(); + { + auto buf = disk->writeFile(tmp_snapshot_file_name); + buf->finalize(); + } + + auto writer = disk->writeFile(snapshot_file_name); std::unique_ptr compressed_writer; if (compress_snapshots_zstd) compressed_writer = wrapWriteBufferWithCompressionMethod(std::move(writer), CompressionMethod::Zstd, 3); @@ -693,14 +800,13 @@ std::pair KeeperSnapshotManager::serializeSnapshot compressed_writer->finalize(); compressed_writer->sync(); - std::error_code ec; - std::filesystem::rename(tmp_snapshot_path, new_snapshot_path, ec); - if (!ec) - { - existing_snapshots.emplace(up_to_log_idx, new_snapshot_path); - removeOutdatedSnapshotsIfNeeded(); - } - return {new_snapshot_path, ec}; + disk->removeFile(tmp_snapshot_file_name); + + existing_snapshots.emplace(up_to_log_idx, SnapshotFileInfo{snapshot_file_name, disk}); + removeOutdatedSnapshotsIfNeeded(); + moveSnapshotsIfNeeded(); + + return {snapshot_file_name, disk}; } } diff --git a/src/Coordination/KeeperSnapshotManager.h b/src/Coordination/KeeperSnapshotManager.h index 52647712083..9bb287b9276 100644 --- a/src/Coordination/KeeperSnapshotManager.h +++ b/src/Coordination/KeeperSnapshotManager.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB { @@ -86,8 +87,14 @@ public: uint64_t nodes_digest; }; +struct SnapshotFileInfo +{ + std::string path; + DiskPtr disk; +}; + using KeeperStorageSnapshotPtr = std::shared_ptr; -using CreateSnapshotCallback = std::function; +using CreateSnapshotCallback = std::function; using SnapshotMetaAndStorage = std::pair; @@ -98,7 +105,6 @@ class KeeperSnapshotManager { public: KeeperSnapshotManager( - const std::string & snapshots_path_, size_t snapshots_to_keep_, const KeeperContextPtr & keeper_context_, bool compress_snapshots_zstd_ = true, @@ -112,10 +118,10 @@ public: nuraft::ptr serializeSnapshotToBuffer(const KeeperStorageSnapshot & snapshot) const; /// Serialize already compressed snapshot to disk (return path) - std::string serializeSnapshotBufferToDisk(nuraft::buffer & buffer, uint64_t up_to_log_idx); + SnapshotFileInfo serializeSnapshotBufferToDisk(nuraft::buffer & buffer, uint64_t up_to_log_idx); /// Serialize snapshot directly to disk - std::pair serializeSnapshotToDisk(const KeeperStorageSnapshot & snapshot); + SnapshotFileInfo serializeSnapshotToDisk(const KeeperStorageSnapshot & snapshot); SnapshotDeserializationResult deserializeSnapshotFromBuffer(nuraft::ptr buffer) const; @@ -139,30 +145,39 @@ public: return 0; } - std::string getLatestSnapshotPath() const + SnapshotFileInfo getLatestSnapshotInfo() const { if (!existing_snapshots.empty()) { - const auto & path = existing_snapshots.at(getLatestSnapshotIndex()); - std::error_code ec; - if (std::filesystem::exists(path, ec)) - return path; + const auto & [path, disk] = existing_snapshots.at(getLatestSnapshotIndex()); + + try + { + if (disk->exists(path)) + return {path, disk}; + } + catch (...) + { + } } - return ""; + return {"", nullptr}; } private: void removeOutdatedSnapshotsIfNeeded(); + void moveSnapshotsIfNeeded(); + + DiskPtr getDisk() const; + DiskPtr getLatestSnapshotDisk() const; /// Checks first 4 buffer bytes to became sure that snapshot compressed with /// ZSTD codec. static bool isZstdCompressed(nuraft::ptr buffer); - const std::string snapshots_path; /// How many snapshots to keep before remove const size_t snapshots_to_keep; /// All existing snapshots in our path (log_index -> path) - std::map existing_snapshots; + std::map existing_snapshots; /// Compress snapshots in common ZSTD format instead of custom ClickHouse block LZ4 format const bool compress_snapshots_zstd; /// Superdigest for deserialization of storage @@ -171,6 +186,8 @@ private: size_t storage_tick_time; KeeperContextPtr keeper_context; + + Poco::Logger * log = &Poco::Logger::get("KeeperSnapshotManager"); }; /// Keeper create snapshots in background thread. KeeperStateMachine just create diff --git a/src/Coordination/KeeperSnapshotManagerS3.cpp b/src/Coordination/KeeperSnapshotManagerS3.cpp index 1afe0b352c5..580e166e302 100644 --- a/src/Coordination/KeeperSnapshotManagerS3.cpp +++ b/src/Coordination/KeeperSnapshotManagerS3.cpp @@ -132,8 +132,9 @@ std::shared_ptr KeeperSnapshotManagerS return snapshot_s3_client; } -void KeeperSnapshotManagerS3::uploadSnapshotImpl(const std::string & snapshot_path) +void KeeperSnapshotManagerS3::uploadSnapshotImpl(const SnapshotFileInfo & snapshot_file_info) { + const auto & [snapshot_path, snapshot_disk] = snapshot_file_info; try { auto s3_client = getSnapshotS3Client(); @@ -154,8 +155,9 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const std::string & snapshot_pa }; }; - LOG_INFO(log, "Will try to upload snapshot on {} to S3", snapshot_path); - ReadBufferFromFile snapshot_file(snapshot_path); + LOG_INFO(log, "Will try to upload snapshot on {} to S3", snapshot_file_info.path); + + auto snapshot_file = snapshot_disk->readFile(snapshot_file_info.path); auto snapshot_name = fs::path(snapshot_path).filename().string(); auto lock_file = fmt::format(".{}_LOCK", snapshot_name); @@ -222,7 +224,7 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const std::string & snapshot_pa }); WriteBufferFromS3 snapshot_writer = create_writer(snapshot_name); - copyData(snapshot_file, snapshot_writer); + copyData(*snapshot_file, snapshot_writer); snapshot_writer.finalize(); LOG_INFO(log, "Successfully uploaded {} to S3", snapshot_path); @@ -240,31 +242,31 @@ void KeeperSnapshotManagerS3::snapshotS3Thread() while (!shutdown_called) { - std::string snapshot_path; - if (!snapshots_s3_queue.pop(snapshot_path)) + SnapshotFileInfo snapshot_file_info; + if (!snapshots_s3_queue.pop(snapshot_file_info)) break; if (shutdown_called) break; - uploadSnapshotImpl(snapshot_path); + uploadSnapshotImpl(snapshot_file_info); } } -void KeeperSnapshotManagerS3::uploadSnapshot(const std::string & path, bool async_upload) +void KeeperSnapshotManagerS3::uploadSnapshot(const SnapshotFileInfo & file_info, bool async_upload) { if (getSnapshotS3Client() == nullptr) return; if (async_upload) { - if (!snapshots_s3_queue.push(path)) - LOG_WARNING(log, "Failed to add snapshot {} to S3 queue", path); + if (!snapshots_s3_queue.push(file_info)) + LOG_WARNING(log, "Failed to add snapshot {} to S3 queue", file_info.path); return; } - uploadSnapshotImpl(path); + uploadSnapshotImpl(file_info); } void KeeperSnapshotManagerS3::startup(const Poco::Util::AbstractConfiguration & config, const MultiVersion::Version & macros) diff --git a/src/Coordination/KeeperSnapshotManagerS3.h b/src/Coordination/KeeperSnapshotManagerS3.h index eff7868bba9..e17cf5a1cfb 100644 --- a/src/Coordination/KeeperSnapshotManagerS3.h +++ b/src/Coordination/KeeperSnapshotManagerS3.h @@ -6,10 +6,13 @@ #include #include +#include + #if USE_AWS_S3 #include #include + #include #endif @@ -24,13 +27,13 @@ public: /// 'macros' are used to substitute macros in endpoint of disks void updateS3Configuration(const Poco::Util::AbstractConfiguration & config, const MultiVersion::Version & macros); - void uploadSnapshot(const std::string & path, bool async_upload = true); + void uploadSnapshot(const SnapshotFileInfo & file_info, bool async_upload = true); /// 'macros' are used to substitute macros in endpoint of disks void startup(const Poco::Util::AbstractConfiguration & config, const MultiVersion::Version & macros); void shutdown(); private: - using SnapshotS3Queue = ConcurrentBoundedQueue; + using SnapshotS3Queue = ConcurrentBoundedQueue; SnapshotS3Queue snapshots_s3_queue; /// Upload new snapshots to S3 @@ -48,7 +51,7 @@ private: std::shared_ptr getSnapshotS3Client() const; - void uploadSnapshotImpl(const std::string & snapshot_path); + void uploadSnapshotImpl(const SnapshotFileInfo & snapshot_file_info); /// Thread upload snapshots to S3 in the background void snapshotS3Thread(); @@ -60,7 +63,7 @@ public: KeeperSnapshotManagerS3() = default; void updateS3Configuration(const Poco::Util::AbstractConfiguration &, const MultiVersion::Version &) {} - void uploadSnapshot(const std::string &, [[maybe_unused]] bool async_upload = true) {} + void uploadSnapshot(const SnapshotFileInfo &, [[maybe_unused]] bool async_upload = true) {} void startup(const Poco::Util::AbstractConfiguration &, const MultiVersion::Version &) {} diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index 7d251ad48b9..5c84f23fc60 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -14,6 +14,8 @@ #include #include "Coordination/KeeperStorage.h" +#include + namespace ProfileEvents { @@ -33,17 +35,11 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; - extern const int SYSTEM_ERROR; -} - -namespace -{ } KeeperStateMachine::KeeperStateMachine( ResponsesQueue & responses_queue_, SnapshotsQueue & snapshots_queue_, - const std::string & snapshots_path_, const CoordinationSettingsPtr & coordination_settings_, const KeeperContextPtr & keeper_context_, KeeperSnapshotManagerS3 * snapshot_manager_s3_, @@ -52,7 +48,6 @@ KeeperStateMachine::KeeperStateMachine( : commit_callback(commit_callback_) , coordination_settings(coordination_settings_) , snapshot_manager( - snapshots_path_, coordination_settings->snapshots_to_keep, keeper_context_, coordination_settings->compress_snapshots_with_zstd_format, @@ -69,6 +64,16 @@ KeeperStateMachine::KeeperStateMachine( { } +namespace +{ + +bool isLocalDisk(const IDisk & disk) +{ + return dynamic_cast(&disk) != nullptr; +} + +} + void KeeperStateMachine::init() { /// Do everything without mutexes, no other threads exist. @@ -83,9 +88,13 @@ void KeeperStateMachine::init() try { - auto snapshot_deserialization_result - = snapshot_manager.deserializeSnapshotFromBuffer(snapshot_manager.deserializeSnapshotBufferFromDisk(latest_log_index)); - latest_snapshot_path = snapshot_manager.getLatestSnapshotPath(); + latest_snapshot_buf = snapshot_manager.deserializeSnapshotBufferFromDisk(latest_log_index); + auto snapshot_deserialization_result = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_buf); + latest_snapshot_info = snapshot_manager.getLatestSnapshotInfo(); + + if (isLocalDisk(*latest_snapshot_info.disk)) + latest_snapshot_buf = nullptr; + storage = std::move(snapshot_deserialization_result.storage); latest_snapshot_meta = snapshot_deserialization_result.snapshot_meta; cluster_config = snapshot_deserialization_result.cluster_config; @@ -276,7 +285,7 @@ bool KeeperStateMachine::preprocess(const KeeperStorage::RequestForSession & req std::abort(); } - if (keeper_context->digest_enabled && request_for_session.digest) + if (keeper_context->digestEnabled() && request_for_session.digest) assertDigest(*request_for_session.digest, storage->getNodesDigest(false), *request_for_session.request, false); return true; @@ -333,7 +342,7 @@ nuraft::ptr KeeperStateMachine::commit(const uint64_t log_idx, n response_for_session.session_id); } - if (keeper_context->digest_enabled && request_for_session->digest) + if (keeper_context->digestEnabled() && request_for_session->digest) assertDigest(*request_for_session->digest, storage->getNodesDigest(true), *request_for_session->request, true); } @@ -371,8 +380,13 @@ bool KeeperStateMachine::apply_snapshot(nuraft::snapshot & s) { /// deserialize and apply snapshot to storage std::lock_guard lock(storage_and_responses_lock); - auto snapshot_deserialization_result - = snapshot_manager.deserializeSnapshotFromBuffer(snapshot_manager.deserializeSnapshotBufferFromDisk(s.get_last_log_idx())); + + SnapshotDeserializationResult snapshot_deserialization_result; + if (latest_snapshot_ptr) + snapshot_deserialization_result = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_ptr); + else + snapshot_deserialization_result + = snapshot_manager.deserializeSnapshotFromBuffer(snapshot_manager.deserializeSnapshotBufferFromDisk(s.get_last_log_idx())); /// maybe some logs were preprocessed with log idx larger than the snapshot idx /// we have to apply them to the new storage @@ -464,19 +478,24 @@ void KeeperStateMachine::create_snapshot(nuraft::snapshot & s, nuraft::async_res } else { - auto [path, error_code] = snapshot_manager.serializeSnapshotToDisk(*snapshot); - if (error_code) - { - throw Exception( - ErrorCodes::SYSTEM_ERROR, - "Snapshot {} was created failed, error: {}", - snapshot->snapshot_meta->get_last_log_idx(), - error_code.message()); - } - latest_snapshot_path = path; latest_snapshot_meta = snapshot->snapshot_meta; + /// we rely on the fact that the snapshot disk cannot be changed during runtime + if (isLocalDisk(*keeper_context->getLatestSnapshotDisk())) + { + auto snapshot_info = snapshot_manager.serializeSnapshotToDisk(*snapshot); + latest_snapshot_info = std::move(snapshot_info); + latest_snapshot_buf = nullptr; + } + else + { + auto snapshot_buf = snapshot_manager.serializeSnapshotToBuffer(*snapshot); + auto snapshot_info = snapshot_manager.serializeSnapshotBufferToDisk(*snapshot_buf, snapshot->snapshot_meta->get_last_log_idx()); + latest_snapshot_info = std::move(snapshot_info); + latest_snapshot_buf = std::move(snapshot_buf); + } + ProfileEvents::increment(ProfileEvents::KeeperSnapshotCreations); - LOG_DEBUG(log, "Created persistent snapshot {} with path {}", latest_snapshot_meta->get_last_log_idx(), path); + LOG_DEBUG(log, "Created persistent snapshot {} with path {}", latest_snapshot_meta->get_last_log_idx(), latest_snapshot_info.path); } } @@ -500,19 +519,19 @@ void KeeperStateMachine::create_snapshot(nuraft::snapshot & s, nuraft::async_res when_done(ret, exception); - return ret ? latest_snapshot_path : ""; + return ret ? latest_snapshot_info : SnapshotFileInfo{}; }; - if (keeper_context->server_state == KeeperContext::Phase::SHUTDOWN) + if (keeper_context->getServerState() == KeeperContext::Phase::SHUTDOWN) { LOG_INFO(log, "Creating a snapshot during shutdown because 'create_snapshot_on_exit' is enabled."); - auto snapshot_path = snapshot_task.create_snapshot(std::move(snapshot_task.snapshot)); + auto snapshot_file_info = snapshot_task.create_snapshot(std::move(snapshot_task.snapshot)); - if (!snapshot_path.empty() && snapshot_manager_s3) + if (!snapshot_file_info.path.empty() && snapshot_manager_s3) { - LOG_INFO(log, "Uploading snapshot {} during shutdown because 'upload_snapshot_on_exit' is enabled.", snapshot_path); - snapshot_manager_s3->uploadSnapshot(snapshot_path, /* asnyc_upload */ false); + LOG_INFO(log, "Uploading snapshot {} during shutdown because 'upload_snapshot_on_exit' is enabled.", snapshot_file_info.path); + snapshot_manager_s3->uploadSnapshot(snapshot_file_info, /* asnyc_upload */ false); } return; @@ -533,14 +552,20 @@ void KeeperStateMachine::save_logical_snp_obj( nuraft::ptr snp_buf = s.serialize(); nuraft::ptr cloned_meta = nuraft::snapshot::deserialize(*snp_buf); + nuraft::ptr cloned_buffer; + + /// we rely on the fact that the snapshot disk cannot be changed during runtime + if (!isLocalDisk(*keeper_context->getSnapshotDisk())) + cloned_buffer = nuraft::buffer::clone(data); + try { std::lock_guard lock(snapshots_lock); /// Serialize snapshot to disk - auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(data, s.get_last_log_idx()); - latest_snapshot_path = result_path; + latest_snapshot_info = snapshot_manager.serializeSnapshotBufferToDisk(data, s.get_last_log_idx()); latest_snapshot_meta = cloned_meta; - LOG_DEBUG(log, "Saved snapshot {} to path {}", s.get_last_log_idx(), result_path); + latest_snapshot_buf = std::move(cloned_buffer); + LOG_DEBUG(log, "Saved snapshot {} to path {}", s.get_last_log_idx(), latest_snapshot_info.path); obj_id++; ProfileEvents::increment(ProfileEvents::KeeperSaveSnapshot); } @@ -600,11 +625,23 @@ int KeeperStateMachine::read_logical_snp_obj( latest_snapshot_meta->get_last_log_idx()); return -1; } - if (bufferFromFile(log, latest_snapshot_path, data_out)) + + const auto & [path, disk] = latest_snapshot_info; + if (isLocalDisk(*disk)) { - LOG_WARNING(log, "Error reading snapshot {} from {}", s.get_last_log_idx(), latest_snapshot_path); - return -1; + auto full_path = fs::path(disk->getPath()) / path; + if (bufferFromFile(log, full_path, data_out)) + { + LOG_WARNING(log, "Error reading snapshot {} from {}", s.get_last_log_idx(), full_path); + return -1; + } } + else + { + chassert(latest_snapshot_buf); + data_out = nuraft::buffer::clone(*latest_snapshot_buf); + } + is_last_obj = true; ProfileEvents::increment(ProfileEvents::KeeperReadSnapshot); diff --git a/src/Coordination/KeeperStateMachine.h b/src/Coordination/KeeperStateMachine.h index f6d81d23056..b47a9b5cc42 100644 --- a/src/Coordination/KeeperStateMachine.h +++ b/src/Coordination/KeeperStateMachine.h @@ -26,7 +26,6 @@ public: KeeperStateMachine( ResponsesQueue & responses_queue_, SnapshotsQueue & snapshots_queue_, - const std::string & snapshots_path_, const CoordinationSettingsPtr & coordination_settings_, const KeeperContextPtr & keeper_context_, KeeperSnapshotManagerS3 * snapshot_manager_s3_, @@ -128,7 +127,7 @@ private: /// In our state machine we always have a single snapshot which is stored /// in memory in compressed (serialized) format. SnapshotMetadataPtr latest_snapshot_meta = nullptr; - std::string latest_snapshot_path; + SnapshotFileInfo latest_snapshot_info; nuraft::ptr latest_snapshot_buf = nullptr; CoordinationSettingsPtr coordination_settings; diff --git a/src/Coordination/KeeperStateManager.cpp b/src/Coordination/KeeperStateManager.cpp index 8736fb7d4e3..450fd04b61d 100644 --- a/src/Coordination/KeeperStateManager.cpp +++ b/src/Coordination/KeeperStateManager.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include namespace DB @@ -22,6 +23,8 @@ namespace ErrorCodes namespace { +const std::string copy_lock_file = "STATE_COPY_LOCK"; + bool isLocalhost(const std::string & hostname) { try @@ -212,12 +215,14 @@ KeeperStateManager::parseServersConfiguration(const Poco::Util::AbstractConfigur return result; } -KeeperStateManager::KeeperStateManager( - int server_id_, const std::string & host, int port, const std::string & logs_path, const std::string & state_file_path) +KeeperStateManager::KeeperStateManager(int server_id_, const std::string & host, int port, KeeperContextPtr keeper_context_) : my_server_id(server_id_) , secure(false) - , log_store(nuraft::cs_new(logs_path, LogFileSettings{.force_sync =false, .compress_logs = false, .rotate_interval = 5000})) - , server_state_path(state_file_path) + , log_store(nuraft::cs_new( + LogFileSettings{.force_sync = false, .compress_logs = false, .rotate_interval = 5000}, + keeper_context_)) + , server_state_file_name("state") + , keeper_context(keeper_context_) , logger(&Poco::Logger::get("KeeperStateManager")) { auto peer_config = nuraft::cs_new(my_server_id, host + ":" + std::to_string(port)); @@ -230,16 +235,15 @@ KeeperStateManager::KeeperStateManager( KeeperStateManager::KeeperStateManager( int my_server_id_, const std::string & config_prefix_, - const std::string & log_storage_path, - const std::string & state_file_path, + const std::string & server_state_file_name_, const Poco::Util::AbstractConfiguration & config, - const CoordinationSettingsPtr & coordination_settings) + const CoordinationSettingsPtr & coordination_settings, + KeeperContextPtr keeper_context_) : my_server_id(my_server_id_) , secure(config.getBool(config_prefix_ + ".raft_configuration.secure", false)) , config_prefix(config_prefix_) , configuration_wrapper(parseServersConfiguration(config, false)) , log_store(nuraft::cs_new( - log_storage_path, LogFileSettings { .force_sync = coordination_settings->force_sync, @@ -247,8 +251,10 @@ KeeperStateManager::KeeperStateManager( .rotate_interval = coordination_settings->rotate_log_storage_interval, .max_size = coordination_settings->max_log_file_size, .overallocate_size = coordination_settings->log_file_overallocate_size - })) - , server_state_path(state_file_path) + }, + keeper_context_)) + , server_state_file_name(server_state_file_name_) + , keeper_context(keeper_context_) , logger(&Poco::Logger::get("KeeperStateManager")) { } @@ -287,16 +293,21 @@ void KeeperStateManager::save_config(const nuraft::cluster_config & config) configuration_wrapper.cluster_config = nuraft::cluster_config::deserialize(*buf); } -const std::filesystem::path & KeeperStateManager::getOldServerStatePath() +const String & KeeperStateManager::getOldServerStatePath() { static auto old_path = [this] { - return server_state_path.parent_path() / (server_state_path.filename().generic_string() + "-OLD"); + return server_state_file_name + "-OLD"; }(); return old_path; } +DiskPtr KeeperStateManager::getStateFileDisk() const +{ + return keeper_context->getStateFileDisk(); +} + namespace { enum ServerStateVersion : uint8_t @@ -312,51 +323,61 @@ void KeeperStateManager::save_state(const nuraft::srv_state & state) { const auto & old_path = getOldServerStatePath(); - if (std::filesystem::exists(server_state_path)) - std::filesystem::rename(server_state_path, old_path); + auto disk = getStateFileDisk(); - WriteBufferFromFile server_state_file(server_state_path, DBMS_DEFAULT_BUFFER_SIZE, O_TRUNC | O_CREAT | O_WRONLY); + if (disk->exists(server_state_file_name)) + { + auto buf = disk->writeFile(copy_lock_file); + buf->finalize(); + disk->copyFile(server_state_file_name, *disk, old_path); + disk->removeFile(copy_lock_file); + disk->removeFile(old_path); + } + + auto server_state_file = disk->writeFile(server_state_file_name); auto buf = state.serialize(); // calculate checksum SipHash hash; hash.update(current_server_state_version); hash.update(reinterpret_cast(buf->data_begin()), buf->size()); - writeIntBinary(hash.get64(), server_state_file); + writeIntBinary(hash.get64(), *server_state_file); - writeIntBinary(static_cast(current_server_state_version), server_state_file); + writeIntBinary(static_cast(current_server_state_version), *server_state_file); - server_state_file.write(reinterpret_cast(buf->data_begin()), buf->size()); - server_state_file.sync(); - server_state_file.close(); + server_state_file->write(reinterpret_cast(buf->data_begin()), buf->size()); + server_state_file->sync(); + server_state_file->finalize(); - std::filesystem::remove(old_path); + disk->removeFileIfExists(old_path); } nuraft::ptr KeeperStateManager::read_state() { const auto & old_path = getOldServerStatePath(); - const auto try_read_file = [this](const auto & path) -> nuraft::ptr + auto disk = getStateFileDisk(); + + const auto try_read_file = [&](const auto & path) -> nuraft::ptr { try { - ReadBufferFromFile read_buf(path); - auto content_size = read_buf.getFileSize(); + auto read_buf = disk->readFile(path); + auto content_size = read_buf->getFileSize(); if (content_size == 0) return nullptr; uint64_t read_checksum{0}; - readIntBinary(read_checksum, read_buf); + readIntBinary(read_checksum, *read_buf); uint8_t version; - readIntBinary(version, read_buf); + readIntBinary(version, *read_buf); auto buffer_size = content_size - sizeof read_checksum - sizeof version; auto state_buf = nuraft::buffer::alloc(buffer_size); - read_buf.readStrict(reinterpret_cast(state_buf->data_begin()), buffer_size); + read_buf->readStrict(reinterpret_cast(state_buf->data_begin()), buffer_size); SipHash hash; hash.update(version); @@ -366,15 +387,15 @@ nuraft::ptr KeeperStateManager::read_state() { constexpr auto error_format = "Invalid checksum while reading state from {}. Got {}, expected {}"; #ifdef NDEBUG - LOG_ERROR(logger, error_format, path.generic_string(), hash.get64(), read_checksum); + LOG_ERROR(logger, error_format, path, hash.get64(), read_checksum); return nullptr; #else - throw Exception(ErrorCodes::CORRUPTED_DATA, error_format, path.generic_string(), hash.get64(), read_checksum); + throw Exception(ErrorCodes::CORRUPTED_DATA, error_format, disk->getPath() + path, hash.get64(), read_checksum); #endif } auto state = nuraft::srv_state::deserialize(*state_buf); - LOG_INFO(logger, "Read state from {}", path.generic_string()); + LOG_INFO(logger, "Read state from {}", fs::path(disk->getPath()) / path); return state; } catch (const std::exception & e) @@ -385,37 +406,45 @@ nuraft::ptr KeeperStateManager::read_state() throw; } - LOG_ERROR(logger, "Failed to deserialize state from {}", path.generic_string()); + LOG_ERROR(logger, "Failed to deserialize state from {}", disk->getPath() + path); return nullptr; } }; - if (std::filesystem::exists(server_state_path)) + if (disk->exists(server_state_file_name)) { - auto state = try_read_file(server_state_path); + auto state = try_read_file(server_state_file_name); if (state) { - if (std::filesystem::exists(old_path)) - std::filesystem::remove(old_path); - + disk->removeFileIfExists(old_path); return state; } - std::filesystem::remove(server_state_path); + disk->removeFile(server_state_file_name); } - if (std::filesystem::exists(old_path)) + if (disk->exists(old_path)) { - auto state = try_read_file(old_path); - - if (state) + if (disk->exists(copy_lock_file)) { - std::filesystem::rename(old_path, server_state_path); - return state; + disk->removeFile(old_path); + disk->removeFile(copy_lock_file); } - - std::filesystem::remove(old_path); + else + { + auto state = try_read_file(old_path); + if (state) + { + disk->moveFile(old_path, server_state_file_name); + return state; + } + disk->removeFile(old_path); + } + } + else if (disk->exists(copy_lock_file)) + { + disk->removeFile(copy_lock_file); } LOG_WARNING(logger, "No state was read"); diff --git a/src/Coordination/KeeperStateManager.h b/src/Coordination/KeeperStateManager.h index 5d210f8c0ea..f24f0c2b1e5 100644 --- a/src/Coordination/KeeperStateManager.h +++ b/src/Coordination/KeeperStateManager.h @@ -39,18 +39,17 @@ public: KeeperStateManager( int server_id_, const std::string & config_prefix_, - const std::string & log_storage_path, - const std::string & state_file_path, + const std::string & server_state_file_name_, const Poco::Util::AbstractConfiguration & config, - const CoordinationSettingsPtr & coordination_settings); + const CoordinationSettingsPtr & coordination_settings, + KeeperContextPtr keeper_context_); /// Constructor for tests KeeperStateManager( int server_id_, const std::string & host, int port, - const std::string & logs_path, - const std::string & state_file_path); + KeeperContextPtr keeper_context_); void loadLogStore(uint64_t last_commited_index, uint64_t logs_to_keep); @@ -111,7 +110,9 @@ public: ConfigUpdateActions getConfigurationDiff(const Poco::Util::AbstractConfiguration & config) const; private: - const std::filesystem::path & getOldServerStatePath(); + const String & getOldServerStatePath(); + + DiskPtr getStateFileDisk() const; /// Wrapper struct for Keeper cluster config. We parse this /// info from XML files. @@ -136,7 +137,9 @@ private: nuraft::ptr log_store; - const std::filesystem::path server_state_path; + const String server_state_file_name; + + KeeperContextPtr keeper_context; Poco::Logger * logger; diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 66d6b0f5843..884aacc4558 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -128,7 +128,7 @@ KeeperStorage::ResponsesForSessions processWatchesImpl( watches.erase(watch_it); } - auto parent_path = parentPath(path); + auto parent_path = parentNodePath(path); Strings paths_to_check_for_list_watches; if (event_type == Coordination::Event::CREATED) @@ -276,23 +276,23 @@ void KeeperStorage::initializeSystemNodes() [](auto & node) { ++node.stat.numChildren; - node.addChild(getBaseName(keeper_system_path)); + node.addChild(getBaseNodeName(keeper_system_path)); } ); addDigest(updated_root_it->value, "/"); } // insert child system nodes - for (const auto & [path, data] : child_system_paths_with_data) + for (const auto & [path, data] : keeper_context->getSystemNodesWithData()) { - assert(keeper_api_version_path.starts_with(keeper_system_path)); + assert(path.starts_with(keeper_system_path)); Node child_system_node; child_system_node.setData(data); auto [map_key, _] = container.insert(std::string{path}, child_system_node); /// Take child path from key owned by map. - auto child_path = getBaseName(map_key->getKey()); + auto child_path = getBaseNodeName(map_key->getKey()); container.updateValue( - parentPath(StringRef(path)), + parentNodePath(StringRef(path)), [child_path](auto & parent) { // don't update stats so digest is okay @@ -728,7 +728,7 @@ bool KeeperStorage::createNode( bool is_sequental, Coordination::ACLs node_acls) { - auto parent_path = parentPath(path); + auto parent_path = parentNodePath(path); auto node_it = container.find(parent_path); if (node_it == container.end()) @@ -751,7 +751,7 @@ bool KeeperStorage::createNode( created_node.is_sequental = is_sequental; auto [map_key, _] = container.insert(path, created_node); /// Take child path from key owned by map. - auto child_path = getBaseName(map_key->getKey()); + auto child_path = getBaseNodeName(map_key->getKey()); container.updateValue( parent_path, [child_path](KeeperStorage::Node & parent) @@ -781,8 +781,8 @@ bool KeeperStorage::removeNode(const std::string & path, int32_t version) acl_map.removeUsage(prev_node.acl_id); container.updateValue( - parentPath(path), - [child_basename = getBaseName(node_it->key)](KeeperStorage::Node & parent) + parentNodePath(path), + [child_basename = getBaseNodeName(node_it->key)](KeeperStorage::Node & parent) { parent.removeChild(child_basename); chassert(parent.stat.numChildren == static_cast(parent.getChildren().size())); @@ -866,7 +866,7 @@ Coordination::ACLs getNodeACLs(KeeperStorage & storage, StringRef path, bool is_ void handleSystemNodeModification(const KeeperContext & keeper_context, std::string_view error_msg) { - if (keeper_context.server_state == KeeperContext::Phase::INIT && !keeper_context.ignore_system_path_on_startup) + if (keeper_context.getServerState() == KeeperContext::Phase::INIT && !keeper_context.ignoreSystemPathOnStartup()) throw Exception( ErrorCodes::LOGICAL_ERROR, "{}. Ignoring it can lead to data loss. " @@ -929,7 +929,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr bool checkAuth(KeeperStorage & storage, int64_t session_id, bool is_local) const override { auto path = zk_request->getPath(); - return storage.checkACL(parentPath(path), Coordination::ACL::Create, session_id, is_local); + return storage.checkACL(parentNodePath(path), Coordination::ACL::Create, session_id, is_local); } std::vector @@ -940,7 +940,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr std::vector new_deltas; - auto parent_path = parentPath(request.path); + auto parent_path = parentNodePath(request.path); auto parent_node = storage.uncommitted_state.getNode(parent_path); if (parent_node == nullptr) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}}; @@ -971,7 +971,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr if (storage.uncommitted_state.getNode(path_created)) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNODEEXISTS}}; - if (getBaseName(path_created).size == 0) + if (getBaseNodeName(path_created).size == 0) return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; Coordination::ACLs node_acls; @@ -1060,7 +1060,7 @@ struct KeeperStorageGetRequestProcessor final : public KeeperStorageRequestProce ProfileEvents::increment(ProfileEvents::KeeperGetRequest); Coordination::ZooKeeperGetRequest & request = dynamic_cast(*zk_request); - if (request.path == Coordination::keeper_api_version_path) + if (request.path == Coordination::keeper_api_feature_flags_path) return {}; if (!storage.uncommitted_state.getNode(request.path)) @@ -1121,7 +1121,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr { bool checkAuth(KeeperStorage & storage, int64_t session_id, bool is_local) const override { - return storage.checkACL(parentPath(zk_request->getPath()), Coordination::ACL::Delete, session_id, is_local); + return storage.checkACL(parentNodePath(zk_request->getPath()), Coordination::ACL::Delete, session_id, is_local); } using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor; @@ -1143,7 +1143,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr const auto update_parent_pzxid = [&]() { - auto parent_path = parentPath(request.path); + auto parent_path = parentNodePath(request.path); if (!storage.uncommitted_state.getNode(parent_path)) return; @@ -1178,7 +1178,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr update_parent_pzxid(); new_deltas.emplace_back( - std::string{parentPath(request.path)}, + std::string{parentNodePath(request.path)}, zxid, KeeperStorage::UpdateNodeDelta{[](KeeperStorage::Node & parent) { @@ -1321,7 +1321,7 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce request.version}); new_deltas.emplace_back( - parentPath(request.path).toString(), + parentNodePath(request.path).toString(), zxid, KeeperStorage::UpdateNodeDelta { @@ -1481,7 +1481,7 @@ struct KeeperStorageCheckRequestProcessor final : public KeeperStorageRequestPro bool checkAuth(KeeperStorage & storage, int64_t session_id, bool is_local) const override { auto path = zk_request->getPath(); - return storage.checkACL(check_not_exists ? parentPath(path) : path, Coordination::ACL::Read, session_id, is_local); + return storage.checkACL(check_not_exists ? parentNodePath(path) : path, Coordination::ACL::Read, session_id, is_local); } std::vector @@ -2024,7 +2024,7 @@ KeeperStorageRequestProcessorsFactory::KeeperStorageRequestProcessorsFactory() UInt64 KeeperStorage::calculateNodesDigest(UInt64 current_digest, const std::vector & new_deltas) const { - if (!keeper_context->digest_enabled) + if (!keeper_context->digestEnabled()) return current_digest; std::unordered_map> updated_nodes; @@ -2122,7 +2122,7 @@ void KeeperStorage::preprocessRequest( TransactionInfo transaction{.zxid = new_last_zxid}; uint64_t new_digest = getNodesDigest(false).value; SCOPE_EXIT({ - if (keeper_context->digest_enabled) + if (keeper_context->digestEnabled()) // if the version of digest we got from the leader is the same as the one this instances has, we can simply copy the value // and just check the digest on the commit // a mistake can happen while applying the changes to the uncommitted_state so for now let's just recalculate the digest here also @@ -2145,7 +2145,7 @@ void KeeperStorage::preprocessRequest( { new_deltas.emplace_back ( - parentPath(ephemeral_path).toString(), + parentNodePath(ephemeral_path).toString(), new_last_zxid, UpdateNodeDelta { @@ -2338,7 +2338,7 @@ void KeeperStorage::rollbackRequest(int64_t rollback_zxid, bool allow_missing) KeeperStorage::Digest KeeperStorage::getNodesDigest(bool committed) const { - if (!keeper_context->digest_enabled) + if (!keeper_context->digestEnabled()) return {.version = DigestVersion::NO_DIGEST}; if (committed || uncommitted_transactions.empty()) @@ -2349,13 +2349,13 @@ KeeperStorage::Digest KeeperStorage::getNodesDigest(bool committed) const void KeeperStorage::removeDigest(const Node & node, const std::string_view path) { - if (keeper_context->digest_enabled) + if (keeper_context->digestEnabled()) nodes_digest -= node.getDigest(path); } void KeeperStorage::addDigest(const Node & node, const std::string_view path) { - if (keeper_context->digest_enabled) + if (keeper_context->digestEnabled()) { node.invalidateDigestCache(); nodes_digest += node.getDigest(path); diff --git a/src/Coordination/Standalone/Context.cpp b/src/Coordination/Standalone/Context.cpp new file mode 100644 index 00000000000..84083169df3 --- /dev/null +++ b/src/Coordination/Standalone/Context.cpp @@ -0,0 +1,321 @@ +#include + +#include +#include +#include + +#include + +#include + +#include +#include + +namespace ProfileEvents +{ + extern const Event ContextLock; +} + +namespace CurrentMetrics +{ + extern const Metric ContextLockWait; + extern const Metric BackgroundSchedulePoolTask; + extern const Metric BackgroundSchedulePoolSize; + extern const Metric IOWriterThreads; + extern const Metric IOWriterThreadsActive; +} + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +struct ContextSharedPart : boost::noncopyable +{ + ContextSharedPart() + : macros(std::make_unique()) + {} + + /// For access of most of shared objects. Recursive mutex. + mutable std::recursive_mutex mutex; + + mutable std::mutex keeper_dispatcher_mutex; + mutable std::shared_ptr keeper_dispatcher TSA_GUARDED_BY(keeper_dispatcher_mutex); + + ServerSettings server_settings; + + String path; /// Path to the data directory, with a slash at the end. + ConfigurationPtr config; /// Global configuration settings. + MultiVersion macros; /// Substitutions extracted from config. + mutable std::unique_ptr schedule_pool; /// A thread pool that can run different jobs in background + RemoteHostFilter remote_host_filter; /// Allowed URL from config.xml + /// + mutable std::unique_ptr asynchronous_remote_fs_reader; + mutable std::unique_ptr asynchronous_local_fs_reader; + mutable std::unique_ptr synchronous_local_fs_reader; + + mutable std::unique_ptr threadpool_writer; + + mutable ThrottlerPtr remote_read_throttler; /// A server-wide throttler for remote IO reads + mutable ThrottlerPtr remote_write_throttler; /// A server-wide throttler for remote IO writes + + mutable ThrottlerPtr local_read_throttler; /// A server-wide throttler for local IO reads + mutable ThrottlerPtr local_write_throttler; /// A server-wide throttler for local IO writes + +}; + +Context::Context() = default; +Context::~Context() = default; +Context::Context(const Context &) = default; +Context & Context::operator=(const Context &) = default; + +SharedContextHolder::SharedContextHolder(SharedContextHolder &&) noexcept = default; +SharedContextHolder & SharedContextHolder::operator=(SharedContextHolder &&) noexcept = default; +SharedContextHolder::SharedContextHolder() = default; +SharedContextHolder::~SharedContextHolder() = default; +SharedContextHolder::SharedContextHolder(std::unique_ptr shared_context) + : shared(std::move(shared_context)) {} + +void SharedContextHolder::reset() { shared.reset(); } + +void Context::makeGlobalContext() +{ + initGlobal(); + global_context = shared_from_this(); +} + +ContextMutablePtr Context::createGlobal(ContextSharedPart * shared) +{ + auto res = std::shared_ptr(new Context); + res->shared = shared; + return res; +} + +void Context::initGlobal() +{ + assert(!global_context_instance); + global_context_instance = shared_from_this(); +} + +SharedContextHolder Context::createShared() +{ + return SharedContextHolder(std::make_unique()); +} + +ContextMutablePtr Context::getGlobalContext() const +{ + auto ptr = global_context.lock(); + if (!ptr) throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no global context or global context has expired"); + return ptr; +} + +std::unique_lock Context::getLock() const +{ + ProfileEvents::increment(ProfileEvents::ContextLock); + CurrentMetrics::Increment increment{CurrentMetrics::ContextLockWait}; + return std::unique_lock(shared->mutex); +} + +String Context::getPath() const +{ + auto lock = getLock(); + return shared->path; +} + +void Context::setPath(const String & path) +{ + auto lock = getLock(); + shared->path = path; +} + +MultiVersion::Version Context::getMacros() const +{ + return shared->macros.get(); +} + +void Context::setMacros(std::unique_ptr && macros) +{ + shared->macros.set(std::move(macros)); +} + +BackgroundSchedulePool & Context::getSchedulePool() const +{ + auto lock = getLock(); + if (!shared->schedule_pool) + { + shared->schedule_pool = std::make_unique( + shared->server_settings.background_schedule_pool_size, + CurrentMetrics::BackgroundSchedulePoolTask, + CurrentMetrics::BackgroundSchedulePoolSize, + "BgSchPool"); + } + + return *shared->schedule_pool; +} + +void Context::setRemoteHostFilter(const Poco::Util::AbstractConfiguration & config) +{ + shared->remote_host_filter.setValuesFromConfig(config); +} + +const RemoteHostFilter & Context::getRemoteHostFilter() const +{ + return shared->remote_host_filter; +} + +IAsynchronousReader & Context::getThreadPoolReader(FilesystemReaderType type) const +{ + auto lock = getLock(); + + switch (type) + { + case FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER: + { + if (!shared->asynchronous_remote_fs_reader) + shared->asynchronous_remote_fs_reader = createThreadPoolReader(type, getConfigRef()); + return *shared->asynchronous_remote_fs_reader; + } + case FilesystemReaderType::ASYNCHRONOUS_LOCAL_FS_READER: + { + if (!shared->asynchronous_local_fs_reader) + shared->asynchronous_local_fs_reader = createThreadPoolReader(type, getConfigRef()); + + return *shared->asynchronous_local_fs_reader; + } + case FilesystemReaderType::SYNCHRONOUS_LOCAL_FS_READER: + { + if (!shared->synchronous_local_fs_reader) + shared->synchronous_local_fs_reader = createThreadPoolReader(type, getConfigRef()); + + return *shared->synchronous_local_fs_reader; + } + } +} + +std::shared_ptr Context::getFilesystemCacheLog() const +{ + return nullptr; +} + +std::shared_ptr Context::getFilesystemReadPrefetchesLog() const +{ + return nullptr; +} + +void Context::setConfig(const ConfigurationPtr & config) +{ + auto lock = getLock(); + shared->config = config; +} + +const Poco::Util::AbstractConfiguration & Context::getConfigRef() const +{ + auto lock = getLock(); + return shared->config ? *shared->config : Poco::Util::Application::instance().config(); +} + +std::shared_ptr Context::getAsyncReadCounters() const +{ + auto lock = getLock(); + if (!async_read_counters) + async_read_counters = std::make_shared(); + return async_read_counters; +} + +ThreadPool & Context::getThreadPoolWriter() const +{ + const auto & config = getConfigRef(); + + auto lock = getLock(); + + if (!shared->threadpool_writer) + { + auto pool_size = config.getUInt(".threadpool_writer_pool_size", 100); + auto queue_size = config.getUInt(".threadpool_writer_queue_size", 1000000); + + shared->threadpool_writer = std::make_unique( + CurrentMetrics::IOWriterThreads, CurrentMetrics::IOWriterThreadsActive, pool_size, pool_size, queue_size); + } + + return *shared->threadpool_writer; +} + +ThrottlerPtr Context::getRemoteReadThrottler() const +{ + return nullptr; +} + +ThrottlerPtr Context::getRemoteWriteThrottler() const +{ + return nullptr; +} + +ThrottlerPtr Context::getLocalReadThrottler() const +{ + return nullptr; +} + +ThrottlerPtr Context::getLocalWriteThrottler() const +{ + return nullptr; +} + +ReadSettings Context::getReadSettings() const +{ + return ReadSettings{}; +} + +void Context::initializeKeeperDispatcher([[maybe_unused]] bool start_async) const +{ + const auto & config_ref = getConfigRef(); + + std::lock_guard lock(shared->keeper_dispatcher_mutex); + + if (shared->keeper_dispatcher) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to initialize Keeper multiple times"); + + if (config_ref.has("keeper_server")) + { + shared->keeper_dispatcher = std::make_shared(); + shared->keeper_dispatcher->initialize(config_ref, true, start_async, getMacros()); + } +} + +std::shared_ptr Context::getKeeperDispatcher() const +{ + std::lock_guard lock(shared->keeper_dispatcher_mutex); + if (!shared->keeper_dispatcher) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Keeper must be initialized before requests"); + + return shared->keeper_dispatcher; +} + +std::shared_ptr Context::tryGetKeeperDispatcher() const +{ + std::lock_guard lock(shared->keeper_dispatcher_mutex); + return shared->keeper_dispatcher; +} + +void Context::shutdownKeeperDispatcher() const +{ + std::lock_guard lock(shared->keeper_dispatcher_mutex); + if (shared->keeper_dispatcher) + { + shared->keeper_dispatcher->shutdown(); + shared->keeper_dispatcher.reset(); + } +} + +void Context::updateKeeperConfiguration([[maybe_unused]] const Poco::Util::AbstractConfiguration & config_) +{ + std::lock_guard lock(shared->keeper_dispatcher_mutex); + if (!shared->keeper_dispatcher) + return; + + shared->keeper_dispatcher->updateConfiguration(getConfigRef(), getMacros()); +} + +} diff --git a/src/Coordination/Standalone/Context.h b/src/Coordination/Standalone/Context.h new file mode 100644 index 00000000000..c2eee981aaa --- /dev/null +++ b/src/Coordination/Standalone/Context.h @@ -0,0 +1,120 @@ +#pragma once + +#include + +#include + +#include +#include + +#include + +#include +#include + +#include + +#include + +#include + +namespace DB +{ + +struct ContextSharedPart; +class Macros; +class FilesystemCacheLog; +class FilesystemReadPrefetchesLog; + +/// A small class which owns ContextShared. +/// We don't use something like unique_ptr directly to allow ContextShared type to be incomplete. +struct SharedContextHolder +{ + ~SharedContextHolder(); + SharedContextHolder(); + explicit SharedContextHolder(std::unique_ptr shared_context); + SharedContextHolder(SharedContextHolder &&) noexcept; + + SharedContextHolder & operator=(SharedContextHolder &&) noexcept; + + ContextSharedPart * get() const { return shared.get(); } + void reset(); +private: + std::unique_ptr shared; +}; + + +class Context : public std::enable_shared_from_this +{ +private: + /// Use copy constructor or createGlobal() instead + Context(); + Context(const Context &); + Context & operator=(const Context &); + + std::unique_lock getLock() const; + + ContextWeakMutablePtr global_context; + inline static ContextPtr global_context_instance; + ContextSharedPart * shared; + + /// Query metrics for reading data asynchronously with IAsynchronousReader. + mutable std::shared_ptr async_read_counters; + + Settings settings; /// Setting for query execution. +public: + /// Create initial Context with ContextShared and etc. + static ContextMutablePtr createGlobal(ContextSharedPart * shared); + static SharedContextHolder createShared(); + + ContextMutablePtr getGlobalContext() const; + static ContextPtr getGlobalContextInstance() { return global_context_instance; } + + void makeGlobalContext(); + void initGlobal(); + + ~Context(); + + using ConfigurationPtr = Poco::AutoPtr; + + /// Global application configuration settings. + void setConfig(const ConfigurationPtr & config); + const Poco::Util::AbstractConfiguration & getConfigRef() const; + + const Settings & getSettingsRef() const { return settings; } + + String getPath() const; + void setPath(const String & path); + + MultiVersion::Version getMacros() const; + void setMacros(std::unique_ptr && macros); + + BackgroundSchedulePool & getSchedulePool() const; + + /// Storage of allowed hosts from config.xml + void setRemoteHostFilter(const Poco::Util::AbstractConfiguration & config); + const RemoteHostFilter & getRemoteHostFilter() const; + + std::shared_ptr getFilesystemCacheLog() const; + std::shared_ptr getFilesystemReadPrefetchesLog() const; + + IAsynchronousReader & getThreadPoolReader(FilesystemReaderType type) const; + std::shared_ptr getAsyncReadCounters() const; + ThreadPool & getThreadPoolWriter() const; + + ThrottlerPtr getRemoteReadThrottler() const; + ThrottlerPtr getRemoteWriteThrottler() const; + + ThrottlerPtr getLocalReadThrottler() const; + ThrottlerPtr getLocalWriteThrottler() const; + + ReadSettings getReadSettings() const; + + std::shared_ptr getKeeperDispatcher() const; + std::shared_ptr tryGetKeeperDispatcher() const; + void initializeKeeperDispatcher(bool start_async) const; + void shutdownKeeperDispatcher() const; + void updateKeeperConfiguration(const Poco::Util::AbstractConfiguration & config); +}; + +} diff --git a/src/Coordination/Standalone/Settings.cpp b/src/Coordination/Standalone/Settings.cpp new file mode 100644 index 00000000000..12a7a42ffac --- /dev/null +++ b/src/Coordination/Standalone/Settings.cpp @@ -0,0 +1,24 @@ +#include + +namespace DB +{ + +IMPLEMENT_SETTINGS_TRAITS(SettingsTraits, LIST_OF_SETTINGS) + +std::vector Settings::getAllRegisteredNames() const +{ + std::vector all_settings; + for (const auto & setting_field : all()) + { + all_settings.push_back(setting_field.getName()); + } + return all_settings; +} + +void Settings::set(std::string_view name, const Field & value) +{ + BaseSettings::set(name, value); +} + + +} diff --git a/src/Coordination/Standalone/ThreadStatusExt.cpp b/src/Coordination/Standalone/ThreadStatusExt.cpp new file mode 100644 index 00000000000..97f7287be8c --- /dev/null +++ b/src/Coordination/Standalone/ThreadStatusExt.cpp @@ -0,0 +1,14 @@ +#include + +namespace DB +{ + +void CurrentThread::detachFromGroupIfNotDetached() +{ +} + +void CurrentThread::attachToGroup(const ThreadGroupPtr &) +{ +} + +} diff --git a/src/Coordination/TinyContext.cpp b/src/Coordination/TinyContext.cpp deleted file mode 100644 index 47b0a48dcda..00000000000 --- a/src/Coordination/TinyContext.cpp +++ /dev/null @@ -1,87 +0,0 @@ -#include - -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - -void TinyContext::setConfig(const ConfigurationPtr & config_) -{ - std::lock_guard lock(keeper_dispatcher_mutex); - config = config_; -} - -const Poco::Util::AbstractConfiguration & TinyContext::getConfigRef() const -{ - std::lock_guard lock(keeper_dispatcher_mutex); - return config ? *config : Poco::Util::Application::instance().config(); -} - - -void TinyContext::initializeKeeperDispatcher([[maybe_unused]] bool start_async) const -{ - const auto & config_ref = getConfigRef(); - - std::lock_guard lock(keeper_dispatcher_mutex); - - if (keeper_dispatcher) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to initialize Keeper multiple times"); - - if (config_ref.has("keeper_server")) - { - keeper_dispatcher = std::make_shared(); - - MultiVersion::Version macros; - - if (config_ref.has("macros")) - macros = std::make_unique(config_ref, "macros", &Poco::Logger::get("TinyContext")); - keeper_dispatcher->initialize(config_ref, true, start_async, macros); - } -} - -std::shared_ptr TinyContext::getKeeperDispatcher() const -{ - std::lock_guard lock(keeper_dispatcher_mutex); - if (!keeper_dispatcher) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Keeper must be initialized before requests"); - - return keeper_dispatcher; -} - -std::shared_ptr TinyContext::tryGetKeeperDispatcher() const -{ - std::lock_guard lock(keeper_dispatcher_mutex); - return keeper_dispatcher; -} - -void TinyContext::shutdownKeeperDispatcher() const -{ - std::lock_guard lock(keeper_dispatcher_mutex); - if (keeper_dispatcher) - { - keeper_dispatcher->shutdown(); - keeper_dispatcher.reset(); - } -} - -void TinyContext::updateKeeperConfiguration([[maybe_unused]] const Poco::Util::AbstractConfiguration & config_) -{ - std::lock_guard lock(keeper_dispatcher_mutex); - if (!keeper_dispatcher) - return; - - MultiVersion::Version macros; - - if (config_.has("macros")) - macros = std::make_unique(config_, "macros", &Poco::Logger::get("TinyContext")); - - keeper_dispatcher->updateConfiguration(config_, macros); -} - -} diff --git a/src/Coordination/TinyContext.h b/src/Coordination/TinyContext.h deleted file mode 100644 index b966d445004..00000000000 --- a/src/Coordination/TinyContext.h +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once -#include -#include - -#include -#include - -namespace DB -{ - -class KeeperDispatcher; - -class TinyContext : public std::enable_shared_from_this -{ -public: - std::shared_ptr getKeeperDispatcher() const; - std::shared_ptr tryGetKeeperDispatcher() const; - void initializeKeeperDispatcher(bool start_async) const; - void shutdownKeeperDispatcher() const; - void updateKeeperConfiguration(const Poco::Util::AbstractConfiguration & config); - - using ConfigurationPtr = Poco::AutoPtr; - - void setConfig(const ConfigurationPtr & config); - const Poco::Util::AbstractConfiguration & getConfigRef() const; - -private: - mutable std::mutex keeper_dispatcher_mutex; - mutable std::shared_ptr keeper_dispatcher TSA_GUARDED_BY(keeper_dispatcher_mutex); - - ConfigurationPtr config TSA_GUARDED_BY(keeper_dispatcher_mutex); -}; - -using TinyContextPtr = std::shared_ptr; - -} diff --git a/src/Coordination/ZooKeeperDataReader.cpp b/src/Coordination/ZooKeeperDataReader.cpp index bdc462f3ea0..94fc07bcc4a 100644 --- a/src/Coordination/ZooKeeperDataReader.cpp +++ b/src/Coordination/ZooKeeperDataReader.cpp @@ -139,8 +139,8 @@ int64_t deserializeStorageData(KeeperStorage & storage, ReadBuffer & in, Poco::L { if (itr.key != "/") { - auto parent_path = parentPath(itr.key); - storage.container.updateValue(parent_path, [my_path = itr.key] (KeeperStorage::Node & value) { value.addChild(getBaseName(my_path)); ++value.stat.numChildren; }); + auto parent_path = parentNodePath(itr.key); + storage.container.updateValue(parent_path, [my_path = itr.key] (KeeperStorage::Node & value) { value.addChild(getBaseNodeName(my_path)); ++value.stat.numChildren; }); } } diff --git a/src/Coordination/pathUtils.cpp b/src/Coordination/pathUtils.cpp index 1e1da339d2e..afa42b4a639 100644 --- a/src/Coordination/pathUtils.cpp +++ b/src/Coordination/pathUtils.cpp @@ -21,7 +21,7 @@ static size_t findLastSlash(StringRef path) return std::string::npos; } -StringRef parentPath(StringRef path) +StringRef parentNodePath(StringRef path) { auto rslash_pos = findLastSlash(path); if (rslash_pos > 0) @@ -29,7 +29,7 @@ StringRef parentPath(StringRef path) return "/"; } -StringRef getBaseName(StringRef path) +StringRef getBaseNodeName(StringRef path) { size_t basename_start = findLastSlash(path); return StringRef{path.data + basename_start + 1, path.size - basename_start - 1}; diff --git a/src/Coordination/pathUtils.h b/src/Coordination/pathUtils.h index 69ed2d8b177..b2b79b14110 100644 --- a/src/Coordination/pathUtils.h +++ b/src/Coordination/pathUtils.h @@ -6,8 +6,8 @@ namespace DB { -StringRef parentPath(StringRef path); +StringRef parentNodePath(StringRef path); -StringRef getBaseName(StringRef path); +StringRef getBaseNodeName(StringRef path); } diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index 453fd0f2e60..0f60c960b8b 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -2,35 +2,39 @@ #include #include "Common/ZooKeeper/IKeeper.h" -#include "Coordination/KeeperContext.h" -#include "Coordination/KeeperStorage.h" #include "Core/Defines.h" -#include "IO/WriteHelpers.h" #include "config.h" #if USE_NURAFT +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include #include #include -#include #include -#include -#include -#include -#include -#include -#include -#include + +#include #include @@ -39,9 +43,7 @@ struct ChangelogDirTest { std::string path; bool drop; - explicit ChangelogDirTest(std::string path_, bool drop_ = true) - : path(path_) - , drop(drop_) + explicit ChangelogDirTest(std::string path_, bool drop_ = true) : path(path_), drop(drop_) { if (fs::exists(path)) { @@ -66,8 +68,20 @@ struct CompressionParam class CoordinationTest : public ::testing::TestWithParam { protected: - DB::KeeperContextPtr keeper_context = std::make_shared(); + DB::KeeperContextPtr keeper_context = std::make_shared(true); Poco::Logger * log{&Poco::Logger::get("CoordinationTest")}; + + void setLogDirectory(const std::string & path) { keeper_context->setLogDisk(std::make_shared("LogDisk", path, 0)); } + + void setSnapshotDirectory(const std::string & path) + { + keeper_context->setSnapshotDisk(std::make_shared("SnapshotDisk", path, 0)); + } + + void setStateFileDirectory(const std::string & path) + { + keeper_context->setStateFileDisk(std::make_shared("StateFile", path, 0)); + } }; TEST_P(CoordinationTest, BuildTest) @@ -113,13 +127,14 @@ TEST_P(CoordinationTest, BufferSerde) template struct SimpliestRaftServer { - SimpliestRaftServer(int server_id_, const std::string & hostname_, int port_, const std::string & logs_path, const std::string & state_path) + SimpliestRaftServer( + int server_id_, const std::string & hostname_, int port_, DB::KeeperContextPtr keeper_context) : server_id(server_id_) , hostname(hostname_) , port(port_) , endpoint(hostname + ":" + std::to_string(port)) , state_machine(nuraft::cs_new()) - , state_manager(nuraft::cs_new(server_id, hostname, port, logs_path, state_path)) + , state_manager(nuraft::cs_new(server_id, hostname, port, keeper_context)) { state_manager->loadLogStore(1, 0); nuraft::raft_params params; @@ -135,8 +150,13 @@ struct SimpliestRaftServer nuraft::raft_server::init_options opts; opts.start_server_in_constructor_ = false; raft_instance = launcher.init( - state_machine, state_manager, nuraft::cs_new("ToyRaftLogger", DB::LogsLevel::trace), port, - nuraft::asio_service::options{}, params, opts); + state_machine, + state_manager, + nuraft::cs_new("ToyRaftLogger", DB::LogsLevel::trace), + port, + nuraft::asio_service::options{}, + params, + opts); if (!raft_instance) { @@ -195,15 +215,14 @@ nuraft::ptr getBuffer(int64_t number) return ret; } - TEST_P(CoordinationTest, TestSummingRaft1) { ChangelogDirTest test("./logs"); - SummingRaftServer s1(1, "localhost", 44444, "./logs", "./state"); - SCOPE_EXIT( - if (std::filesystem::exists("./state")) - std::filesystem::remove("./state"); - ); + setLogDirectory("./logs"); + setStateFileDirectory("."); + + SummingRaftServer s1(1, "localhost", 44444, keeper_context); + SCOPE_EXIT(if (std::filesystem::exists("./state")) std::filesystem::remove("./state");); /// Single node is leader EXPECT_EQ(s1.raft_instance->get_leader(), 1); @@ -235,8 +254,10 @@ TEST_P(CoordinationTest, ChangelogTestSimple) { auto params = GetParam(); ChangelogDirTest test("./logs"); + setLogDirectory("./logs"); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}); + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}, keeper_context); changelog.init(1, 0); auto entry = getLogEntry("hello world", 77); changelog.append(entry); @@ -263,7 +284,10 @@ TEST_P(CoordinationTest, ChangelogTestFile) { auto params = GetParam(); ChangelogDirTest test("./logs"); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}); + setLogDirectory("./logs"); + + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}, keeper_context); changelog.init(1, 0); auto entry = getLogEntry("hello world", 77); changelog.append(entry); @@ -292,7 +316,10 @@ TEST_P(CoordinationTest, ChangelogReadWrite) { auto params = GetParam(); ChangelogDirTest test("./logs"); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 1000}); + setLogDirectory("./logs"); + + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 1000}, keeper_context); changelog.init(1, 0); for (size_t i = 0; i < 10; ++i) @@ -306,7 +333,8 @@ TEST_P(CoordinationTest, ChangelogReadWrite) waitDurableLogs(changelog); - DB::KeeperLogStore changelog_reader("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 1000}); + DB::KeeperLogStore changelog_reader( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 1000}, keeper_context); changelog_reader.init(1, 0); EXPECT_EQ(changelog_reader.size(), 10); EXPECT_EQ(changelog_reader.last_entry()->get_term(), changelog.last_entry()->get_term()); @@ -326,7 +354,10 @@ TEST_P(CoordinationTest, ChangelogWriteAt) { auto params = GetParam(); ChangelogDirTest test("./logs"); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 1000}); + setLogDirectory("./logs"); + + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 1000}, keeper_context); changelog.init(1, 0); for (size_t i = 0; i < 10; ++i) { @@ -348,7 +379,8 @@ TEST_P(CoordinationTest, ChangelogWriteAt) EXPECT_EQ(changelog.entry_at(7)->get_term(), 77); EXPECT_EQ(changelog.next_slot(), 8); - DB::KeeperLogStore changelog_reader("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 1000}); + DB::KeeperLogStore changelog_reader( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 1000}, keeper_context); changelog_reader.init(1, 0); EXPECT_EQ(changelog_reader.size(), changelog.size()); @@ -362,7 +394,10 @@ TEST_P(CoordinationTest, ChangelogTestAppendAfterRead) { auto params = GetParam(); ChangelogDirTest test("./logs"); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}); + setLogDirectory("./logs"); + + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}, keeper_context); changelog.init(1, 0); for (size_t i = 0; i < 7; ++i) { @@ -378,7 +413,8 @@ TEST_P(CoordinationTest, ChangelogTestAppendAfterRead) EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin" + params.extension)); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin" + params.extension)); - DB::KeeperLogStore changelog_reader("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}); + DB::KeeperLogStore changelog_reader( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}, keeper_context); changelog_reader.init(1, 0); EXPECT_EQ(changelog_reader.size(), 7); @@ -395,7 +431,7 @@ TEST_P(CoordinationTest, ChangelogTestAppendAfterRead) EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin" + params.extension)); size_t logs_count = 0; - for (const auto & _ [[maybe_unused]]: fs::directory_iterator("./logs")) + for (const auto & _ [[maybe_unused]] : fs::directory_iterator("./logs")) logs_count++; EXPECT_EQ(logs_count, 2); @@ -412,7 +448,7 @@ TEST_P(CoordinationTest, ChangelogTestAppendAfterRead) EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin" + params.extension)); logs_count = 0; - for (const auto & _ [[maybe_unused]]: fs::directory_iterator("./logs")) + for (const auto & _ [[maybe_unused]] : fs::directory_iterator("./logs")) logs_count++; EXPECT_EQ(logs_count, 3); @@ -440,7 +476,10 @@ TEST_P(CoordinationTest, ChangelogTestCompaction) { auto params = GetParam(); ChangelogDirTest test("./logs"); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}); + setLogDirectory("./logs"); + + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}, keeper_context); changelog.init(1, 0); for (size_t i = 0; i < 3; ++i) @@ -489,7 +528,8 @@ TEST_P(CoordinationTest, ChangelogTestCompaction) EXPECT_EQ(changelog.next_slot(), 8); EXPECT_EQ(changelog.last_entry()->get_term(), 60); /// And we able to read it - DB::KeeperLogStore changelog_reader("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}); + DB::KeeperLogStore changelog_reader( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}, keeper_context); changelog_reader.init(7, 0); EXPECT_EQ(changelog_reader.size(), 1); @@ -502,7 +542,10 @@ TEST_P(CoordinationTest, ChangelogTestBatchOperations) { auto params = GetParam(); ChangelogDirTest test("./logs"); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}); + setLogDirectory("./logs"); + + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}, keeper_context); changelog.init(1, 0); for (size_t i = 0; i < 10; ++i) { @@ -517,7 +560,8 @@ TEST_P(CoordinationTest, ChangelogTestBatchOperations) auto entries = changelog.pack(1, 5); - DB::KeeperLogStore apply_changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}); + DB::KeeperLogStore apply_changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}, keeper_context); apply_changelog.init(1, 0); for (size_t i = 0; i < 10; ++i) @@ -549,23 +593,31 @@ TEST_P(CoordinationTest, ChangelogTestBatchOperationsEmpty) { auto params = GetParam(); ChangelogDirTest test("./logs"); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}); - changelog.init(1, 0); - for (size_t i = 0; i < 10; ++i) + setLogDirectory("./logs"); + + nuraft::ptr entries; { - auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); - changelog.append(entry); + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}, keeper_context); + changelog.init(1, 0); + for (size_t i = 0; i < 10; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.append(entry); + } + changelog.end_of_append_batch(0, 0); + + EXPECT_EQ(changelog.size(), 10); + + waitDurableLogs(changelog); + + entries = changelog.pack(5, 5); } - changelog.end_of_append_batch(0, 0); - - EXPECT_EQ(changelog.size(), 10); - - waitDurableLogs(changelog); - - auto entries = changelog.pack(5, 5); ChangelogDirTest test1("./logs1"); - DB::KeeperLogStore changelog_new("./logs1", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}); + setLogDirectory("./logs1"); + DB::KeeperLogStore changelog_new( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}, keeper_context); changelog_new.init(1, 0); EXPECT_EQ(changelog_new.size(), 0); @@ -587,7 +639,8 @@ TEST_P(CoordinationTest, ChangelogTestBatchOperationsEmpty) EXPECT_EQ(changelog_new.start_index(), 5); EXPECT_EQ(changelog_new.next_slot(), 11); - DB::KeeperLogStore changelog_reader("./logs1", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}); + DB::KeeperLogStore changelog_reader( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}, keeper_context); changelog_reader.init(5, 0); } @@ -596,7 +649,10 @@ TEST_P(CoordinationTest, ChangelogTestWriteAtPreviousFile) { auto params = GetParam(); ChangelogDirTest test("./logs"); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}); + setLogDirectory("./logs"); + + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}, keeper_context); changelog.init(1, 0); for (size_t i = 0; i < 33; ++i) @@ -637,7 +693,8 @@ TEST_P(CoordinationTest, ChangelogTestWriteAtPreviousFile) EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin" + params.extension)); EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin" + params.extension)); - DB::KeeperLogStore changelog_read("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}); + DB::KeeperLogStore changelog_read( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}, keeper_context); changelog_read.init(1, 0); EXPECT_EQ(changelog_read.size(), 7); EXPECT_EQ(changelog_read.start_index(), 1); @@ -649,7 +706,10 @@ TEST_P(CoordinationTest, ChangelogTestWriteAtFileBorder) { auto params = GetParam(); ChangelogDirTest test("./logs"); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}); + setLogDirectory("./logs"); + + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}, keeper_context); changelog.init(1, 0); for (size_t i = 0; i < 33; ++i) @@ -690,7 +750,8 @@ TEST_P(CoordinationTest, ChangelogTestWriteAtFileBorder) EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin" + params.extension)); EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin" + params.extension)); - DB::KeeperLogStore changelog_read("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}); + DB::KeeperLogStore changelog_read( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}, keeper_context); changelog_read.init(1, 0); EXPECT_EQ(changelog_read.size(), 11); EXPECT_EQ(changelog_read.start_index(), 1); @@ -702,7 +763,10 @@ TEST_P(CoordinationTest, ChangelogTestWriteAtAllFiles) { auto params = GetParam(); ChangelogDirTest test("./logs"); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}); + setLogDirectory("./logs"); + + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}, keeper_context); changelog.init(1, 0); for (size_t i = 0; i < 33; ++i) { @@ -747,7 +811,10 @@ TEST_P(CoordinationTest, ChangelogTestStartNewLogAfterRead) { auto params = GetParam(); ChangelogDirTest test("./logs"); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}); + setLogDirectory("./logs"); + + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}, keeper_context); changelog.init(1, 0); for (size_t i = 0; i < 35; ++i) @@ -768,7 +835,8 @@ TEST_P(CoordinationTest, ChangelogTestStartNewLogAfterRead) EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin" + params.extension)); EXPECT_FALSE(fs::exists("./logs/changelog_36_40.bin" + params.extension)); - DB::KeeperLogStore changelog_reader("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}); + DB::KeeperLogStore changelog_reader( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}, keeper_context); changelog_reader.init(1, 0); auto entry = getLogEntry("36_hello_world", 360); @@ -792,16 +860,16 @@ namespace { void assertBrokenLogRemoved(const fs::path & log_folder, const fs::path & filename) { - EXPECT_FALSE(fs::exists(log_folder / filename)); - // broken logs are sent to the detached/{timestamp} folder - // we don't know timestamp so we iterate all of them - for (const auto & dir_entry : fs::recursive_directory_iterator(log_folder / "detached")) - { - if (dir_entry.path().filename() == filename) - return; - } + EXPECT_FALSE(fs::exists(log_folder / filename)); + // broken logs are sent to the detached/{timestamp} folder + // we don't know timestamp so we iterate all of them + for (const auto & dir_entry : fs::recursive_directory_iterator(log_folder / "detached")) + { + if (dir_entry.path().filename() == filename) + return; + } - FAIL() << "Broken log " << filename << " was not moved to the detached folder"; + FAIL() << "Broken log " << filename << " was not moved to the detached folder"; } } @@ -812,8 +880,10 @@ TEST_P(CoordinationTest, ChangelogTestReadAfterBrokenTruncate) auto params = GetParam(); ChangelogDirTest test(log_folder); + setLogDirectory(log_folder); - DB::KeeperLogStore changelog(log_folder, DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}); + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}, keeper_context); changelog.init(1, 0); for (size_t i = 0; i < 35; ++i) @@ -833,10 +903,12 @@ TEST_P(CoordinationTest, ChangelogTestReadAfterBrokenTruncate) EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin" + params.extension)); EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin" + params.extension)); - DB::WriteBufferFromFile plain_buf("./logs/changelog_11_15.bin" + params.extension, DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY); + DB::WriteBufferFromFile plain_buf( + "./logs/changelog_11_15.bin" + params.extension, DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY); plain_buf.truncate(0); - DB::KeeperLogStore changelog_reader("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}); + DB::KeeperLogStore changelog_reader( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}, keeper_context); changelog_reader.init(1, 0); changelog_reader.end_of_append_batch(0, 0); @@ -869,7 +941,8 @@ TEST_P(CoordinationTest, ChangelogTestReadAfterBrokenTruncate) assertBrokenLogRemoved(log_folder, "changelog_26_30.bin" + params.extension); assertBrokenLogRemoved(log_folder, "changelog_31_35.bin" + params.extension); - DB::KeeperLogStore changelog_reader2("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}); + DB::KeeperLogStore changelog_reader2( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}, keeper_context); changelog_reader2.init(1, 0); EXPECT_EQ(changelog_reader2.size(), 11); EXPECT_EQ(changelog_reader2.last_entry()->get_term(), 7777); @@ -879,8 +952,10 @@ TEST_P(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2) { auto params = GetParam(); ChangelogDirTest test("./logs"); + setLogDirectory("./logs"); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 20}); + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 20}, keeper_context); changelog.init(1, 0); for (size_t i = 0; i < 35; ++i) @@ -894,10 +969,12 @@ TEST_P(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2) EXPECT_TRUE(fs::exists("./logs/changelog_1_20.bin" + params.extension)); EXPECT_TRUE(fs::exists("./logs/changelog_21_40.bin" + params.extension)); - DB::WriteBufferFromFile plain_buf("./logs/changelog_1_20.bin" + params.extension, DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY); + DB::WriteBufferFromFile plain_buf( + "./logs/changelog_1_20.bin" + params.extension, DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY); plain_buf.truncate(30); - DB::KeeperLogStore changelog_reader("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 20}); + DB::KeeperLogStore changelog_reader( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 20}, keeper_context); changelog_reader.init(1, 0); EXPECT_EQ(changelog_reader.size(), 0); @@ -912,7 +989,8 @@ TEST_P(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2) EXPECT_EQ(changelog_reader.size(), 1); EXPECT_EQ(changelog_reader.last_entry()->get_term(), 7777); - DB::KeeperLogStore changelog_reader2("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 1}); + DB::KeeperLogStore changelog_reader2( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 1}, keeper_context); changelog_reader2.init(1, 0); EXPECT_EQ(changelog_reader2.size(), 1); EXPECT_EQ(changelog_reader2.last_entry()->get_term(), 7777); @@ -922,8 +1000,10 @@ TEST_P(CoordinationTest, ChangelogTestLostFiles) { auto params = GetParam(); ChangelogDirTest test("./logs"); + setLogDirectory("./logs"); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 20}); + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 20}, keeper_context); changelog.init(1, 0); for (size_t i = 0; i < 35; ++i) @@ -939,7 +1019,8 @@ TEST_P(CoordinationTest, ChangelogTestLostFiles) fs::remove("./logs/changelog_1_20.bin" + params.extension); - DB::KeeperLogStore changelog_reader("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 20}); + DB::KeeperLogStore changelog_reader( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 20}, keeper_context); /// It should print error message, but still able to start changelog_reader.init(5, 0); assertBrokenLogRemoved("./logs", "changelog_21_40.bin" + params.extension); @@ -949,8 +1030,10 @@ TEST_P(CoordinationTest, ChangelogTestLostFiles2) { auto params = GetParam(); ChangelogDirTest test("./logs"); + setLogDirectory("./logs"); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 10}); + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 10}, keeper_context); changelog.init(1, 0); for (size_t i = 0; i < 35; ++i) @@ -970,7 +1053,8 @@ TEST_P(CoordinationTest, ChangelogTestLostFiles2) // we have a gap in our logs, we need to remove all the logs after the gap fs::remove("./logs/changelog_21_30.bin" + params.extension); - DB::KeeperLogStore changelog_reader("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 10}); + DB::KeeperLogStore changelog_reader( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 10}, keeper_context); /// It should print error message, but still able to start changelog_reader.init(5, 0); EXPECT_TRUE(fs::exists("./logs/changelog_1_10.bin" + params.extension)); @@ -1155,7 +1239,7 @@ TEST_P(CoordinationTest, SnapshotableHashMapDataSize) EXPECT_EQ(world.getApproximateDataSize(), 0); } -void addNode(DB::KeeperStorage & storage, const std::string & path, const std::string & data, int64_t ephemeral_owner=0) +void addNode(DB::KeeperStorage & storage, const std::string & path, const std::string & data, int64_t ephemeral_owner = 0) { using Node = DB::KeeperStorage::Node; Node node{}; @@ -1163,19 +1247,23 @@ void addNode(DB::KeeperStorage & storage, const std::string & path, const std::s node.stat.ephemeralOwner = ephemeral_owner; storage.container.insertOrReplace(path, node); auto child_it = storage.container.find(path); - auto child_path = DB::getBaseName(child_it->key); - storage.container.updateValue(DB::parentPath(StringRef{path}), [&](auto & parent) - { - parent.addChild(child_path); - parent.stat.numChildren++; - }); + auto child_path = DB::getBaseNodeName(child_it->key); + storage.container.updateValue( + DB::parentNodePath(StringRef{path}), + [&](auto & parent) + { + parent.addChild(child_path); + parent.stat.numChildren++; + }); } TEST_P(CoordinationTest, TestStorageSnapshotSimple) { auto params = GetParam(); ChangelogDirTest test("./snapshots"); - DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression); + setSnapshotDirectory("./snapshots"); + + DB::KeeperSnapshotManager manager(3, keeper_context, params.enable_compression); DB::KeeperStorage storage(500, "", keeper_context); addNode(storage, "/hello", "world", 1); @@ -1191,7 +1279,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotSimple) EXPECT_EQ(snapshot.snapshot_meta->get_last_log_idx(), 2); EXPECT_EQ(snapshot.session_id, 7); - EXPECT_EQ(snapshot.snapshot_container_size, 5); + EXPECT_EQ(snapshot.snapshot_container_size, 6); EXPECT_EQ(snapshot.session_and_timeout.size(), 2); auto buf = manager.serializeSnapshotToBuffer(snapshot); @@ -1203,7 +1291,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotSimple) auto [restored_storage, snapshot_meta, _] = manager.deserializeSnapshotFromBuffer(debuf); - EXPECT_EQ(restored_storage->container.size(), 5); + EXPECT_EQ(restored_storage->container.size(), 6); EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 2); EXPECT_EQ(restored_storage->container.getValue("/hello").getChildren().size(), 1); EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").getChildren().size(), 0); @@ -1223,7 +1311,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotMoreWrites) { auto params = GetParam(); ChangelogDirTest test("./snapshots"); - DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression); + setSnapshotDirectory("./snapshots"); + + DB::KeeperSnapshotManager manager(3, keeper_context, params.enable_compression); DB::KeeperStorage storage(500, "", keeper_context); storage.getSessionID(130); @@ -1235,14 +1325,14 @@ TEST_P(CoordinationTest, TestStorageSnapshotMoreWrites) DB::KeeperStorageSnapshot snapshot(&storage, 50); EXPECT_EQ(snapshot.snapshot_meta->get_last_log_idx(), 50); - EXPECT_EQ(snapshot.snapshot_container_size, 53); + EXPECT_EQ(snapshot.snapshot_container_size, 54); for (size_t i = 50; i < 100; ++i) { addNode(storage, "/hello_" + std::to_string(i), "world_" + std::to_string(i)); } - EXPECT_EQ(storage.container.size(), 103); + EXPECT_EQ(storage.container.size(), 104); auto buf = manager.serializeSnapshotToBuffer(snapshot); manager.serializeSnapshotBufferToDisk(*buf, 50); @@ -1252,7 +1342,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotMoreWrites) auto debuf = manager.deserializeSnapshotBufferFromDisk(50); auto [restored_storage, meta, _] = manager.deserializeSnapshotFromBuffer(debuf); - EXPECT_EQ(restored_storage->container.size(), 53); + EXPECT_EQ(restored_storage->container.size(), 54); for (size_t i = 0; i < 50; ++i) { EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).getData(), "world_" + std::to_string(i)); @@ -1264,7 +1354,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotManySnapshots) { auto params = GetParam(); ChangelogDirTest test("./snapshots"); - DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression); + setSnapshotDirectory("./snapshots"); + + DB::KeeperSnapshotManager manager(3, keeper_context, params.enable_compression); DB::KeeperStorage storage(500, "", keeper_context); storage.getSessionID(130); @@ -1291,7 +1383,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotManySnapshots) auto [restored_storage, meta, _] = manager.restoreFromLatestSnapshot(); - EXPECT_EQ(restored_storage->container.size(), 253); + EXPECT_EQ(restored_storage->container.size(), 254); for (size_t i = 0; i < 250; ++i) { @@ -1303,7 +1395,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotMode) { auto params = GetParam(); ChangelogDirTest test("./snapshots"); - DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression); + setSnapshotDirectory("./snapshots"); + + DB::KeeperSnapshotManager manager(3, keeper_context, params.enable_compression); DB::KeeperStorage storage(500, "", keeper_context); for (size_t i = 0; i < 50; ++i) { @@ -1325,16 +1419,16 @@ TEST_P(CoordinationTest, TestStorageSnapshotMode) if (i % 2 == 0) storage.container.erase("/hello_" + std::to_string(i)); } - EXPECT_EQ(storage.container.size(), 28); - EXPECT_EQ(storage.container.snapshotSizeWithVersion().first, 104); + EXPECT_EQ(storage.container.size(), 29); + EXPECT_EQ(storage.container.snapshotSizeWithVersion().first, 105); EXPECT_EQ(storage.container.snapshotSizeWithVersion().second, 1); auto buf = manager.serializeSnapshotToBuffer(snapshot); manager.serializeSnapshotBufferToDisk(*buf, 50); } EXPECT_TRUE(fs::exists("./snapshots/snapshot_50.bin" + params.extension)); - EXPECT_EQ(storage.container.size(), 28); + EXPECT_EQ(storage.container.size(), 29); storage.clearGarbageAfterSnapshot(); - EXPECT_EQ(storage.container.snapshotSizeWithVersion().first, 28); + EXPECT_EQ(storage.container.snapshotSizeWithVersion().first, 29); for (size_t i = 0; i < 50; ++i) { if (i % 2 != 0) @@ -1349,14 +1443,15 @@ TEST_P(CoordinationTest, TestStorageSnapshotMode) { EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).getData(), "world_" + std::to_string(i)); } - } TEST_P(CoordinationTest, TestStorageSnapshotBroken) { auto params = GetParam(); ChangelogDirTest test("./snapshots"); - DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression); + setSnapshotDirectory("./snapshots"); + + DB::KeeperSnapshotManager manager(3, keeper_context, params.enable_compression); DB::KeeperStorage storage(500, "", keeper_context); for (size_t i = 0; i < 50; ++i) { @@ -1370,7 +1465,8 @@ TEST_P(CoordinationTest, TestStorageSnapshotBroken) EXPECT_TRUE(fs::exists("./snapshots/snapshot_50.bin" + params.extension)); /// Let's corrupt file - DB::WriteBufferFromFile plain_buf("./snapshots/snapshot_50.bin" + params.extension, DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY); + DB::WriteBufferFromFile plain_buf( + "./snapshots/snapshot_50.bin" + params.extension, DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY); plain_buf.truncate(34); plain_buf.sync(); @@ -1390,25 +1486,35 @@ nuraft::ptr getBufferFromZKRequest(int64_t session_id, int64_t z return buf.getBuffer(); } -nuraft::ptr getLogEntryFromZKRequest(size_t term, int64_t session_id, int64_t zxid, const Coordination::ZooKeeperRequestPtr & request) +nuraft::ptr +getLogEntryFromZKRequest(size_t term, int64_t session_id, int64_t zxid, const Coordination::ZooKeeperRequestPtr & request) { auto buffer = getBufferFromZKRequest(session_id, zxid, request); return nuraft::cs_new(term, buffer); } -void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint64_t total_logs, bool enable_compression, Coordination::KeeperContextPtr keeper_context) +void testLogAndStateMachine( + Coordination::CoordinationSettingsPtr settings, + uint64_t total_logs, + bool enable_compression, + Coordination::KeeperContextPtr keeper_context) { using namespace Coordination; using namespace DB; ChangelogDirTest snapshots("./snapshots"); + keeper_context->setSnapshotDisk(std::make_shared("SnapshotDisk", "./snapshots", 0)); ChangelogDirTest logs("./logs"); + keeper_context->setLogDisk(std::make_shared("LogDisk", "./logs", 0)); ResponsesQueue queue(std::numeric_limits::max()); SnapshotsQueue snapshots_queue{1}; - auto state_machine = std::make_shared(queue, snapshots_queue, "./snapshots", settings, keeper_context, nullptr); + auto state_machine = std::make_shared(queue, snapshots_queue, settings, keeper_context, nullptr); state_machine->init(); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = enable_compression, .rotate_interval = settings->rotate_log_storage_interval}); + DB::KeeperLogStore changelog( + DB::LogFileSettings{ + .force_sync = true, .compress_logs = enable_compression, .rotate_interval = settings->rotate_log_storage_interval}, + keeper_context); changelog.init(state_machine->last_commit_index() + 1, settings->reserved_log_items); for (size_t i = 1; i < total_logs + 1; ++i) { @@ -1426,7 +1532,8 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint if (i % settings->snapshot_distance == 0) { nuraft::snapshot s(i, 0, std::make_shared()); - nuraft::async_result::handler_type when_done = [&snapshot_created] (bool & ret, nuraft::ptr &/*exception*/) + nuraft::async_result::handler_type when_done + = [&snapshot_created](bool & ret, nuraft::ptr & /*exception*/) { snapshot_created = ret; LOG_INFO(&Poco::Logger::get("CoordinationTest"), "Snapshot finished"); @@ -1444,17 +1551,21 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint } SnapshotsQueue snapshots_queue1{1}; - auto restore_machine = std::make_shared(queue, snapshots_queue1, "./snapshots", settings, keeper_context, nullptr); + auto restore_machine = std::make_shared(queue, snapshots_queue1, settings, keeper_context, nullptr); restore_machine->init(); EXPECT_EQ(restore_machine->last_commit_index(), total_logs - total_logs % settings->snapshot_distance); - DB::KeeperLogStore restore_changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = enable_compression, .rotate_interval = settings->rotate_log_storage_interval}); + DB::KeeperLogStore restore_changelog( + DB::LogFileSettings{ + .force_sync = true, .compress_logs = enable_compression, .rotate_interval = settings->rotate_log_storage_interval}, + keeper_context); restore_changelog.init(restore_machine->last_commit_index() + 1, settings->reserved_log_items); EXPECT_EQ(restore_changelog.size(), std::min(settings->reserved_log_items + total_logs % settings->snapshot_distance, total_logs)); EXPECT_EQ(restore_changelog.next_slot(), total_logs + 1); if (total_logs > settings->reserved_log_items + 1) - EXPECT_EQ(restore_changelog.start_index(), total_logs - total_logs % settings->snapshot_distance - settings->reserved_log_items + 1); + EXPECT_EQ( + restore_changelog.start_index(), total_logs - total_logs % settings->snapshot_distance - settings->reserved_log_items + 1); else EXPECT_EQ(restore_changelog.start_index(), 1); @@ -1552,11 +1663,13 @@ TEST_P(CoordinationTest, TestEphemeralNodeRemove) using namespace DB; ChangelogDirTest snapshots("./snapshots"); + setSnapshotDirectory("./snapshots"); + CoordinationSettingsPtr settings = std::make_shared(); ResponsesQueue queue(std::numeric_limits::max()); SnapshotsQueue snapshots_queue{1}; - auto state_machine = std::make_shared(queue, snapshots_queue, "./snapshots", settings, keeper_context, nullptr); + auto state_machine = std::make_shared(queue, snapshots_queue, settings, keeper_context, nullptr); state_machine->init(); std::shared_ptr request_c = std::make_shared(); @@ -1585,11 +1698,12 @@ TEST_P(CoordinationTest, TestCreateNodeWithAuthSchemeForAclWhenAuthIsPrecommitte using namespace DB; ChangelogDirTest snapshots("./snapshots"); + setSnapshotDirectory("./snapshots"); CoordinationSettingsPtr settings = std::make_shared(); ResponsesQueue queue(std::numeric_limits::max()); SnapshotsQueue snapshots_queue{1}; - auto state_machine = std::make_shared(queue, snapshots_queue, "./snapshots", settings, keeper_context, nullptr); + auto state_machine = std::make_shared(queue, snapshots_queue, settings, keeper_context, nullptr); state_machine->init(); String user_auth_data = "test_user:test_password"; @@ -1635,11 +1749,13 @@ TEST_P(CoordinationTest, TestSetACLWithAuthSchemeForAclWhenAuthIsPrecommitted) using namespace DB; ChangelogDirTest snapshots("./snapshots"); + setSnapshotDirectory("./snapshots"); + CoordinationSettingsPtr settings = std::make_shared(); ResponsesQueue queue(std::numeric_limits::max()); SnapshotsQueue snapshots_queue{1}; - auto state_machine = std::make_shared(queue, snapshots_queue, "./snapshots", settings, keeper_context, nullptr); + auto state_machine = std::make_shared(queue, snapshots_queue, settings, keeper_context, nullptr); state_machine->init(); String user_auth_data = "test_user:test_password"; @@ -1691,8 +1807,10 @@ TEST_P(CoordinationTest, TestRotateIntervalChanges) using namespace Coordination; auto params = GetParam(); ChangelogDirTest snapshots("./logs"); + setLogDirectory("./logs"); { - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}); + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}, keeper_context); changelog.init(0, 3); for (size_t i = 1; i < 55; ++i) @@ -1710,7 +1828,8 @@ TEST_P(CoordinationTest, TestRotateIntervalChanges) EXPECT_TRUE(fs::exists("./logs/changelog_1_100.bin" + params.extension)); - DB::KeeperLogStore changelog_1("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 10}); + DB::KeeperLogStore changelog_1( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 10}, keeper_context); changelog_1.init(0, 50); for (size_t i = 0; i < 55; ++i) { @@ -1726,7 +1845,8 @@ TEST_P(CoordinationTest, TestRotateIntervalChanges) EXPECT_TRUE(fs::exists("./logs/changelog_1_100.bin" + params.extension)); EXPECT_TRUE(fs::exists("./logs/changelog_101_110.bin" + params.extension)); - DB::KeeperLogStore changelog_2("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 7}); + DB::KeeperLogStore changelog_2( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 7}, keeper_context); changelog_2.init(98, 55); for (size_t i = 0; i < 17; ++i) @@ -1749,7 +1869,8 @@ TEST_P(CoordinationTest, TestRotateIntervalChanges) EXPECT_TRUE(fs::exists("./logs/changelog_118_124.bin" + params.extension)); EXPECT_TRUE(fs::exists("./logs/changelog_125_131.bin" + params.extension)); - DB::KeeperLogStore changelog_3("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}); + DB::KeeperLogStore changelog_3( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 5}, keeper_context); changelog_3.init(116, 3); for (size_t i = 0; i < 17; ++i) { @@ -1796,8 +1917,10 @@ TEST_P(CoordinationTest, TestCompressedLogsMultipleRewrite) { using namespace Coordination; auto test_params = GetParam(); - ChangelogDirTest snapshots("./logs"); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = test_params.enable_compression, .rotate_interval = 100}); + ChangelogDirTest logs("./logs"); + setLogDirectory("./logs"); + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = test_params.enable_compression, .rotate_interval = 100}, keeper_context); changelog.init(0, 3); for (size_t i = 1; i < 55; ++i) @@ -1811,7 +1934,8 @@ TEST_P(CoordinationTest, TestCompressedLogsMultipleRewrite) waitDurableLogs(changelog); - DB::KeeperLogStore changelog1("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = test_params.enable_compression, .rotate_interval = 100}); + DB::KeeperLogStore changelog1( + DB::LogFileSettings{.force_sync = true, .compress_logs = test_params.enable_compression, .rotate_interval = 100}, keeper_context); changelog1.init(0, 3); for (size_t i = 55; i < 70; ++i) { @@ -1822,7 +1946,10 @@ TEST_P(CoordinationTest, TestCompressedLogsMultipleRewrite) changelog1.end_of_append_batch(0, 0); } - DB::KeeperLogStore changelog2("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = test_params.enable_compression, .rotate_interval = 100}); + waitDurableLogs(changelog1); + + DB::KeeperLogStore changelog2( + DB::LogFileSettings{.force_sync = true, .compress_logs = test_params.enable_compression, .rotate_interval = 100}, keeper_context); changelog2.init(0, 3); for (size_t i = 70; i < 80; ++i) { @@ -1839,7 +1966,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotDifferentCompressions) auto params = GetParam(); ChangelogDirTest test("./snapshots"); - DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression); + setSnapshotDirectory("./snapshots"); + + DB::KeeperSnapshotManager manager(3, keeper_context, params.enable_compression); DB::KeeperStorage storage(500, "", keeper_context); addNode(storage, "/hello", "world", 1); @@ -1857,13 +1986,13 @@ TEST_P(CoordinationTest, TestStorageSnapshotDifferentCompressions) manager.serializeSnapshotBufferToDisk(*buf, 2); EXPECT_TRUE(fs::exists("./snapshots/snapshot_2.bin" + params.extension)); - DB::KeeperSnapshotManager new_manager("./snapshots", 3, keeper_context, !params.enable_compression); + DB::KeeperSnapshotManager new_manager(3, keeper_context, !params.enable_compression); auto debuf = new_manager.deserializeSnapshotBufferFromDisk(2); auto [restored_storage, snapshot_meta, _] = new_manager.deserializeSnapshotFromBuffer(debuf); - EXPECT_EQ(restored_storage->container.size(), 5); + EXPECT_EQ(restored_storage->container.size(), 6); EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 2); EXPECT_EQ(restored_storage->container.getValue("/hello").getChildren().size(), 1); EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").getChildren().size(), 0); @@ -1883,9 +2012,10 @@ TEST_P(CoordinationTest, ChangelogInsertThreeTimesSmooth) { auto params = GetParam(); ChangelogDirTest test("./logs"); + setLogDirectory("./logs"); { LOG_INFO(log, "================First time====================="); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}); + DB::KeeperLogStore changelog(DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}, keeper_context); changelog.init(1, 0); auto entry = getLogEntry("hello_world", 1000); changelog.append(entry); @@ -1896,7 +2026,8 @@ TEST_P(CoordinationTest, ChangelogInsertThreeTimesSmooth) { LOG_INFO(log, "================Second time====================="); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}); + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}, keeper_context); changelog.init(1, 0); auto entry = getLogEntry("hello_world", 1000); changelog.append(entry); @@ -1907,7 +2038,8 @@ TEST_P(CoordinationTest, ChangelogInsertThreeTimesSmooth) { LOG_INFO(log, "================Third time====================="); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}); + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}, keeper_context); changelog.init(1, 0); auto entry = getLogEntry("hello_world", 1000); changelog.append(entry); @@ -1918,7 +2050,8 @@ TEST_P(CoordinationTest, ChangelogInsertThreeTimesSmooth) { LOG_INFO(log, "================Fourth time====================="); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}); + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}, keeper_context); changelog.init(1, 0); auto entry = getLogEntry("hello_world", 1000); changelog.append(entry); @@ -1933,10 +2066,12 @@ TEST_P(CoordinationTest, ChangelogInsertMultipleTimesSmooth) { auto params = GetParam(); ChangelogDirTest test("./logs"); + setLogDirectory("./logs"); for (size_t i = 0; i < 36; ++i) { LOG_INFO(log, "================First time====================="); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}); + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}, keeper_context); changelog.init(1, 0); for (size_t j = 0; j < 7; ++j) { @@ -1947,7 +2082,8 @@ TEST_P(CoordinationTest, ChangelogInsertMultipleTimesSmooth) waitDurableLogs(changelog); } - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}); + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}, keeper_context); changelog.init(1, 0); EXPECT_EQ(changelog.next_slot(), 36 * 7 + 1); } @@ -1956,9 +2092,11 @@ TEST_P(CoordinationTest, ChangelogInsertThreeTimesHard) { auto params = GetParam(); ChangelogDirTest test("./logs"); + setLogDirectory("./logs"); { LOG_INFO(log, "================First time====================="); - DB::KeeperLogStore changelog1("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}); + DB::KeeperLogStore changelog1( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}, keeper_context); changelog1.init(1, 0); auto entry = getLogEntry("hello_world", 1000); changelog1.append(entry); @@ -1969,7 +2107,8 @@ TEST_P(CoordinationTest, ChangelogInsertThreeTimesHard) { LOG_INFO(log, "================Second time====================="); - DB::KeeperLogStore changelog2("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}); + DB::KeeperLogStore changelog2( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}, keeper_context); changelog2.init(1, 0); auto entry = getLogEntry("hello_world", 1000); changelog2.append(entry); @@ -1980,7 +2119,8 @@ TEST_P(CoordinationTest, ChangelogInsertThreeTimesHard) { LOG_INFO(log, "================Third time====================="); - DB::KeeperLogStore changelog3("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}); + DB::KeeperLogStore changelog3( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}, keeper_context); changelog3.init(1, 0); auto entry = getLogEntry("hello_world", 1000); changelog3.append(entry); @@ -1991,7 +2131,8 @@ TEST_P(CoordinationTest, ChangelogInsertThreeTimesHard) { LOG_INFO(log, "================Fourth time====================="); - DB::KeeperLogStore changelog4("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}); + DB::KeeperLogStore changelog4( + DB::LogFileSettings{.force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100}, keeper_context); changelog4.init(1, 0); auto entry = getLogEntry("hello_world", 1000); changelog4.append(entry); @@ -2005,10 +2146,12 @@ TEST_P(CoordinationTest, TestStorageSnapshotEqual) { auto params = GetParam(); ChangelogDirTest test("./snapshots"); + setSnapshotDirectory("./snapshots"); + std::optional snapshot_hash; for (size_t i = 0; i < 15; ++i) { - DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression); + DB::KeeperSnapshotManager manager(3, keeper_context, params.enable_compression); DB::KeeperStorage storage(500, "", keeper_context); addNode(storage, "/hello", ""); @@ -2048,7 +2191,10 @@ TEST_P(CoordinationTest, TestLogGap) using namespace Coordination; auto test_params = GetParam(); ChangelogDirTest logs("./logs"); - DB::KeeperLogStore changelog("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = test_params.enable_compression, .rotate_interval = 100}); + setLogDirectory("./logs"); + + DB::KeeperLogStore changelog( + DB::LogFileSettings{.force_sync = true, .compress_logs = test_params.enable_compression, .rotate_interval = 100}, keeper_context); changelog.init(0, 3); for (size_t i = 1; i < 55; ++i) @@ -2060,7 +2206,8 @@ TEST_P(CoordinationTest, TestLogGap) changelog.end_of_append_batch(0, 0); } - DB::KeeperLogStore changelog1("./logs", DB::LogFileSettings{.force_sync = true, .compress_logs = test_params.enable_compression, .rotate_interval = 100}); + DB::KeeperLogStore changelog1( + DB::LogFileSettings{.force_sync = true, .compress_logs = test_params.enable_compression, .rotate_interval = 100}, keeper_context); changelog1.init(61, 3); /// Logs discarded @@ -2220,13 +2367,13 @@ TEST_P(CoordinationTest, TestListRequestTypes) return create_response.path_created; }; - create_path(parentPath(StringRef{test_path}).toString(), false, false); + create_path(parentNodePath(StringRef{test_path}).toString(), false, false); static constexpr size_t persistent_num = 5; std::unordered_set expected_persistent_children; for (size_t i = 0; i < persistent_num; ++i) { - expected_persistent_children.insert(getBaseName(create_path(test_path, false)).toString()); + expected_persistent_children.insert(getBaseNodeName(create_path(test_path, false)).toString()); } ASSERT_EQ(expected_persistent_children.size(), persistent_num); @@ -2234,7 +2381,7 @@ TEST_P(CoordinationTest, TestListRequestTypes) std::unordered_set expected_ephemeral_children; for (size_t i = 0; i < ephemeral_num; ++i) { - expected_ephemeral_children.insert(getBaseName(create_path(test_path, true)).toString()); + expected_ephemeral_children.insert(getBaseNodeName(create_path(test_path, true)).toString()); } ASSERT_EQ(expected_ephemeral_children.size(), ephemeral_num); @@ -2242,7 +2389,7 @@ TEST_P(CoordinationTest, TestListRequestTypes) { const auto list_request = std::make_shared(); int new_zxid = ++zxid; - list_request->path = parentPath(StringRef{test_path}).toString(); + list_request->path = parentNodePath(StringRef{test_path}).toString(); list_request->list_request_type = list_request_type; storage.preprocessRequest(list_request, 1, 0, new_zxid); auto responses = storage.processRequest(list_request, 1, new_zxid); @@ -2270,20 +2417,23 @@ TEST_P(CoordinationTest, TestListRequestTypes) EXPECT_EQ(all_children.size(), ephemeral_num + persistent_num); for (const auto & child : all_children) { - EXPECT_TRUE(expected_ephemeral_children.contains(child) || expected_persistent_children.contains(child)) << "Missing child " << child; + EXPECT_TRUE(expected_ephemeral_children.contains(child) || expected_persistent_children.contains(child)) + << "Missing child " << child; } } TEST_P(CoordinationTest, TestDurableState) { ChangelogDirTest logs("./logs"); + setLogDirectory("./logs"); + setStateFileDirectory("."); auto state = nuraft::cs_new(); std::optional state_manager; const auto reload_state_manager = [&] { - state_manager.emplace(1, "localhost", 9181, "./logs", "./state"); + state_manager.emplace(1, "localhost", 9181, keeper_context); state_manager->loadLogStore(1, 0); }; @@ -2318,11 +2468,11 @@ TEST_P(CoordinationTest, TestDurableState) write_buf.sync(); write_buf.close(); reload_state_manager(); -#ifdef NDEBUG +# ifdef NDEBUG ASSERT_EQ(state_manager->read_state(), nullptr); -#else +# else ASSERT_THROW(state_manager->read_state(), DB::Exception); -#endif +# endif } { @@ -2346,18 +2496,19 @@ TEST_P(CoordinationTest, TestDurableState) } } -TEST_P(CoordinationTest, TestCurrentApiVersion) +TEST_P(CoordinationTest, TestFeatureFlags) { using namespace Coordination; KeeperStorage storage{500, "", keeper_context}; auto request = std::make_shared(); - request->path = DB::keeper_api_version_path; + request->path = DB::keeper_api_feature_flags_path; auto responses = storage.processRequest(request, 0, std::nullopt, true, true); const auto & get_response = getSingleResponse(responses); - uint8_t keeper_version{0}; - DB::ReadBufferFromOwnString buf(get_response.data); - DB::readIntText(keeper_version, buf); - EXPECT_EQ(keeper_version, static_cast(current_keeper_api_version)); + DB::KeeperFeatureFlags feature_flags; + feature_flags.setFeatureFlags(get_response.data); + ASSERT_TRUE(feature_flags.isEnabled(KeeperFeatureFlag::FILTERED_LIST)); + ASSERT_TRUE(feature_flags.isEnabled(KeeperFeatureFlag::MULTI_READ)); + ASSERT_FALSE(feature_flags.isEnabled(KeeperFeatureFlag::CHECK_NOT_EXISTS)); } TEST_P(CoordinationTest, TestSystemNodeModify) @@ -2366,7 +2517,7 @@ TEST_P(CoordinationTest, TestSystemNodeModify) int64_t zxid{0}; // On INIT we abort when a system path is modified - keeper_context->server_state = KeeperContext::Phase::RUNNING; + keeper_context->setServerState(KeeperContext::Phase::RUNNING); KeeperStorage storage{500, "", keeper_context}; const auto assert_create = [&](const std::string_view path, const auto expected_code) { @@ -2396,15 +2547,16 @@ TEST_P(CoordinationTest, ChangelogTestMaxLogSize) { auto params = GetParam(); ChangelogDirTest test("./logs"); + setLogDirectory("./logs"); uint64_t last_entry_index{0}; size_t i{0}; { SCOPED_TRACE("Small rotation interval, big size limit"); DB::KeeperLogStore changelog( - "./logs", DB::LogFileSettings{ - .force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 20, .max_size = 50 * 1024 * 1024}); + .force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 20, .max_size = 50 * 1024 * 1024}, + keeper_context); changelog.init(1, 0); for (; i < 100; ++i) @@ -2421,9 +2573,9 @@ TEST_P(CoordinationTest, ChangelogTestMaxLogSize) { SCOPED_TRACE("Large rotation interval, small size limit"); DB::KeeperLogStore changelog( - "./logs", DB::LogFileSettings{ - .force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100'000, .max_size = 4000}); + .force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100'000, .max_size = 4000}, + keeper_context); changelog.init(1, 0); ASSERT_EQ(changelog.entry_at(last_entry_index)->get_term(), (i - 1 + 44) * 10); @@ -2442,13 +2594,12 @@ TEST_P(CoordinationTest, ChangelogTestMaxLogSize) { SCOPED_TRACE("Final verify all logs"); DB::KeeperLogStore changelog( - "./logs", DB::LogFileSettings{ - .force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100'000, .max_size = 4000}); + .force_sync = true, .compress_logs = params.enable_compression, .rotate_interval = 100'000, .max_size = 4000}, + keeper_context); changelog.init(1, 0); ASSERT_EQ(changelog.entry_at(last_entry_index)->get_term(), (i - 1 + 44) * 10); } - } TEST_P(CoordinationTest, TestCheckNotExistsRequest) @@ -2603,11 +2754,7 @@ TEST_P(CoordinationTest, TestReapplyingDeltas) INSTANTIATE_TEST_SUITE_P(CoordinationTestSuite, CoordinationTest, - ::testing::ValuesIn(std::initializer_list{ - CompressionParam{true, ".zstd"}, - CompressionParam{false, ""} - }) -); + ::testing::ValuesIn(std::initializer_list{CompressionParam{true, ".zstd"}, CompressionParam{false, ""}})); int main(int argc, char ** argv) { diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 658f3c8025b..b7d12a518c8 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -467,6 +467,7 @@ class IColumn; M(UInt64, max_fetch_partition_retries_count, 5, "Amount of retries while fetching partition from another host.", 0) \ M(UInt64, http_max_multipart_form_data_size, 1024 * 1024 * 1024, "Limit on size of multipart/form-data content. This setting cannot be parsed from URL parameters and should be set in user profile. Note that content is parsed and external tables are created in memory before start of query execution. And this is the only limit that has effect on that stage (limits on max memory usage and max execution time have no effect while reading HTTP form data).", 0) \ M(Bool, calculate_text_stack_trace, true, "Calculate text stack trace in case of exceptions during query execution. This is the default. It requires symbol lookups that may slow down fuzzing tests when huge amount of wrong queries are executed. In normal cases you should not disable this option.", 0) \ + M(Bool, enable_job_stack_trace, false, "Output stack trace of a job creator when job results in exception", 0) \ M(Bool, allow_ddl, true, "If it is set to true, then a user is allowed to executed DDL queries.", 0) \ M(Bool, parallel_view_processing, false, "Enables pushing to attached views concurrently instead of sequentially.", 0) \ M(Bool, enable_unaligned_array_join, false, "Allow ARRAY JOIN with multiple arrays that have different sizes. When this settings is enabled, arrays will be resized to the longest one.", 0) \ diff --git a/src/Core/SettingsFields.cpp b/src/Core/SettingsFields.cpp index a0f643f8d30..9b8677a9888 100644 --- a/src/Core/SettingsFields.cpp +++ b/src/Core/SettingsFields.cpp @@ -378,6 +378,40 @@ void SettingFieldMap::readBinary(ReadBuffer & in) *this = map; } +#else + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +SettingFieldMap::SettingFieldMap(const Field &) : value(Map()) {} +String SettingFieldMap::toString() const +{ + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Setting of type Map not supported"); +} + + +SettingFieldMap & SettingFieldMap::operator =(const Field &) +{ + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Setting of type Map not supported"); +} + +void SettingFieldMap::parseFromString(const String &) +{ + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Setting of type Map not supported"); +} + +void SettingFieldMap::writeBinary(WriteBuffer &) const +{ + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Setting of type Map not supported"); +} + +void SettingFieldMap::readBinary(ReadBuffer &) +{ + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Setting of type Map not supported"); +} + #endif namespace diff --git a/src/Core/SettingsFields.h b/src/Core/SettingsFields.h index 786492a7256..32fffd3af06 100644 --- a/src/Core/SettingsFields.h +++ b/src/Core/SettingsFields.h @@ -245,6 +245,12 @@ struct SettingFieldString void readBinary(ReadBuffer & in); }; +#ifdef CLICKHOUSE_PROGRAM_STANDALONE_BUILD +#define NORETURN [[noreturn]] +#else +#define NORETURN +#endif + struct SettingFieldMap { public: @@ -261,13 +267,15 @@ public: operator const Map &() const { return value; } /// NOLINT explicit operator Field() const { return value; } - String toString() const; - void parseFromString(const String & str); + NORETURN String toString() const; + NORETURN void parseFromString(const String & str); - void writeBinary(WriteBuffer & out) const; - void readBinary(ReadBuffer & in); + NORETURN void writeBinary(WriteBuffer & out) const; + NORETURN void readBinary(ReadBuffer & in); }; +#undef NORETURN + struct SettingFieldChar { public: diff --git a/src/Core/SettingsQuirks.cpp b/src/Core/SettingsQuirks.cpp index 3326f42adf5..37a0f2db3e2 100644 --- a/src/Core/SettingsQuirks.cpp +++ b/src/Core/SettingsQuirks.cpp @@ -1,10 +1,11 @@ +#include #include #include #include #include #include #include -#include + namespace { @@ -71,6 +72,12 @@ void applySettingsQuirks(Settings & settings, Poco::Logger * log) } } +#if defined(THREAD_SANITIZER) + settings.use_hedged_requests.value = false; + if (log) + LOG_WARNING(log, "use_hedged_requests has been disabled for the build with Thread Sanitizer, because they are using fibers, leading to a failed assertion inside TSan"); +#endif + if (!queryProfilerWorks()) { if (settings.query_profiler_real_time_period_ns) diff --git a/src/Core/Types_fwd.h b/src/Core/Types_fwd.h index 3db1127fe5a..a59e4b6eab8 100644 --- a/src/Core/Types_fwd.h +++ b/src/Core/Types_fwd.h @@ -27,7 +27,7 @@ namespace DB using UUID = StrongTypedef; -using IPv4 = StrongTypedef; +struct IPv4; struct IPv6; diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index 634b7b71f6d..6d29523a354 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -153,6 +154,7 @@ static void signalHandler(int sig, siginfo_t * info, void * context) writePODBinary(*info, out); writePODBinary(signal_context, out); writePODBinary(stack_trace, out); + writeVectorBinary(Exception::thread_frame_pointers, out); writeBinary(static_cast(getThreadId()), out); writePODBinary(current_thread, out); @@ -250,6 +252,7 @@ public: siginfo_t info{}; ucontext_t * context{}; StackTrace stack_trace(NoCapture{}); + std::vector thread_frame_pointers; UInt32 thread_num{}; ThreadStatus * thread_ptr{}; @@ -260,12 +263,13 @@ public: } readPODBinary(stack_trace, in); + readVectorBinary(thread_frame_pointers, in); readBinary(thread_num, in); readPODBinary(thread_ptr, in); /// This allows to receive more signals if failure happens inside onFault function. /// Example: segfault while symbolizing stack trace. - std::thread([=, this] { onFault(sig, info, context, stack_trace, thread_num, thread_ptr); }).detach(); + std::thread([=, this] { onFault(sig, info, context, stack_trace, thread_frame_pointers, thread_num, thread_ptr); }).detach(); } } } @@ -300,6 +304,7 @@ private: const siginfo_t & info, ucontext_t * context, const StackTrace & stack_trace, + const std::vector & thread_frame_pointers, UInt32 thread_num, ThreadStatus * thread_ptr) const { @@ -375,6 +380,31 @@ private: /// Write symbolized stack trace line by line for better grep-ability. stack_trace.toStringEveryLine([&](std::string_view s) { LOG_FATAL(log, fmt::runtime(s)); }); + /// In case it's a scheduled job write all previous jobs origins call stacks + std::for_each(thread_frame_pointers.rbegin(), thread_frame_pointers.rend(), + [this](const StackTrace::FramePointers & frame_pointers) + { + if (size_t size = std::ranges::find(frame_pointers, nullptr) - frame_pointers.begin()) + { + LOG_FATAL(log, "========================================"); + WriteBufferFromOwnString bare_stacktrace; + writeString("Job's origin stack trace:", bare_stacktrace); + std::for_each_n(frame_pointers.begin(), size, + [&bare_stacktrace](const void * ptr) + { + writeChar(' ', bare_stacktrace); + writePointerHex(ptr, bare_stacktrace); + } + ); + + LOG_FATAL(log, fmt::runtime(bare_stacktrace.str())); + + StackTrace::toStringEveryLine(const_cast(frame_pointers.data()), 0, size, [this](std::string_view s) { LOG_FATAL(log, fmt::runtime(s)); }); + } + } + ); + + #if defined(OS_LINUX) /// Write information about binary checksum. It can be difficult to calculate, so do it only after printing stack trace. /// Please keep the below log messages in-sync with the ones in programs/server/Server.cpp diff --git a/src/DataTypes/DataTypeMap.cpp b/src/DataTypes/DataTypeMap.cpp index 0f5d97e6761..90561857fad 100644 --- a/src/DataTypes/DataTypeMap.cpp +++ b/src/DataTypes/DataTypeMap.cpp @@ -69,7 +69,7 @@ void DataTypeMap::assertKeyType() const if (!checkKeyType(key_type)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Type of Map key must be a type, that can be represented by integer " - "or String or FixedString (possibly LowCardinality) or UUID," + "or String or FixedString (possibly LowCardinality) or UUID or IPv6," " but {} given", key_type->getName()); } @@ -120,6 +120,7 @@ bool DataTypeMap::checkKeyType(DataTypePtr key_type) else if (!key_type->isValueRepresentedByInteger() && !isStringOrFixedString(*key_type) && !WhichDataType(key_type).isNothing() + && !WhichDataType(key_type).isIPv6() && !WhichDataType(key_type).isUUID()) { return false; diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index e1c8afa52c0..9d90c61bb41 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,14 @@ #include #endif +#if USE_AWS_S3 +#include +#endif + +#if USE_HDFS +#include +#endif + namespace fs = std::filesystem; namespace DB @@ -131,13 +140,13 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String static const std::unordered_set database_engines{"Ordinary", "Atomic", "Memory", "Dictionary", "Lazy", "Replicated", "MySQL", "MaterializeMySQL", "MaterializedMySQL", - "PostgreSQL", "MaterializedPostgreSQL", "SQLite"}; + "PostgreSQL", "MaterializedPostgreSQL", "SQLite", "Filesystem", "S3", "HDFS"}; if (!database_engines.contains(engine_name)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Database engine name `{}` does not exist", engine_name); static const std::unordered_set engines_with_arguments{"MySQL", "MaterializeMySQL", "MaterializedMySQL", - "Lazy", "Replicated", "PostgreSQL", "MaterializedPostgreSQL", "SQLite"}; + "Lazy", "Replicated", "PostgreSQL", "MaterializedPostgreSQL", "SQLite", "Filesystem", "S3", "HDFS"}; static const std::unordered_set engines_with_table_overrides{"MaterializeMySQL", "MaterializedMySQL", "MaterializedPostgreSQL"}; bool engine_may_have_arguments = engines_with_arguments.contains(engine_name); @@ -432,6 +441,63 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String } #endif + else if (engine_name == "Filesystem") + { + const ASTFunction * engine = engine_define->engine; + + /// If init_path is empty, then the current path will be used + std::string init_path; + + if (engine->arguments && !engine->arguments->children.empty()) + { + if (engine->arguments->children.size() != 1) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Filesystem database requires at most 1 argument: filesystem_path"); + + const auto & arguments = engine->arguments->children; + init_path = safeGetLiteralValue(arguments[0], engine_name); + } + + return std::make_shared(database_name, init_path, context); + } + +#if USE_AWS_S3 + else if (engine_name == "S3") + { + const ASTFunction * engine = engine_define->engine; + + DatabaseS3::Configuration config; + + if (engine->arguments && !engine->arguments->children.empty()) + { + ASTs & engine_args = engine->arguments->children; + config = DatabaseS3::parseArguments(engine_args, context); + } + + return std::make_shared(database_name, config, context); + } +#endif + +#if USE_HDFS + else if (engine_name == "HDFS") + { + const ASTFunction * engine = engine_define->engine; + + /// If source_url is empty, then table name must contain full url + std::string source_url; + + if (engine->arguments && !engine->arguments->children.empty()) + { + if (engine->arguments->children.size() != 1) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "HDFS database requires at most 1 argument: source_url"); + + const auto & arguments = engine->arguments->children; + source_url = safeGetLiteralValue(arguments[0], engine_name); + } + + return std::make_shared(database_name, source_url, context); + } +#endif + throw Exception(ErrorCodes::UNKNOWN_DATABASE_ENGINE, "Unknown database engine: {}", engine_name); } diff --git a/src/Databases/DatabaseFilesystem.cpp b/src/Databases/DatabaseFilesystem.cpp new file mode 100644 index 00000000000..7eaf474eea0 --- /dev/null +++ b/src/Databases/DatabaseFilesystem.cpp @@ -0,0 +1,245 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace fs = std::filesystem; + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int UNKNOWN_TABLE; + extern const int PATH_ACCESS_DENIED; + extern const int BAD_ARGUMENTS; + extern const int FILE_DOESNT_EXIST; +} + +DatabaseFilesystem::DatabaseFilesystem(const String & name_, const String & path_, ContextPtr context_) + : IDatabase(name_), WithContext(context_->getGlobalContext()), path(path_), log(&Poco::Logger::get("DatabaseFileSystem(" + name_ + ")")) +{ + bool is_local = context_->getApplicationType() == Context::ApplicationType::LOCAL; + fs::path user_files_path = is_local ? "" : fs::canonical(getContext()->getUserFilesPath()); + + if (fs::path(path).is_relative()) + { + path = user_files_path / path; + } + else if (!is_local && !pathStartsWith(fs::path(path), user_files_path)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Path must be inside user-files path: {}", user_files_path.string()); + } + + path = fs::absolute(path).lexically_normal(); + if (!fs::exists(path)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Path does not exist: {}", path); +} + +std::string DatabaseFilesystem::getTablePath(const std::string & table_name) const +{ + fs::path table_path = fs::path(path) / table_name; + return table_path.lexically_normal().string(); +} + +void DatabaseFilesystem::addTable(const std::string & table_name, StoragePtr table_storage) const +{ + std::lock_guard lock(mutex); + auto [_, inserted] = loaded_tables.emplace(table_name, table_storage); + if (!inserted) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Table with name `{}` already exists in database `{}` (engine {})", + table_name, getDatabaseName(), getEngineName()); +} + +bool DatabaseFilesystem::checkTableFilePath(const std::string & table_path, ContextPtr context_, bool throw_on_error) const +{ + /// If run in Local mode, no need for path checking. + bool check_path = context_->getApplicationType() != Context::ApplicationType::LOCAL; + const auto & user_files_path = context_->getUserFilesPath(); + + /// Check access for file before checking its existence. + if (check_path && !fileOrSymlinkPathStartsWith(table_path, user_files_path)) + { + if (throw_on_error) + throw Exception(ErrorCodes::PATH_ACCESS_DENIED, "File is not inside {}", user_files_path); + else + return false; + } + + /// Check if the corresponding file exists. + if (!fs::exists(table_path)) + { + if (throw_on_error) + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "File does not exist: {}", table_path); + else + return false; + } + + if (!fs::is_regular_file(table_path)) + { + if (throw_on_error) + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, + "File is directory, but expected a file: {}", table_path); + else + return false; + } + + return true; +} + +StoragePtr DatabaseFilesystem::tryGetTableFromCache(const std::string & name) const +{ + StoragePtr table = nullptr; + { + std::lock_guard lock(mutex); + auto it = loaded_tables.find(name); + if (it != loaded_tables.end()) + table = it->second; + } + + /// Invalidate cache if file no longer exists. + if (table && !fs::exists(getTablePath(name))) + { + std::lock_guard lock(mutex); + loaded_tables.erase(name); + return nullptr; + } + + return table; +} + +bool DatabaseFilesystem::isTableExist(const String & name, ContextPtr context_) const +{ + if (tryGetTableFromCache(name)) + return true; + + return checkTableFilePath(getTablePath(name), context_, /* throw_on_error */false); +} + +StoragePtr DatabaseFilesystem::getTableImpl(const String & name, ContextPtr context_) const +{ + /// Check if table exists in loaded tables map. + if (auto table = tryGetTableFromCache(name)) + return table; + + auto table_path = getTablePath(name); + checkTableFilePath(table_path, context_, /* throw_on_error */true); + + /// If the file exists, create a new table using TableFunctionFile and return it. + auto args = makeASTFunction("file", std::make_shared(table_path)); + + auto table_function = TableFunctionFactory::instance().get(args, context_); + if (!table_function) + return nullptr; + + /// TableFunctionFile throws exceptions, if table cannot be created. + auto table_storage = table_function->execute(args, context_, name); + if (table_storage) + addTable(name, table_storage); + + return table_storage; +} + +StoragePtr DatabaseFilesystem::getTable(const String & name, ContextPtr context_) const +{ + /// getTableImpl can throw exceptions, do not catch them to show correct error to user. + if (auto storage = getTableImpl(name, context_)) + return storage; + + throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} doesn't exist", + backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(name)); +} + +StoragePtr DatabaseFilesystem::tryGetTable(const String & name, ContextPtr context_) const +{ + try + { + return getTableImpl(name, context_); + } + catch (const Exception & e) + { + /// Ignore exceptions thrown by TableFunctionFile, which indicate that there is no table + /// see tests/02722_database_filesystem.sh for more details. + if (e.code() == ErrorCodes::FILE_DOESNT_EXIST) + { + return nullptr; + } + throw; + } +} + +bool DatabaseFilesystem::empty() const +{ + std::lock_guard lock(mutex); + return loaded_tables.empty(); +} + +ASTPtr DatabaseFilesystem::getCreateDatabaseQuery() const +{ + const auto & settings = getContext()->getSettingsRef(); + const String query = fmt::format("CREATE DATABASE {} ENGINE = Filesystem('{}')", backQuoteIfNeed(getDatabaseName()), path); + + ParserCreateQuery parser; + ASTPtr ast = parseQuery(parser, query.data(), query.data() + query.size(), "", 0, settings.max_parser_depth); + + if (const auto database_comment = getDatabaseComment(); !database_comment.empty()) + { + auto & ast_create_query = ast->as(); + ast_create_query.set(ast_create_query.comment, std::make_shared(database_comment)); + } + + return ast; +} + +void DatabaseFilesystem::shutdown() +{ + Tables tables_snapshot; + { + std::lock_guard lock(mutex); + tables_snapshot = loaded_tables; + } + + for (const auto & kv : tables_snapshot) + { + auto table_id = kv.second->getStorageID(); + kv.second->flushAndShutdown(); + } + + std::lock_guard lock(mutex); + loaded_tables.clear(); +} + +/** + * Returns an empty vector because the database is read-only and no tables can be backed up + */ +std::vector> DatabaseFilesystem::getTablesForBackup(const FilterByNameFunction &, const ContextPtr &) const +{ + return {}; +} + +/** + * + * Returns an empty iterator because the database does not have its own tables + * But only caches them for quick access + */ +DatabaseTablesIteratorPtr DatabaseFilesystem::getTablesIterator(ContextPtr, const FilterByNameFunction &) const +{ + return std::make_unique(Tables{}, getDatabaseName()); +} + +} diff --git a/src/Databases/DatabaseFilesystem.h b/src/Databases/DatabaseFilesystem.h new file mode 100644 index 00000000000..7fe620401dc --- /dev/null +++ b/src/Databases/DatabaseFilesystem.h @@ -0,0 +1,67 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace DB +{ + +class Context; + +/** + * DatabaseFilesystem allows to interact with files stored on the local filesystem. + * Uses TableFunctionFile to implicitly load file when a user requests the table, + * and provides a read-only access to the data in the file. + * Tables are cached inside the database for quick access + * + * Used in clickhouse-local to access local files. + * For clickhouse-server requires allows to access file only from user_files directory. + */ +class DatabaseFilesystem : public IDatabase, protected WithContext +{ +public: + DatabaseFilesystem(const String & name, const String & path, ContextPtr context); + + String getEngineName() const override { return "Filesystem"; } + + bool isTableExist(const String & name, ContextPtr context) const override; + + StoragePtr getTable(const String & name, ContextPtr context) const override; + + StoragePtr tryGetTable(const String & name, ContextPtr context) const override; + + bool shouldBeEmptyOnDetach() const override { return false; } /// Contains only temporary tables. + + bool empty() const override; + + bool isReadOnly() const override { return true; } + + ASTPtr getCreateDatabaseQuery() const override; + + void shutdown() override; + + std::vector> getTablesForBackup(const FilterByNameFunction &, const ContextPtr &) const override; + + DatabaseTablesIteratorPtr getTablesIterator(ContextPtr, const FilterByNameFunction &) const override; + +protected: + StoragePtr getTableImpl(const String & name, ContextPtr context) const; + + StoragePtr tryGetTableFromCache(const std::string & name) const; + + std::string getTablePath(const std::string & table_name) const; + + void addTable(const std::string & table_name, StoragePtr table_storage) const; + + bool checkTableFilePath(const std::string & table_path, ContextPtr context_, bool throw_on_error) const; + +private: + String path; + mutable Tables loaded_tables TSA_GUARDED_BY(mutex); + Poco::Logger * log; +}; + +} diff --git a/src/Databases/DatabaseHDFS.cpp b/src/Databases/DatabaseHDFS.cpp new file mode 100644 index 00000000000..1a0145b9015 --- /dev/null +++ b/src/Databases/DatabaseHDFS.cpp @@ -0,0 +1,234 @@ +#include "config.h" + +#if USE_HDFS + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace fs = std::filesystem; + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int UNKNOWN_TABLE; + extern const int BAD_ARGUMENTS; + extern const int FILE_DOESNT_EXIST; + extern const int UNACCEPTABLE_URL; + extern const int ACCESS_DENIED; + extern const int DATABASE_ACCESS_DENIED; + extern const int HDFS_ERROR; + extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; +} + +static constexpr std::string_view HDFS_HOST_REGEXP = "^hdfs://[^/]*"; + + +DatabaseHDFS::DatabaseHDFS(const String & name_, const String & source_url, ContextPtr context_) + : IDatabase(name_) + , WithContext(context_->getGlobalContext()) + , source(source_url) + , log(&Poco::Logger::get("DatabaseHDFS(" + name_ + ")")) +{ + if (!source.empty()) + { + if (!re2::RE2::FullMatch(source, std::string(HDFS_HOST_REGEXP))) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bad hdfs host: {}. " + "It should have structure 'hdfs://:'", source); + + context_->getGlobalContext()->getRemoteHostFilter().checkURL(Poco::URI(source)); + } +} + +void DatabaseHDFS::addTable(const std::string & table_name, StoragePtr table_storage) const +{ + std::lock_guard lock(mutex); + auto [_, inserted] = loaded_tables.emplace(table_name, table_storage); + if (!inserted) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Table with name `{}` already exists in database `{}` (engine {})", + table_name, getDatabaseName(), getEngineName()); +} + +std::string DatabaseHDFS::getTablePath(const std::string & table_name) const +{ + if (table_name.starts_with("hdfs://")) + return table_name; + + if (source.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bad hdfs url: {}. " + "It should have structure 'hdfs://:/path'", table_name); + + return fs::path(source) / table_name; +} + +bool DatabaseHDFS::checkUrl(const std::string & url, ContextPtr context_, bool throw_on_error) const +{ + try + { + checkHDFSURL(url); + context_->getGlobalContext()->getRemoteHostFilter().checkURL(Poco::URI(url)); + } + catch (...) + { + if (throw_on_error) + throw; + return false; + } + + return true; +} + +bool DatabaseHDFS::isTableExist(const String & name, ContextPtr context_) const +{ + std::lock_guard lock(mutex); + if (loaded_tables.find(name) != loaded_tables.end()) + return true; + + return checkUrl(name, context_, false); +} + +StoragePtr DatabaseHDFS::getTableImpl(const String & name, ContextPtr context_) const +{ + /// Check if the table exists in the loaded tables map. + { + std::lock_guard lock(mutex); + auto it = loaded_tables.find(name); + if (it != loaded_tables.end()) + return it->second; + } + + auto url = getTablePath(name); + + checkUrl(url, context_, true); + + auto args = makeASTFunction("hdfs", std::make_shared(url)); + + auto table_function = TableFunctionFactory::instance().get(args, context_); + if (!table_function) + return nullptr; + + /// TableFunctionHDFS throws exceptions, if table cannot be created. + auto table_storage = table_function->execute(args, context_, name); + if (table_storage) + addTable(name, table_storage); + + return table_storage; +} + +StoragePtr DatabaseHDFS::getTable(const String & name, ContextPtr context_) const +{ + /// Rethrow all exceptions from TableFunctionHDFS to show correct error to user. + if (auto storage = getTableImpl(name, context_)) + return storage; + + throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} doesn't exist", + backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(name)); +} + +StoragePtr DatabaseHDFS::tryGetTable(const String & name, ContextPtr context_) const +{ + try + { + return getTableImpl(name, context_); + } + catch (const Exception & e) + { + // Ignore exceptions thrown by TableFunctionHDFS, which indicate that there is no table + if (e.code() == ErrorCodes::BAD_ARGUMENTS + || e.code() == ErrorCodes::ACCESS_DENIED + || e.code() == ErrorCodes::DATABASE_ACCESS_DENIED + || e.code() == ErrorCodes::FILE_DOESNT_EXIST + || e.code() == ErrorCodes::UNACCEPTABLE_URL + || e.code() == ErrorCodes::HDFS_ERROR + || e.code() == ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE) + { + return nullptr; + } + throw; + } + catch (const Poco::URISyntaxException &) + { + return nullptr; + } +} + +bool DatabaseHDFS::empty() const +{ + std::lock_guard lock(mutex); + return loaded_tables.empty(); +} + +ASTPtr DatabaseHDFS::getCreateDatabaseQuery() const +{ + const auto & settings = getContext()->getSettingsRef(); + ParserCreateQuery parser; + + const String query = fmt::format("CREATE DATABASE {} ENGINE = HDFS('{}')", backQuoteIfNeed(getDatabaseName()), source); + ASTPtr ast = parseQuery(parser, query.data(), query.data() + query.size(), "", 0, settings.max_parser_depth); + + if (const auto database_comment = getDatabaseComment(); !database_comment.empty()) + { + auto & ast_create_query = ast->as(); + ast_create_query.set(ast_create_query.comment, std::make_shared(database_comment)); + } + + return ast; +} + +void DatabaseHDFS::shutdown() +{ + Tables tables_snapshot; + { + std::lock_guard lock(mutex); + tables_snapshot = loaded_tables; + } + + for (const auto & kv : tables_snapshot) + { + auto table_id = kv.second->getStorageID(); + kv.second->flushAndShutdown(); + } + + std::lock_guard lock(mutex); + loaded_tables.clear(); +} + +/** + * Returns an empty vector because the database is read-only and no tables can be backed up + */ +std::vector> DatabaseHDFS::getTablesForBackup(const FilterByNameFunction &, const ContextPtr &) const +{ + return {}; +} + +/** + * + * Returns an empty iterator because the database does not have its own tables + * But only caches them for quick access + */ +DatabaseTablesIteratorPtr DatabaseHDFS::getTablesIterator(ContextPtr, const FilterByNameFunction &) const +{ + return std::make_unique(Tables{}, getDatabaseName()); +} + +} // DB + +#endif diff --git a/src/Databases/DatabaseHDFS.h b/src/Databases/DatabaseHDFS.h new file mode 100644 index 00000000000..957b2080135 --- /dev/null +++ b/src/Databases/DatabaseHDFS.h @@ -0,0 +1,68 @@ +#pragma once + +#include "config.h" + +#if USE_HDFS + +#include +#include +#include +#include +#include + +namespace DB +{ + +class Context; + +/** + * DatabaseHDFS allows to interact with files stored on the file system. + * Uses TableFunctionHDFS to implicitly load file when a user requests the table, + * and provides read-only access to the data in the file. + * Tables are cached inside the database for quick access. + */ +class DatabaseHDFS : public IDatabase, protected WithContext +{ +public: + DatabaseHDFS(const String & name, const String & source_url, ContextPtr context); + + String getEngineName() const override { return "S3"; } + + bool isTableExist(const String & name, ContextPtr context) const override; + + StoragePtr getTable(const String & name, ContextPtr context) const override; + + StoragePtr tryGetTable(const String & name, ContextPtr context) const override; + + bool shouldBeEmptyOnDetach() const override { return false; } /// Contains only temporary tables. + + bool empty() const override; + + bool isReadOnly() const override { return true; } + + ASTPtr getCreateDatabaseQuery() const override; + + void shutdown() override; + + std::vector> getTablesForBackup(const FilterByNameFunction &, const ContextPtr &) const override; + DatabaseTablesIteratorPtr getTablesIterator(ContextPtr, const FilterByNameFunction &) const override; + +protected: + StoragePtr getTableImpl(const String & name, ContextPtr context) const; + + void addTable(const std::string & table_name, StoragePtr table_storage) const; + + bool checkUrl(const std::string & url, ContextPtr context_, bool throw_on_error) const; + + std::string getTablePath(const std::string & table_name) const; + +private: + const String source; + + mutable Tables loaded_tables TSA_GUARDED_BY(mutex); + Poco::Logger * log; +}; + +} + +#endif diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index de40ee4d82d..661afc6bf1f 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -1293,6 +1293,16 @@ void DatabaseReplicated::commitAlterTable(const StorageID & table_id, assert(checkDigestValid(query_context)); } + +bool DatabaseReplicated::canExecuteReplicatedMetadataAlter() const +{ + /// ReplicatedMergeTree may call commitAlterTable from its background threads when executing ALTER_METADATA entries. + /// It may update the metadata digest (both locally and in ZooKeeper) + /// before DatabaseReplicatedDDLWorker::initializeReplication() has finished. + /// We should not update metadata until the database is initialized. + return ddl_worker && ddl_worker->isCurrentlyActive(); +} + void DatabaseReplicated::detachTablePermanently(ContextPtr local_context, const String & table_name) { auto txn = local_context->getZooKeeperMetadataTransaction(); diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 1da181de030..ff1a4aba41c 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -48,6 +48,8 @@ public: /// then it will be executed on all replicas. BlockIO tryEnqueueReplicatedDDL(const ASTPtr & query, ContextPtr query_context, bool internal) override; + bool canExecuteReplicatedMetadataAlter() const override; + bool hasReplicationThread() const override { return true; } void stopReplication() override; diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index ff2675dfd6b..4976f54e417 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -91,12 +91,12 @@ void DatabaseReplicatedDDLWorker::initializeReplication() if (zookeeper->tryGet(database->replica_path + "/digest", digest_str)) { digest = parse(digest_str); - LOG_TRACE(log, "Metadata digest in ZooKeeper: {}", digest); std::lock_guard lock{database->metadata_mutex}; local_digest = database->tables_metadata_digest; } else { + LOG_WARNING(log, "Did not find digest in ZooKeeper, creating it"); /// Database was created by old ClickHouse versions, let's create the node std::lock_guard lock{database->metadata_mutex}; digest = local_digest = database->tables_metadata_digest; @@ -104,6 +104,9 @@ void DatabaseReplicatedDDLWorker::initializeReplication() zookeeper->create(database->replica_path + "/digest", digest_str, zkutil::CreateMode::Persistent); } + LOG_TRACE(log, "Trying to initialize replication: our_log_ptr={}, max_log_ptr={}, local_digest={}, zk_digest={}", + our_log_ptr, max_log_ptr, local_digest, digest); + bool is_new_replica = our_log_ptr == 0; bool lost_according_to_log_ptr = our_log_ptr + logs_to_keep < max_log_ptr; bool lost_according_to_digest = database->db_settings.check_consistency && local_digest != digest; @@ -158,7 +161,7 @@ bool DatabaseReplicatedDDLWorker::waitForReplicaToProcessAllEntries(UInt64 timeo LOG_TRACE(log, "Waiting for worker thread to process all entries before {}, current task is {}", max_log, current_task); bool processed = wait_current_task_change.wait_for(lock, std::chrono::milliseconds(timeout_ms), [&]() { - return zookeeper->expired() || current_task == max_log || stop_flag; + return zookeeper->expired() || current_task >= max_log || stop_flag; }); if (!processed) diff --git a/src/Databases/DatabaseS3.cpp b/src/Databases/DatabaseS3.cpp new file mode 100644 index 00000000000..11655f5f100 --- /dev/null +++ b/src/Databases/DatabaseS3.cpp @@ -0,0 +1,312 @@ +#include "config.h" + +#if USE_AWS_S3 + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace fs = std::filesystem; + +namespace DB +{ + +static const std::unordered_set optional_configuration_keys = { + "url", + "access_key_id", + "secret_access_key", + "no_sign_request" +}; + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int UNKNOWN_TABLE; + extern const int BAD_ARGUMENTS; + extern const int FILE_DOESNT_EXIST; + extern const int UNACCEPTABLE_URL; + extern const int S3_ERROR; + + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +DatabaseS3::DatabaseS3(const String & name_, const Configuration& config_, ContextPtr context_) + : IDatabase(name_) + , WithContext(context_->getGlobalContext()) + , config(config_) + , log(&Poco::Logger::get("DatabaseS3(" + name_ + ")")) +{ +} + +void DatabaseS3::addTable(const std::string & table_name, StoragePtr table_storage) const +{ + std::lock_guard lock(mutex); + auto [_, inserted] = loaded_tables.emplace(table_name, table_storage); + if (!inserted) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Table with name `{}` already exists in database `{}` (engine {})", + table_name, getDatabaseName(), getEngineName()); +} + +std::string DatabaseS3::getFullUrl(const std::string & name) const +{ + if (!config.url_prefix.empty()) + return fs::path(config.url_prefix) / name; + + return name; +} + +bool DatabaseS3::checkUrl(const std::string & url, ContextPtr context_, bool throw_on_error) const +{ + try + { + S3::URI uri(url); + context_->getGlobalContext()->getRemoteHostFilter().checkURL(uri.uri); + } + catch (...) + { + if (throw_on_error) + throw; + return false; + } + return true; +} + +bool DatabaseS3::isTableExist(const String & name, ContextPtr context_) const +{ + std::lock_guard lock(mutex); + if (loaded_tables.find(name) != loaded_tables.end()) + return true; + + return checkUrl(getFullUrl(name), context_, false); +} + +StoragePtr DatabaseS3::getTableImpl(const String & name, ContextPtr context_) const +{ + /// Check if the table exists in the loaded tables map. + { + std::lock_guard lock(mutex); + auto it = loaded_tables.find(name); + if (it != loaded_tables.end()) + return it->second; + } + + auto url = getFullUrl(name); + checkUrl(url, context_, /* throw_on_error */true); + + auto function = std::make_shared(); + function->name = "s3"; + function->arguments = std::make_shared(); + function->children.push_back(function->arguments); + + function->arguments->children.push_back(std::make_shared(url)); + if (config.no_sign_request) + { + function->arguments->children.push_back(std::make_shared("NOSIGN")); + } + else if (config.access_key_id.has_value() && config.secret_access_key.has_value()) + { + function->arguments->children.push_back(std::make_shared(config.access_key_id.value())); + function->arguments->children.push_back(std::make_shared(config.secret_access_key.value())); + } + + auto table_function = TableFunctionFactory::instance().get(function, context_); + if (!table_function) + return nullptr; + + /// TableFunctionS3 throws exceptions, if table cannot be created. + auto table_storage = table_function->execute(function, context_, name); + if (table_storage) + addTable(name, table_storage); + + return table_storage; +} + +StoragePtr DatabaseS3::getTable(const String & name, ContextPtr context_) const +{ + /// Rethrow all exceptions from TableFunctionS3 to show correct error to user. + if (auto storage = getTableImpl(name, context_)) + return storage; + + throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} doesn't exist", + backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(name)); +} + +StoragePtr DatabaseS3::tryGetTable(const String & name, ContextPtr context_) const +{ + try + { + return getTableImpl(name, context_); + } + catch (const Exception & e) + { + /// Ignore exceptions thrown by TableFunctionS3, which indicate that there is no table. + if (e.code() == ErrorCodes::BAD_ARGUMENTS + || e.code() == ErrorCodes::S3_ERROR + || e.code() == ErrorCodes::FILE_DOESNT_EXIST + || e.code() == ErrorCodes::UNACCEPTABLE_URL) + { + return nullptr; + } + throw; + } + catch (const Poco::URISyntaxException &) + { + return nullptr; + } +} + +bool DatabaseS3::empty() const +{ + std::lock_guard lock(mutex); + return loaded_tables.empty(); +} + +ASTPtr DatabaseS3::getCreateDatabaseQuery() const +{ + const auto & settings = getContext()->getSettingsRef(); + ParserCreateQuery parser; + + std::string creation_args; + creation_args += fmt::format("'{}'", config.url_prefix); + if (config.no_sign_request) + creation_args += ", 'NOSIGN'"; + else if (config.access_key_id.has_value() && config.secret_access_key.has_value()) + creation_args += fmt::format(", '{}', '{}'", config.access_key_id.value(), config.secret_access_key.value()); + + const String query = fmt::format("CREATE DATABASE {} ENGINE = S3({})", backQuoteIfNeed(getDatabaseName()), creation_args); + ASTPtr ast = parseQuery(parser, query.data(), query.data() + query.size(), "", 0, settings.max_parser_depth); + + if (const auto database_comment = getDatabaseComment(); !database_comment.empty()) + { + auto & ast_create_query = ast->as(); + ast_create_query.set(ast_create_query.comment, std::make_shared(database_comment)); + } + + return ast; +} + +void DatabaseS3::shutdown() +{ + Tables tables_snapshot; + { + std::lock_guard lock(mutex); + tables_snapshot = loaded_tables; + } + + for (const auto & kv : tables_snapshot) + { + auto table_id = kv.second->getStorageID(); + kv.second->flushAndShutdown(); + } + + std::lock_guard lock(mutex); + loaded_tables.clear(); +} + +DatabaseS3::Configuration DatabaseS3::parseArguments(ASTs engine_args, ContextPtr context_) +{ + Configuration result; + + if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, context_)) + { + auto & collection = *named_collection; + + validateNamedCollection(collection, {}, optional_configuration_keys); + + result.url_prefix = collection.getOrDefault("url", ""); + result.no_sign_request = collection.getOrDefault("no_sign_request", false); + + auto key_id = collection.getOrDefault("access_key_id", ""); + auto secret_key = collection.getOrDefault("secret_access_key", ""); + + if (!key_id.empty()) + result.access_key_id = key_id; + + if (!secret_key.empty()) + result.secret_access_key = secret_key; + } + else + { + const std::string supported_signature = + " - S3()\n" + " - S3('url')\n" + " - S3('url', 'NOSIGN')\n" + " - S3('url', 'access_key_id', 'secret_access_key')\n"; + const auto error_message = + fmt::format("Engine DatabaseS3 must have the following arguments signature\n{}", supported_signature); + + for (auto & arg : engine_args) + arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context_); + + if (engine_args.size() > 3) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, error_message.c_str()); + + if (engine_args.empty()) + return result; + + result.url_prefix = checkAndGetLiteralArgument(engine_args[0], "url"); + + // url, NOSIGN + if (engine_args.size() == 2) + { + auto second_arg = checkAndGetLiteralArgument(engine_args[1], "NOSIGN"); + if (boost::iequals(second_arg, "NOSIGN")) + result.no_sign_request = true; + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, error_message.c_str()); + } + + // url, access_key_id, secret_access_key + if (engine_args.size() == 3) + { + auto key_id = checkAndGetLiteralArgument(engine_args[1], "access_key_id"); + auto secret_key = checkAndGetLiteralArgument(engine_args[2], "secret_access_key"); + + if (key_id.empty() || secret_key.empty() || boost::iequals(key_id, "NOSIGN")) + throw Exception(ErrorCodes::BAD_ARGUMENTS, error_message.c_str()); + + result.access_key_id = key_id; + result.secret_access_key = secret_key; + } + } + + return result; +} + +/** + * Returns an empty vector because the database is read-only and no tables can be backed up + */ +std::vector> DatabaseS3::getTablesForBackup(const FilterByNameFunction &, const ContextPtr &) const +{ + return {}; +} + +/** + * + * Returns an empty iterator because the database does not have its own tables + * But only caches them for quick access + */ +DatabaseTablesIteratorPtr DatabaseS3::getTablesIterator(ContextPtr, const FilterByNameFunction &) const +{ + return std::make_unique(Tables{}, getDatabaseName()); +} + +} + +#endif diff --git a/src/Databases/DatabaseS3.h b/src/Databases/DatabaseS3.h new file mode 100644 index 00000000000..8297ae4e02d --- /dev/null +++ b/src/Databases/DatabaseS3.h @@ -0,0 +1,81 @@ +#pragma once + +#include "config.h" + +#if USE_AWS_S3 + +#include +#include +#include +#include +#include + +namespace DB +{ + +class Context; + +/** + * DatabaseS3 provides access to data stored in S3. + * Uses TableFunctionS3 to implicitly load file when a user requests the table, + * and provides read-only access to the data in the file. + * Tables are cached inside the database for quick access. + */ +class DatabaseS3 : public IDatabase, protected WithContext +{ +public: + struct Configuration + { + std::string url_prefix; + + bool no_sign_request = false; + + std::optional access_key_id; + std::optional secret_access_key; + }; + + DatabaseS3(const String & name, const Configuration& config, ContextPtr context); + + String getEngineName() const override { return "S3"; } + + bool isTableExist(const String & name, ContextPtr context) const override; + + StoragePtr getTable(const String & name, ContextPtr context) const override; + + StoragePtr tryGetTable(const String & name, ContextPtr context) const override; + + // Contains only temporary tables + bool shouldBeEmptyOnDetach() const override { return false; } + + bool empty() const override; + + bool isReadOnly() const override { return true; } + + ASTPtr getCreateDatabaseQuery() const override; + + void shutdown() override; + + std::vector> getTablesForBackup(const FilterByNameFunction &, const ContextPtr &) const override; + DatabaseTablesIteratorPtr getTablesIterator(ContextPtr, const FilterByNameFunction &) const override; + + static Configuration parseArguments(ASTs engine_args, ContextPtr context); + +protected: + StoragePtr getTableImpl(const String & name, ContextPtr context) const; + + void addTable(const std::string & table_name, StoragePtr table_storage) const; + + bool checkUrl(const std::string & url, ContextPtr context_, bool throw_on_error) const; + + std::string getFullUrl(const std::string & name) const; + +private: + const Configuration config; + + mutable Tables loaded_tables TSA_GUARDED_BY(mutex); + Poco::Logger * log; +}; + +} + +#endif diff --git a/src/Databases/DatabasesOverlay.cpp b/src/Databases/DatabasesOverlay.cpp new file mode 100644 index 00000000000..b44a9798072 --- /dev/null +++ b/src/Databases/DatabasesOverlay.cpp @@ -0,0 +1,266 @@ +#include + +#include +#include +#include +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int CANNOT_GET_CREATE_TABLE_QUERY; +} + +DatabasesOverlay::DatabasesOverlay(const String & name_, ContextPtr context_) + : IDatabase(name_), WithContext(context_->getGlobalContext()), log(&Poco::Logger::get("DatabaseOverlay(" + name_ + ")")) +{ +} + +DatabasesOverlay & DatabasesOverlay::registerNextDatabase(DatabasePtr database) +{ + databases.push_back(std::move(database)); + return *this; +} + +bool DatabasesOverlay::isTableExist(const String & table_name, ContextPtr context_) const +{ + for (const auto & db : databases) + { + if (db->isTableExist(table_name, context_)) + return true; + } + return false; +} + +StoragePtr DatabasesOverlay::tryGetTable(const String & table_name, ContextPtr context_) const +{ + StoragePtr result = nullptr; + for (const auto & db : databases) + { + result = db->tryGetTable(table_name, context_); + if (result) + break; + } + return result; +} + +void DatabasesOverlay::createTable(ContextPtr context_, const String & table_name, const StoragePtr & table, const ASTPtr & query) +{ + for (auto & db : databases) + { + if (!db->isReadOnly()) + { + db->createTable(context_, table_name, table, query); + return; + } + } + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "There is no databases for CREATE TABLE `{}` query in database `{}` (engine {})", + table_name, + getDatabaseName(), + getEngineName()); +} + +void DatabasesOverlay::dropTable(ContextPtr context_, const String & table_name, bool sync) +{ + for (auto & db : databases) + { + if (db->isTableExist(table_name, context_)) + { + db->dropTable(context_, table_name, sync); + return; + } + } + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "There is no databases for DROP TABLE `{}` query in database `{}` (engine {})", + table_name, + getDatabaseName(), + getEngineName()); +} + +void DatabasesOverlay::attachTable( + ContextPtr context_, const String & table_name, const StoragePtr & table, const String & relative_table_path) +{ + for (auto & db : databases) + { + try + { + db->attachTable(context_, table_name, table, relative_table_path); + return; + } + catch (...) + { + continue; + } + } + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "There is no databases for ATTACH TABLE `{}` query in database `{}` (engine {})", + table_name, + getDatabaseName(), + getEngineName()); +} + +StoragePtr DatabasesOverlay::detachTable(ContextPtr context_, const String & table_name) +{ + StoragePtr result = nullptr; + for (auto & db : databases) + { + if (db->isTableExist(table_name, context_)) + return db->detachTable(context_, table_name); + } + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "There is no databases for DETACH TABLE `{}` query in database `{}` (engine {})", + table_name, + getDatabaseName(), + getEngineName()); +} + +ASTPtr DatabasesOverlay::getCreateTableQueryImpl(const String & name, ContextPtr context_, bool throw_on_error) const +{ + ASTPtr result = nullptr; + for (const auto & db : databases) + { + result = db->tryGetCreateTableQuery(name, context_); + if (result) + break; + } + if (!result && throw_on_error) + throw Exception( + ErrorCodes::CANNOT_GET_CREATE_TABLE_QUERY, + "There is no metadata of table `{}` in database `{}` (engine {})", + name, + getDatabaseName(), + getEngineName()); + return result; +} + +/* + * DatabaseOverlay cannot be constructed by "CREATE DATABASE" query, as it is not a traditional ClickHouse database + * To use DatabaseOverlay, it must be constructed programmatically in code + */ +ASTPtr DatabasesOverlay::getCreateDatabaseQuery() const +{ + return std::make_shared(); +} + +String DatabasesOverlay::getTableDataPath(const String & table_name) const +{ + String result; + for (const auto & db : databases) + { + result = db->getTableDataPath(table_name); + if (!result.empty()) + break; + } + return result; +} + +String DatabasesOverlay::getTableDataPath(const ASTCreateQuery & query) const +{ + String result; + for (const auto & db : databases) + { + result = db->getTableDataPath(query); + if (!result.empty()) + break; + } + return result; +} + +UUID DatabasesOverlay::tryGetTableUUID(const String & table_name) const +{ + UUID result = UUIDHelpers::Nil; + for (const auto & db : databases) + { + result = db->tryGetTableUUID(table_name); + if (result != UUIDHelpers::Nil) + break; + } + return result; +} + +void DatabasesOverlay::drop(ContextPtr context_) +{ + for (auto & db : databases) + db->drop(context_); +} + +void DatabasesOverlay::alterTable(ContextPtr local_context, const StorageID & table_id, const StorageInMemoryMetadata & metadata) +{ + for (auto & db : databases) + { + if (!db->isReadOnly() && db->isTableExist(table_id.table_name, local_context)) + { + db->alterTable(local_context, table_id, metadata); + return; + } + } + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "There is no databases for ALTER TABLE `{}` query in database `{}` (engine {})", + table_id.table_name, + getDatabaseName(), + getEngineName()); +} + +std::vector> +DatabasesOverlay::getTablesForBackup(const FilterByNameFunction & filter, const ContextPtr & local_context) const +{ + std::vector> result; + for (const auto & db : databases) + { + auto db_backup = db->getTablesForBackup(filter, local_context); + result.insert(result.end(), std::make_move_iterator(db_backup.begin()), std::make_move_iterator(db_backup.end())); + } + return result; +} + +void DatabasesOverlay::createTableRestoredFromBackup( + const ASTPtr & create_table_query, + ContextMutablePtr local_context, + std::shared_ptr /*restore_coordination*/, + UInt64 /*timeout_ms*/) +{ + /// Creates a tables by executing a "CREATE TABLE" query. + InterpreterCreateQuery interpreter{create_table_query, local_context}; + interpreter.setInternal(true); + interpreter.execute(); +} + +bool DatabasesOverlay::empty() const +{ + for (const auto & db : databases) + { + if (!db->empty()) + return false; + } + return true; +} + +void DatabasesOverlay::shutdown() +{ + for (auto & db : databases) + db->shutdown(); +} + +DatabaseTablesIteratorPtr DatabasesOverlay::getTablesIterator(ContextPtr context_, const FilterByNameFunction & filter_by_table_name) const +{ + Tables tables; + for (const auto & db : databases) + { + for (auto table_it = db->getTablesIterator(context_, filter_by_table_name); table_it->isValid(); table_it->next()) + tables.insert({table_it->name(), table_it->table()}); + } + return std::make_unique(std::move(tables), getDatabaseName()); +} + +} diff --git a/src/Databases/DatabasesOverlay.h b/src/Databases/DatabasesOverlay.h new file mode 100644 index 00000000000..0f31bbd6a47 --- /dev/null +++ b/src/Databases/DatabasesOverlay.h @@ -0,0 +1,66 @@ +#pragma once + +#include +#include + +namespace DB +{ + +/** + * Implements the IDatabase interface and combines multiple other databases + * Searches for tables in each database in order until found, and delegates operations to the appropriate database + * Useful for combining databases + * + * Used in clickhouse-local to combine DatabaseFileSystem and DatabaseMemory + */ +class DatabasesOverlay : public IDatabase, protected WithContext +{ +public: + DatabasesOverlay(const String & name_, ContextPtr context_); + + /// Not thread-safe. Use only as factory to initialize database + DatabasesOverlay & registerNextDatabase(DatabasePtr database); + + String getEngineName() const override { return "Overlay"; } + +public: + bool isTableExist(const String & table_name, ContextPtr context) const override; + + StoragePtr tryGetTable(const String & table_name, ContextPtr context) const override; + + void createTable(ContextPtr context, const String & table_name, const StoragePtr & table, const ASTPtr & query) override; + + void dropTable(ContextPtr context, const String & table_name, bool sync) override; + + void attachTable(ContextPtr context, const String & table_name, const StoragePtr & table, const String & relative_table_path) override; + + StoragePtr detachTable(ContextPtr context, const String & table_name) override; + + ASTPtr getCreateTableQueryImpl(const String & name, ContextPtr context, bool throw_on_error) const override; + ASTPtr getCreateDatabaseQuery() const override; + + String getTableDataPath(const String & table_name) const override; + String getTableDataPath(const ASTCreateQuery & query) const override; + + UUID tryGetTableUUID(const String & table_name) const override; + + void drop(ContextPtr context) override; + + void alterTable(ContextPtr local_context, const StorageID & table_id, const StorageInMemoryMetadata & metadata) override; + + std::vector> getTablesForBackup(const FilterByNameFunction & filter, const ContextPtr & local_context) const override; + + void createTableRestoredFromBackup(const ASTPtr & create_table_query, ContextMutablePtr local_context, std::shared_ptr restore_coordination, UInt64 timeout_ms) override; + + DatabaseTablesIteratorPtr getTablesIterator(ContextPtr context, const FilterByNameFunction & filter_by_table_name) const override; + + bool empty() const override; + + void shutdown() override; + +protected: + std::vector databases; + Poco::Logger * log; +}; + +} diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index 53a2f372814..a9577dfc84a 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -170,7 +170,7 @@ public: /// Get the table for work. Return nullptr if there is no table. virtual StoragePtr tryGetTable(const String & name, ContextPtr context) const = 0; - StoragePtr getTable(const String & name, ContextPtr context) const; + virtual StoragePtr getTable(const String & name, ContextPtr context) const; virtual UUID tryGetTableUUID(const String & /*table_name*/) const { return UUIDHelpers::Nil; } @@ -183,6 +183,8 @@ public: /// Is the database empty. virtual bool empty() const = 0; + virtual bool isReadOnly() const { return false; } + /// Add the table to the database. Record its presence in the metadata. virtual void createTable( ContextPtr /*context*/, @@ -254,6 +256,9 @@ public: throw Exception(ErrorCodes::NOT_IMPLEMENTED, "{}: alterTable() is not supported", getEngineName()); } + /// Special method for ReplicatedMergeTree and DatabaseReplicated + virtual bool canExecuteReplicatedMetadataAlter() const { return true; } + /// Returns time of table's metadata change, 0 if there is no corresponding metadata file. virtual time_t getObjectMetadataModificationTime(const String & /*name*/) const { diff --git a/src/Dictionaries/HTTPDictionarySource.cpp b/src/Dictionaries/HTTPDictionarySource.cpp index 757a3b1819b..55bff868dee 100644 --- a/src/Dictionaries/HTTPDictionarySource.cpp +++ b/src/Dictionaries/HTTPDictionarySource.cpp @@ -135,6 +135,7 @@ QueryPipeline HTTPDictionarySource::loadIds(const std::vector & ids) WriteBufferFromOStream out_buffer(ostr); auto output_format = context->getOutputFormatParallelIfPossible(configuration.format, out_buffer, block.cloneEmpty()); formatBlock(output_format, block); + out_buffer.finalize(); }; Poco::URI uri(configuration.url); @@ -164,6 +165,7 @@ QueryPipeline HTTPDictionarySource::loadKeys(const Columns & key_columns, const WriteBufferFromOStream out_buffer(ostr); auto output_format = context->getOutputFormatParallelIfPossible(configuration.format, out_buffer, block.cloneEmpty()); formatBlock(output_format, block); + out_buffer.finalize(); }; Poco::URI uri(configuration.url); diff --git a/src/Dictionaries/HashedDictionary.cpp b/src/Dictionaries/HashedDictionary.cpp index eb1d98a8f39..5f25600db8f 100644 --- a/src/Dictionaries/HashedDictionary.cpp +++ b/src/Dictionaries/HashedDictionary.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -69,6 +70,11 @@ public: shards_queues[shard].emplace(backlog); pool.scheduleOrThrowOnError([this, shard, thread_group = CurrentThread::getGroup()] { + SCOPE_EXIT_SAFE( + if (thread_group) + CurrentThread::detachFromGroupIfNotDetached(); + ); + /// Do not account memory that was occupied by the dictionaries for the query/user context. MemoryTrackerBlockerInThread memory_blocker; @@ -230,6 +236,11 @@ HashedDictionary::~HashedDictionary() pool.trySchedule([&container, thread_group = CurrentThread::getGroup()] { + SCOPE_EXIT_SAFE( + if (thread_group) + CurrentThread::detachFromGroupIfNotDetached(); + ); + /// Do not account memory that was occupied by the dictionaries for the query/user context. MemoryTrackerBlockerInThread memory_blocker; diff --git a/src/Dictionaries/RegExpTreeDictionary.cpp b/src/Dictionaries/RegExpTreeDictionary.cpp index 3852cca6928..a9846dc06e9 100644 --- a/src/Dictionaries/RegExpTreeDictionary.cpp +++ b/src/Dictionaries/RegExpTreeDictionary.cpp @@ -30,8 +30,6 @@ #include #include -#include - #include "config.h" #if USE_VECTORSCAN @@ -469,17 +467,16 @@ public: std::pair processBackRefs(const String & data, const re2_st::RE2 & searcher, const std::vector & pieces) { - re2_st::StringPiece haystack(data.data(), data.size()); - re2_st::StringPiece matches[10]; + std::string_view matches[10]; String result; - searcher.Match(haystack, 0, data.size(), re2_st::RE2::Anchor::UNANCHORED, matches, 10); + searcher.Match({data.data(), data.size()}, 0, data.size(), re2_st::RE2::Anchor::UNANCHORED, matches, 10); /// if the pattern is a single '$1' but fails to match, we would use the default value. if (pieces.size() == 1 && pieces[0].ref_num >= 0 && pieces[0].ref_num < 10 && matches[pieces[0].ref_num].empty()) return std::make_pair(result, true); for (const auto & item : pieces) { if (item.ref_num >= 0 && item.ref_num < 10) - result += matches[item.ref_num].ToString(); + result += String{matches[item.ref_num]}; else result += item.literal; } diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index 202f40bfdb2..15b6a9211de 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -74,19 +74,22 @@ CachedOnDiskReadBufferFromFile::CachedOnDiskReadBufferFromFile( } void CachedOnDiskReadBufferFromFile::appendFilesystemCacheLog( - const FileSegment::Range & file_segment_range, CachedOnDiskReadBufferFromFile::ReadType type) + const FileSegment & file_segment, CachedOnDiskReadBufferFromFile::ReadType type) { if (!cache_log) return; + const auto range = file_segment.range(); FilesystemCacheLogElement elem { .event_time = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()), .query_id = query_id, .source_file_path = source_file_path, - .file_segment_range = { file_segment_range.left, file_segment_range.right }, + .file_segment_range = { range.left, range.right }, .requested_range = { first_offset, read_until_position }, - .file_segment_size = file_segment_range.size(), + .file_segment_key = file_segment.key().toString(), + .file_segment_offset = file_segment.offset(), + .file_segment_size = range.size(), .read_from_cache_attempted = true, .read_buffer_id = current_buffer_id, .profile_counters = std::make_shared( @@ -157,7 +160,7 @@ CachedOnDiskReadBufferFromFile::getCacheReadBuffer(const FileSegment & file_segm if (use_external_buffer) local_read_settings.local_fs_buffer_size = 0; - auto buf = createReadBufferFromFileBase(path, local_read_settings); + auto buf = createReadBufferFromFileBase(path, local_read_settings, std::nullopt, std::nullopt, file_segment.getFlagsForLocalRead()); if (getFileSizeFromReadBuffer(*buf) == 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read from an empty cache file: {}", path); @@ -495,7 +498,7 @@ bool CachedOnDiskReadBufferFromFile::completeFileSegmentAndGetNext() auto completed_range = current_file_segment->range(); if (cache_log) - appendFilesystemCacheLog(completed_range, read_type); + appendFilesystemCacheLog(*current_file_segment, read_type); chassert(file_offset_of_buffer_end > completed_range.right); @@ -507,9 +510,6 @@ bool CachedOnDiskReadBufferFromFile::completeFileSegmentAndGetNext() current_file_segment->use(); implementation_buffer = getImplementationBuffer(*current_file_segment); - if (read_type == ReadType::CACHED) - current_file_segment->incrementHitsCount(); - LOG_TEST( log, "New segment range: {}, old range: {}", current_file_segment->range().toString(), completed_range.toString()); @@ -521,7 +521,7 @@ CachedOnDiskReadBufferFromFile::~CachedOnDiskReadBufferFromFile() { if (cache_log && file_segments && !file_segments->empty()) { - appendFilesystemCacheLog(file_segments->front().range(), read_type); + appendFilesystemCacheLog(file_segments->front(), read_type); } } @@ -852,9 +852,7 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep() else { implementation_buffer = getImplementationBuffer(file_segments->front()); - - if (read_type == ReadType::CACHED) - file_segments->front().incrementHitsCount(); + file_segments->front().use(); } chassert(!internal_buffer.empty()); diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.h b/src/Disks/IO/CachedOnDiskReadBufferFromFile.h index b4e7701de75..36cf8a54183 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.h +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.h @@ -90,7 +90,7 @@ private: bool completeFileSegmentAndGetNext(); - void appendFilesystemCacheLog(const FileSegment::Range & file_segment_range, ReadType read_type); + void appendFilesystemCacheLog(const FileSegment & file_segment, ReadType read_type); bool writeCache(char * data, size_t size, size_t offset, FileSegment & file_segment); diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp index eb9c509e459..2cd90731f1d 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp @@ -56,6 +56,7 @@ SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(c size_t current_read_until_position = read_until_position ? read_until_position : object.bytes_size; auto current_read_buffer_creator = [=, this]() { return read_buffer_creator(object_path, current_read_until_position); }; +#ifndef CLICKHOUSE_PROGRAM_STANDALONE_BUILD if (with_cache) { auto cache_key = settings.remote_fs_cache->createKeyForPath(object_path); @@ -72,6 +73,7 @@ SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(c read_until_position ? std::optional(read_until_position) : std::nullopt, cache_log); } +#endif return current_read_buffer_creator(); } @@ -88,6 +90,8 @@ void ReadBufferFromRemoteFSGather::appendUncachedReadInfo() .source_file_path = current_object.remote_path, .file_segment_range = { 0, current_object.bytes_size }, .cache_type = FilesystemCacheLogElement::CacheType::READ_FROM_FS_BYPASSING_CACHE, + .file_segment_key = {}, + .file_segment_offset = {}, .file_segment_size = current_object.bytes_size, .read_from_cache_attempted = false, }; diff --git a/src/Disks/IO/getThreadPoolReader.cpp b/src/Disks/IO/getThreadPoolReader.cpp index deb8f66106c..7dbff9ffe76 100644 --- a/src/Disks/IO/getThreadPoolReader.cpp +++ b/src/Disks/IO/getThreadPoolReader.cpp @@ -7,9 +7,7 @@ #include #include -#ifndef CLICKHOUSE_PROGRAM_STANDALONE_BUILD #include -#endif namespace DB { @@ -21,32 +19,10 @@ namespace ErrorCodes IAsynchronousReader & getThreadPoolReader(FilesystemReaderType type) { -#ifdef CLICKHOUSE_PROGRAM_STANDALONE_BUILD - const auto & config = Poco::Util::Application::instance().config(); - switch (type) - { - case FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER: - { - static auto asynchronous_remote_fs_reader = createThreadPoolReader(type, config); - return *asynchronous_remote_fs_reader; - } - case FilesystemReaderType::ASYNCHRONOUS_LOCAL_FS_READER: - { - static auto asynchronous_local_fs_reader = createThreadPoolReader(type, config); - return *asynchronous_local_fs_reader; - } - case FilesystemReaderType::SYNCHRONOUS_LOCAL_FS_READER: - { - static auto synchronous_local_fs_reader = createThreadPoolReader(type, config); - return *synchronous_local_fs_reader; - } - } -#else auto context = Context::getGlobalContextInstance(); if (!context) throw Exception(ErrorCodes::LOGICAL_ERROR, "Global context not initialized"); return context->getThreadPoolReader(type); -#endif } std::unique_ptr createThreadPoolReader( diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index 005d115a277..e5bbd2ca0c6 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -530,24 +529,6 @@ DiskObjectStoragePtr DiskObjectStorage::createDiskObjectStorage() threadpool_size); } -void DiskObjectStorage::wrapWithCache(FileCachePtr cache, const FileCacheSettings & cache_settings, const String & layer_name) -{ - object_storage = std::make_shared(object_storage, cache, cache_settings, layer_name); -} - -NameSet DiskObjectStorage::getCacheLayersNames() const -{ - NameSet cache_layers; - auto current_object_storage = object_storage; - while (current_object_storage->supportsCache()) - { - auto * cached_object_storage = assert_cast(current_object_storage.get()); - cache_layers.insert(cached_object_storage->getCacheConfigName()); - current_object_storage = cached_object_storage->getWrappedObjectStorage(); - } - return cache_layers; -} - std::unique_ptr DiskObjectStorage::readFile( const String & path, const ReadSettings & settings, diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.h b/src/Disks/ObjectStorages/DiskObjectStorage.h index b7dfaf67cf2..c9820956a4d 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.h +++ b/src/Disks/ObjectStorages/DiskObjectStorage.h @@ -181,20 +181,22 @@ public: /// MergeTree table on this disk. bool isWriteOnce() const override; - /// Add a cache layer. - /// Example: DiskObjectStorage(S3ObjectStorage) -> DiskObjectStorage(CachedObjectStorage(S3ObjectStorage)) - /// There can be any number of cache layers: - /// DiskObjectStorage(CachedObjectStorage(...CacheObjectStorage(S3ObjectStorage)...)) - void wrapWithCache(FileCachePtr cache, const FileCacheSettings & cache_settings, const String & layer_name); - /// Get structure of object storage this disk works with. Examples: /// DiskObjectStorage(S3ObjectStorage) /// DiskObjectStorage(CachedObjectStorage(S3ObjectStorage)) /// DiskObjectStorage(CachedObjectStorage(CachedObjectStorage(S3ObjectStorage))) String getStructure() const { return fmt::format("DiskObjectStorage-{}({})", getName(), object_storage->getName()); } +#ifndef CLICKHOUSE_PROGRAM_STANDALONE_BUILD + /// Add a cache layer. + /// Example: DiskObjectStorage(S3ObjectStorage) -> DiskObjectStorage(CachedObjectStorage(S3ObjectStorage)) + /// There can be any number of cache layers: + /// DiskObjectStorage(CachedObjectStorage(...CacheObjectStorage(S3ObjectStorage)...)) + void wrapWithCache(FileCachePtr cache, const FileCacheSettings & cache_settings, const String & layer_name); + /// Get names of all cache layers. Name is how cache is defined in configuration file. NameSet getCacheLayersNames() const override; +#endif static std::shared_ptr getAsyncExecutor(const std::string & log_name, size_t size); diff --git a/src/Disks/ObjectStorages/DiskObjectStorageCache.cpp b/src/Disks/ObjectStorages/DiskObjectStorageCache.cpp new file mode 100644 index 00000000000..9e5012dec54 --- /dev/null +++ b/src/Disks/ObjectStorages/DiskObjectStorageCache.cpp @@ -0,0 +1,28 @@ +#include + +#include + +#include + +namespace DB +{ + +void DiskObjectStorage::wrapWithCache(FileCachePtr cache, const FileCacheSettings & cache_settings, const String & layer_name) +{ + object_storage = std::make_shared(object_storage, cache, cache_settings, layer_name); +} + +NameSet DiskObjectStorage::getCacheLayersNames() const +{ + NameSet cache_layers; + auto current_object_storage = object_storage; + while (current_object_storage->supportsCache()) + { + auto * cached_object_storage = assert_cast(current_object_storage.get()); + cache_layers.insert(cached_object_storage->getCacheConfigName()); + current_object_storage = cached_object_storage->getWrappedObjectStorage(); + } + return cache_layers; +} + +} diff --git a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.h b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.h index 0beed65879b..fb5b6d0757c 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.h +++ b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.h @@ -63,7 +63,7 @@ public: uint32_t getHardlinkCount(const std::string & /* path */) const override { - return 1; + return 0; } bool supportsChmod() const override { return false; } diff --git a/src/Disks/getOrCreateDiskFromAST.cpp b/src/Disks/getOrCreateDiskFromAST.cpp index 637acff7b95..81d5b7372f3 100644 --- a/src/Disks/getOrCreateDiskFromAST.cpp +++ b/src/Disks/getOrCreateDiskFromAST.cpp @@ -26,12 +26,25 @@ namespace { std::string getOrCreateDiskFromDiskAST(const ASTFunction & function, ContextPtr context) { - /// We need a unique name for a created custom disk, but it needs to be the same - /// after table is reattached or server is restarted, so take a hash of the disk - /// configuration serialized ast as a disk name suffix. - auto disk_setting_string = serializeAST(function, true); - auto disk_name = DiskSelector::TMP_INTERNAL_DISK_PREFIX - + toString(sipHash128(disk_setting_string.data(), disk_setting_string.size())); + std::string disk_name; + if (function.name == "disk") + { + /// We need a unique name for a created custom disk, but it needs to be the same + /// after table is reattached or server is restarted, so take a hash of the disk + /// configuration serialized ast as a disk name suffix. + auto disk_setting_string = serializeAST(function, true); + disk_name = DiskSelector::TMP_INTERNAL_DISK_PREFIX + + toString(sipHash128(disk_setting_string.data(), disk_setting_string.size())); + } + else + { + static constexpr std::string_view custom_disk_prefix = "disk_"; + + if (function.name.size() <= custom_disk_prefix.size() || !function.name.starts_with(custom_disk_prefix)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid disk name: {}", function.name); + + disk_name = function.name.substr(custom_disk_prefix.size()); + } auto result_disk = context->getOrCreateDisk(disk_name, [&](const DisksMap & disks_map) -> DiskPtr { const auto * function_args_expr = assert_cast(function.arguments.get()); @@ -43,6 +56,9 @@ namespace return disk; }); + if (!result_disk->isCustomDisk()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Disk with name `{}` already exist", disk_name); + if (!result_disk->isRemote()) { static constexpr auto custom_disks_base_dir_in_config = "custom_local_disks_base_directory"; diff --git a/src/Disks/registerDisks.cpp b/src/Disks/registerDisks.cpp index 48d5a19fb61..676744a8e79 100644 --- a/src/Disks/registerDisks.cpp +++ b/src/Disks/registerDisks.cpp @@ -32,6 +32,8 @@ void registerDiskCache(DiskFactory & factory, bool global_skip_access_check); void registerDiskLocalObjectStorage(DiskFactory & factory, bool global_skip_access_check); +#ifndef CLICKHOUSE_PROGRAM_STANDALONE_BUILD + void registerDisks(bool global_skip_access_check) { auto & factory = DiskFactory::instance(); @@ -61,4 +63,19 @@ void registerDisks(bool global_skip_access_check) registerDiskLocalObjectStorage(factory, global_skip_access_check); } +#else + +void registerDisks(bool global_skip_access_check) +{ + auto & factory = DiskFactory::instance(); + + registerDiskLocal(factory, global_skip_access_check); + +#if USE_AWS_S3 + registerDiskS3(factory, global_skip_access_check); +#endif +} + +#endif + } diff --git a/src/Functions/FunctionMathUnary.h b/src/Functions/FunctionMathUnary.h index 6e4bff7122d..9f400932356 100644 --- a/src/Functions/FunctionMathUnary.h +++ b/src/Functions/FunctionMathUnary.h @@ -154,6 +154,8 @@ private: using ColVecType = ColumnVectorOrDecimal; const auto col_vec = checkAndGetColumn(col.column.get()); + if (col_vec == nullptr) + return false; return (res = execute(col_vec)) != nullptr; }; diff --git a/src/Functions/ReplaceRegexpImpl.h b/src/Functions/ReplaceRegexpImpl.h index 7e3af1e62d9..9395489dac3 100644 --- a/src/Functions/ReplaceRegexpImpl.h +++ b/src/Functions/ReplaceRegexpImpl.h @@ -99,8 +99,8 @@ struct ReplaceRegexpImpl int num_captures, const Instructions & instructions) { - re2_st::StringPiece haystack(haystack_data, haystack_length); - re2_st::StringPiece matches[max_captures]; + std::string_view haystack(haystack_data, haystack_length); + std::string_view matches[max_captures]; size_t copy_pos = 0; size_t match_pos = 0; diff --git a/src/Functions/checkHyperscanRegexp.cpp b/src/Functions/checkHyperscanRegexp.cpp index 441e35cc5db..0dd4c5740c3 100644 --- a/src/Functions/checkHyperscanRegexp.cpp +++ b/src/Functions/checkHyperscanRegexp.cpp @@ -45,8 +45,8 @@ bool isLargerThanFifty(std::string_view str) /// Check for sub-patterns of the form x{n} or x{n,} can be expensive. Ignore spaces before/after n and m. bool SlowWithHyperscanChecker::isSlowOneRepeat(std::string_view regexp) { - re2_st::StringPiece haystack(regexp.data(), regexp.size()); - re2_st::StringPiece matches[2]; + std::string_view haystack(regexp.data(), regexp.size()); + std::string_view matches[2]; size_t start_pos = 0; while (start_pos < haystack.size()) { @@ -67,8 +67,8 @@ bool SlowWithHyperscanChecker::isSlowOneRepeat(std::string_view regexp) /// Check if sub-patterns of the form x{n,m} can be expensive. Ignore spaces before/after n and m. bool SlowWithHyperscanChecker::isSlowTwoRepeats(std::string_view regexp) { - re2_st::StringPiece haystack(regexp.data(), regexp.size()); - re2_st::StringPiece matches[3]; + std::string_view haystack(regexp.data(), regexp.size()); + std::string_view matches[3]; size_t start_pos = 0; while (start_pos < haystack.size()) { diff --git a/src/Functions/extractAllGroups.h b/src/Functions/extractAllGroups.h index faee25aa0ab..3a7987be93e 100644 --- a/src/Functions/extractAllGroups.h +++ b/src/Functions/extractAllGroups.h @@ -94,7 +94,6 @@ public: if (needle.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Length of 'needle' argument must be greater than 0."); - using StringPiece = typename Regexps::Regexp::StringPieceType; const Regexps::Regexp holder = Regexps::createRegexp(needle); const auto & regexp = holder.getRE2(); @@ -111,7 +110,7 @@ public: groups_count, std::to_string(MAX_GROUPS_COUNT - 1)); // Including 0-group, which is the whole regexp. - PODArrayWithStackMemory matched_groups(groups_count + 1); + PODArrayWithStackMemory matched_groups(groups_count + 1); ColumnArray::ColumnOffsets::MutablePtr root_offsets_col = ColumnArray::ColumnOffsets::create(); ColumnArray::ColumnOffsets::MutablePtr nested_offsets_col = ColumnArray::ColumnOffsets::create(); @@ -160,7 +159,7 @@ public: /// Additional limit to fail fast on supposedly incorrect usage. const auto max_matches_per_row = context->getSettingsRef().regexp_max_matches_per_row; - PODArray all_matches; + PODArray all_matches; /// Number of times RE matched on each row of haystack column. PODArray number_of_matches_per_row; diff --git a/src/Functions/extractGroups.cpp b/src/Functions/extractGroups.cpp index 6744edda922..21b8a68fc10 100644 --- a/src/Functions/extractGroups.cpp +++ b/src/Functions/extractGroups.cpp @@ -75,7 +75,7 @@ public: throw Exception(ErrorCodes::BAD_ARGUMENTS, "There are no groups in regexp: {}", needle); // Including 0-group, which is the whole regexp. - PODArrayWithStackMemory matched_groups(groups_count + 1); + PODArrayWithStackMemory matched_groups(groups_count + 1); ColumnArray::ColumnOffsets::MutablePtr offsets_col = ColumnArray::ColumnOffsets::create(); ColumnString::MutablePtr data_col = ColumnString::create(); @@ -89,7 +89,7 @@ public: { std::string_view current_row = column_haystack->getDataAt(i).toView(); - if (re2->Match(re2_st::StringPiece(current_row.data(), current_row.size()), + if (re2->Match({current_row.data(), current_row.size()}, 0, current_row.size(), re2_st::RE2::UNANCHORED, matched_groups.data(), static_cast(matched_groups.size()))) { diff --git a/src/Functions/tupleElement.cpp b/src/Functions/tupleElement.cpp index b1fd200f5cd..96b5a047419 100644 --- a/src/Functions/tupleElement.cpp +++ b/src/Functions/tupleElement.cpp @@ -17,11 +17,8 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int ILLEGAL_INDEX; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int NOT_FOUND_COLUMN_IN_BLOCK; - extern const int NUMBER_OF_DIMENSIONS_MISMATCHED; - extern const int SIZES_OF_ARRAYS_DONT_MATCH; } namespace @@ -34,32 +31,14 @@ class FunctionTupleElement : public IFunction { public: static constexpr auto name = "tupleElement"; - static FunctionPtr create(ContextPtr) - { - return std::make_shared(); - } - - String getName() const override - { - return name; - } + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + String getName() const override { return name; } bool isVariadic() const override { return true; } - - size_t getNumberOfArguments() const override - { - return 0; - } - - bool useDefaultImplementationForConstants() const override - { - return true; - } - + size_t getNumberOfArguments() const override { return 0; } + bool useDefaultImplementationForConstants() const override { return true; } ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } - bool useDefaultImplementationForNulls() const override { return false; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override @@ -72,193 +51,111 @@ public: getName(), number_of_arguments); size_t count_arrays = 0; - const IDataType * tuple_col = arguments[0].type.get(); - while (const DataTypeArray * array = checkAndGetDataType(tuple_col)) + const IDataType * input_type = arguments[0].type.get(); + while (const DataTypeArray * array = checkAndGetDataType(input_type)) { - tuple_col = array->getNestedType().get(); + input_type = array->getNestedType().get(); ++count_arrays; } - const DataTypeTuple * tuple = checkAndGetDataType(tuple_col); + const DataTypeTuple * tuple = checkAndGetDataType(input_type); if (!tuple) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument for function {} must be tuple or array of tuple. Actual {}", getName(), arguments[0].type->getName()); - auto index = getElementNum(arguments[1].column, *tuple, number_of_arguments); + std::optional index = getElementIndex(arguments[1].column, *tuple, number_of_arguments); if (index.has_value()) { - DataTypePtr out_return_type = tuple->getElements()[index.value()]; + DataTypePtr return_type = tuple->getElements()[index.value()]; for (; count_arrays; --count_arrays) - out_return_type = std::make_shared(out_return_type); + return_type = std::make_shared(return_type); - return out_return_type; + return return_type; } else - { - const IDataType * default_col = arguments[2].type.get(); - size_t default_argument_count_arrays = 0; - if (const DataTypeArray * array = checkAndGetDataType(default_col)) - { - default_argument_count_arrays = array->getNumberOfDimensions(); - } - - if (count_arrays != default_argument_count_arrays) - { - throw Exception(ErrorCodes::NUMBER_OF_DIMENSIONS_MISMATCHED, - "Dimension of types mismatched between first argument and third argument. " - "Dimension of 1st argument: {}. " - "Dimension of 3rd argument: {}.",count_arrays, default_argument_count_arrays); - } return arguments[2].type; - } } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - Columns array_offsets; + const auto & input_arg = arguments[0]; + const IDataType * input_type = input_arg.type.get(); + const IColumn * input_col = input_arg.column.get(); - const auto & first_arg = arguments[0]; - - const IDataType * tuple_type = first_arg.type.get(); - const IColumn * tuple_col = first_arg.column.get(); - bool first_arg_is_const = false; - if (typeid_cast(tuple_col)) + bool input_arg_is_const = false; + if (typeid_cast(input_col)) { - tuple_col = assert_cast(tuple_col)->getDataColumnPtr().get(); - first_arg_is_const = true; + input_col = assert_cast(input_col)->getDataColumnPtr().get(); + input_arg_is_const = true; } - while (const DataTypeArray * array_type = checkAndGetDataType(tuple_type)) - { - const ColumnArray * array_col = assert_cast(tuple_col); - tuple_type = array_type->getNestedType().get(); - tuple_col = &array_col->getData(); + Columns array_offsets; + while (const DataTypeArray * array_type = checkAndGetDataType(input_type)) + { + const ColumnArray * array_col = assert_cast(input_col); + + input_type = array_type->getNestedType().get(); + input_col = &array_col->getData(); array_offsets.push_back(array_col->getOffsetsPtr()); } - const DataTypeTuple * tuple_type_concrete = checkAndGetDataType(tuple_type); - const ColumnTuple * tuple_col_concrete = checkAndGetColumn(tuple_col); - if (!tuple_type_concrete || !tuple_col_concrete) + const DataTypeTuple * input_type_as_tuple = checkAndGetDataType(input_type); + const ColumnTuple * input_col_as_tuple = checkAndGetColumn(input_col); + if (!input_type_as_tuple || !input_col_as_tuple) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "First argument for function {} must be tuple or array of tuple. Actual {}", - getName(), - first_arg.type->getName()); + "First argument for function {} must be tuple or array of tuple. Actual {}", getName(), input_arg.type->getName()); - auto index = getElementNum(arguments[1].column, *tuple_type_concrete, arguments.size()); + std::optional index = getElementIndex(arguments[1].column, *input_type_as_tuple, arguments.size()); if (!index.has_value()) - { - if (!array_offsets.empty()) - { - recursiveCheckArrayOffsets(arguments[0].column, arguments[2].column, array_offsets.size()); - } return arguments[2].column; - } - ColumnPtr res = tuple_col_concrete->getColumns()[index.value()]; + ColumnPtr res = input_col_as_tuple->getColumns()[index.value()]; /// Wrap into Arrays for (auto it = array_offsets.rbegin(); it != array_offsets.rend(); ++it) res = ColumnArray::create(res, *it); - if (first_arg_is_const) - { + if (input_arg_is_const) res = ColumnConst::create(res, input_rows_count); - } return res; } private: - - void recursiveCheckArrayOffsets(ColumnPtr col_x, ColumnPtr col_y, size_t depth) const - { - for (size_t i = 1; i < depth; ++i) - { - checkArrayOffsets(col_x, col_y); - col_x = assert_cast(col_x.get())->getDataPtr(); - col_y = assert_cast(col_y.get())->getDataPtr(); - } - checkArrayOffsets(col_x, col_y); - } - - void checkArrayOffsets(ColumnPtr col_x, ColumnPtr col_y) const - { - if (isColumnConst(*col_x)) - { - checkArrayOffsetsWithFirstArgConst(col_x, col_y); - } - else if (isColumnConst(*col_y)) - { - checkArrayOffsetsWithFirstArgConst(col_y, col_x); - } - else - { - const auto & array_x = *assert_cast(col_x.get()); - const auto & array_y = *assert_cast(col_y.get()); - if (!array_x.hasEqualOffsets(array_y)) - { - throw Exception(ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH, - "The argument 1 and argument 3 of function {} have different array sizes", getName()); - } - } - } - - void checkArrayOffsetsWithFirstArgConst(ColumnPtr col_x, ColumnPtr col_y) const - { - col_x = assert_cast(col_x.get())->getDataColumnPtr(); - col_y = col_y->convertToFullColumnIfConst(); - const auto & array_x = *assert_cast(col_x.get()); - const auto & array_y = *assert_cast(col_y.get()); - - const auto & offsets_x = array_x.getOffsets(); - const auto & offsets_y = array_y.getOffsets(); - - ColumnArray::Offset prev_offset = 0; - size_t row_size = offsets_y.size(); - for (size_t row = 0; row < row_size; ++row) - { - if (unlikely(offsets_x[0] != offsets_y[row] - prev_offset)) - { - throw Exception(ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH, - "The argument 1 and argument 3 of function {} have different array sizes", getName()); - } - prev_offset = offsets_y[row]; - } - } - - std::optional getElementNum(const ColumnPtr & index_column, const DataTypeTuple & tuple, const size_t argument_size) const + std::optional getElementIndex(const ColumnPtr & index_column, const DataTypeTuple & tuple, size_t argument_size) const { if (checkAndGetColumnConst(index_column.get()) || checkAndGetColumnConst(index_column.get()) || checkAndGetColumnConst(index_column.get()) || checkAndGetColumnConst(index_column.get())) { - size_t index = index_column->getUInt(0); + const size_t index = index_column->getUInt(0); - if (index == 0) - throw Exception(ErrorCodes::ILLEGAL_INDEX, "Indices in tuples are 1-based."); + if (index > 0 && index <= tuple.getElements().size()) + return {index - 1}; + else + { + if (argument_size == 2) + throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK, "Tuple doesn't have element with index '{}'", index); + return std::nullopt; + } - if (index > tuple.getElements().size()) - throw Exception(ErrorCodes::ILLEGAL_INDEX, "Index for tuple element is out of range."); - - return std::optional(index - 1); } else if (const auto * name_col = checkAndGetColumnConst(index_column.get())) { - auto index = tuple.tryGetPositionByName(name_col->getValue()); - if (index.has_value()) - { - return index; - } + std::optional index = tuple.tryGetPositionByName(name_col->getValue()); - if (argument_size == 2) + if (index.has_value()) + return index; + else { - throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK, "Tuple doesn't have element with name '{}'", name_col->getValue()); + if (argument_size == 2) + throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK, "Tuple doesn't have element with name '{}'", name_col->getValue()); + return std::nullopt; } - return std::nullopt; } else throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, diff --git a/src/IO/BoundedReadBuffer.h b/src/IO/BoundedReadBuffer.h index 183dbedb78b..eb65857e83a 100644 --- a/src/IO/BoundedReadBuffer.h +++ b/src/IO/BoundedReadBuffer.h @@ -31,7 +31,8 @@ public: private: std::optional read_until_position; - size_t file_offset_of_buffer_end = 0; + /// atomic because can be used in log or exception messages while being updated. + std::atomic file_offset_of_buffer_end = 0; }; } diff --git a/src/IO/OpenedFileCache.h b/src/IO/OpenedFileCache.h index 844e5b31d11..61e502a494b 100644 --- a/src/IO/OpenedFileCache.h +++ b/src/IO/OpenedFileCache.h @@ -72,6 +72,13 @@ public: return res; } + void remove(const std::string & path, int flags) + { + Key key(path, flags); + std::lock_guard lock(mutex); + files.erase(key); + } + static OpenedFileCache & instance() { static OpenedFileCache res; @@ -82,5 +89,3 @@ public: using OpenedFileCachePtr = std::shared_ptr; } - - diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index af8d4942ff1..b8ce162ec91 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -12,6 +12,7 @@ #include +#include #include #include #include @@ -63,6 +64,7 @@ namespace ErrorCodes extern const int INCORRECT_DATA; extern const int TOO_LARGE_STRING_SIZE; extern const int TOO_LARGE_ARRAY_SIZE; + extern const int SIZE_OF_FIXED_STRING_DOESNT_MATCH; } /// Helper functions for formatted input. @@ -138,6 +140,19 @@ inline void readStringBinary(std::string & s, ReadBuffer & buf, size_t max_strin buf.readStrict(s.data(), size); } +/// For historical reasons we store IPv6 as a String +inline void readIPv6Binary(IPv6 & ip, ReadBuffer & buf) +{ + size_t size = 0; + readVarUInt(size, buf); + + if (size != IPV6_BINARY_LENGTH) + throw Exception(ErrorCodes::SIZE_OF_FIXED_STRING_DOESNT_MATCH, + "Size of the string {} doesn't match size of binary IPv6 {}", size, IPV6_BINARY_LENGTH); + + buf.readStrict(reinterpret_cast(&ip.toUnderType()), size); +} + template void readVectorBinary(std::vector & v, ReadBuffer & buf) { @@ -1093,6 +1108,8 @@ inline void readBinary(Decimal128 & x, ReadBuffer & buf) { readPODBinary(x, buf) inline void readBinary(Decimal256 & x, ReadBuffer & buf) { readPODBinary(x.value, buf); } inline void readBinary(LocalDate & x, ReadBuffer & buf) { readPODBinary(x, buf); } +inline void readBinary(StackTrace::FramePointers & x, ReadBuffer & buf) { readPODBinary(x, buf); } + template inline void readBinaryEndian(T & x, ReadBuffer & buf) { diff --git a/src/IO/WriteBufferFromOStream.cpp b/src/IO/WriteBufferFromOStream.cpp index e0ec0b770e2..ffc3e62e9a6 100644 --- a/src/IO/WriteBufferFromOStream.cpp +++ b/src/IO/WriteBufferFromOStream.cpp @@ -19,14 +19,7 @@ void WriteBufferFromOStream::nextImpl() ostr->flush(); if (!ostr->good()) - { - /// FIXME do not call finalize in dtors (and remove iostreams) - bool avoid_throwing_exceptions = std::uncaught_exceptions(); - if (avoid_throwing_exceptions) - LOG_ERROR(&Poco::Logger::get("WriteBufferFromOStream"), "Cannot write to ostream at offset {}. Stack trace: {}", count(), StackTrace().toString()); - else - throw Exception(ErrorCodes::CANNOT_WRITE_TO_OSTREAM, "Cannot write to ostream at offset {}", count()); - } + throw Exception(ErrorCodes::CANNOT_WRITE_TO_OSTREAM, "Cannot write to ostream at offset {}", count()); } WriteBufferFromOStream::WriteBufferFromOStream( @@ -46,9 +39,4 @@ WriteBufferFromOStream::WriteBufferFromOStream( { } -WriteBufferFromOStream::~WriteBufferFromOStream() -{ - finalize(); -} - } diff --git a/src/IO/WriteBufferFromOStream.h b/src/IO/WriteBufferFromOStream.h index f8b45c2fa59..5a933739cb1 100644 --- a/src/IO/WriteBufferFromOStream.h +++ b/src/IO/WriteBufferFromOStream.h @@ -18,8 +18,6 @@ public: char * existing_memory = nullptr, size_t alignment = 0); - ~WriteBufferFromOStream() override; - protected: explicit WriteBufferFromOStream(size_t size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0); diff --git a/src/IO/WriteHelpers.h b/src/IO/WriteHelpers.h index 29eb1355fd8..aa4c9b17e48 100644 --- a/src/IO/WriteHelpers.h +++ b/src/IO/WriteHelpers.h @@ -10,6 +10,8 @@ #include +#include +#include #include #include #include @@ -105,6 +107,13 @@ inline void writeStringBinary(const std::string & s, WriteBuffer & buf) buf.write(s.data(), s.size()); } +/// For historical reasons we store IPv6 as a String +inline void writeIPv6Binary(const IPv6 & ip, WriteBuffer & buf) +{ + writeVarUInt(IPV6_BINARY_LENGTH, buf); + buf.write(reinterpret_cast(&ip.toUnderType()), IPV6_BINARY_LENGTH); +} + inline void writeStringBinary(StringRef s, WriteBuffer & buf) { writeVarUInt(s.size, buf); @@ -868,6 +877,8 @@ inline void writeBinary(const UUID & x, WriteBuffer & buf) { writePODBinary(x, b inline void writeBinary(const IPv4 & x, WriteBuffer & buf) { writePODBinary(x, buf); } inline void writeBinary(const IPv6 & x, WriteBuffer & buf) { writePODBinary(x, buf); } +inline void writeBinary(const StackTrace::FramePointers & x, WriteBuffer & buf) { writePODBinary(x, buf); } + /// Methods for outputting the value in text form for a tab-separated format. inline void writeText(is_integer auto x, WriteBuffer & buf) diff --git a/src/IO/ZstdDeflatingAppendableWriteBuffer.cpp b/src/IO/ZstdDeflatingAppendableWriteBuffer.cpp index be739c0e654..81be8d8ce4d 100644 --- a/src/IO/ZstdDeflatingAppendableWriteBuffer.cpp +++ b/src/IO/ZstdDeflatingAppendableWriteBuffer.cpp @@ -1,5 +1,6 @@ #include #include +#include "IO/ReadBufferFromFileBase.h" #include namespace DB @@ -11,14 +12,16 @@ namespace ErrorCodes } ZstdDeflatingAppendableWriteBuffer::ZstdDeflatingAppendableWriteBuffer( - std::unique_ptr out_, + std::unique_ptr out_, int compression_level, bool append_to_existing_file_, + std::function()> read_buffer_creator_, size_t buf_size, char * existing_memory, size_t alignment) : BufferWithOwnMemory(buf_size, existing_memory, alignment) , out(std::move(out_)) + , read_buffer_creator(std::move(read_buffer_creator_)) , append_to_existing_file(append_to_existing_file_) { cctx = ZSTD_createCCtx(); @@ -194,13 +197,13 @@ void ZstdDeflatingAppendableWriteBuffer::addEmptyBlock() bool ZstdDeflatingAppendableWriteBuffer::isNeedToAddEmptyBlock() { - ReadBufferFromFile reader(out->getFileName()); - auto fsize = reader.getFileSize(); + auto reader = read_buffer_creator(); + auto fsize = reader->getFileSize(); if (fsize > 3) { std::array result; - reader.seek(fsize - 3, SEEK_SET); - reader.readStrict(result.data(), 3); + reader->seek(fsize - 3, SEEK_SET); + reader->readStrict(result.data(), 3); /// If we don't have correct block in the end, then we need to add it manually. /// NOTE: maybe we can have the same bytes in case of data corruption/unfinished write. diff --git a/src/IO/ZstdDeflatingAppendableWriteBuffer.h b/src/IO/ZstdDeflatingAppendableWriteBuffer.h index a0715480737..d9c4f32d6da 100644 --- a/src/IO/ZstdDeflatingAppendableWriteBuffer.h +++ b/src/IO/ZstdDeflatingAppendableWriteBuffer.h @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -29,9 +30,10 @@ public: static inline constexpr ZSTDLastBlock ZSTD_CORRECT_TERMINATION_LAST_BLOCK = {0x01, 0x00, 0x00}; ZstdDeflatingAppendableWriteBuffer( - std::unique_ptr out_, + std::unique_ptr out_, int compression_level, bool append_to_existing_file_, + std::function()> read_buffer_creator_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0); @@ -68,7 +70,8 @@ private: /// Adding zstd empty block (ZSTD_CORRECT_TERMINATION_LAST_BLOCK) to out.working_buffer void addEmptyBlock(); - std::unique_ptr out; + std::unique_ptr out; + std::function()> read_buffer_creator; bool append_to_existing_file = false; ZSTD_CCtx * cctx; diff --git a/src/Interpreters/ActionsVisitor.cpp b/src/Interpreters/ActionsVisitor.cpp index cfbe53b5e4d..10502b7e66d 100644 --- a/src/Interpreters/ActionsVisitor.cpp +++ b/src/Interpreters/ActionsVisitor.cpp @@ -1435,6 +1435,9 @@ FutureSetPtr ActionsMatcher::makeSet(const ASTFunction & node, Data & data, bool if (table) { + if (auto set = data.prepared_sets->findStorage(set_key)) + return set; + if (StorageSet * storage_set = dynamic_cast(table.get())) return data.prepared_sets->addFromStorage(set_key, storage_set->getSet()); } diff --git a/src/Interpreters/Cache/FileCache.cpp b/src/Interpreters/Cache/FileCache.cpp index eb5b59a447d..de8ae33433a 100644 --- a/src/Interpreters/Cache/FileCache.cpp +++ b/src/Interpreters/Cache/FileCache.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -54,9 +55,10 @@ FileCache::FileCache(const FileCacheSettings & settings) : max_file_segment_size(settings.max_file_segment_size) , bypass_cache_threshold(settings.enable_bypass_cache_with_threashold ? settings.bypass_cache_threashold : 0) , delayed_cleanup_interval_ms(settings.delayed_cleanup_interval_ms) + , boundary_alignment(settings.boundary_alignment) + , background_download_threads(settings.background_download_threads) , log(&Poco::Logger::get("FileCache")) , metadata(settings.base_path) - , boundary_alignment(settings.boundary_alignment) { main_priority = std::make_unique(settings.max_size, settings.max_elements); @@ -129,6 +131,9 @@ void FileCache::initialize() is_initialized = true; + for (size_t i = 0; i < background_download_threads; ++i) + download_threads.emplace_back([this] { metadata.downloadThreadFunc(); }); + cleanup_task = Context::getGlobalContextInstance()->getSchedulePool().createTask("FileCacheCleanup", [this]{ cleanupThreadFunc(); }); cleanup_task->activate(); cleanup_task->scheduleAfter(delayed_cleanup_interval_ms); @@ -423,7 +428,12 @@ FileSegmentsHolderPtr FileCache::set( } FileSegmentsHolderPtr -FileCache::getOrSet(const Key & key, size_t offset, size_t size, size_t file_size, const CreateFileSegmentSettings & settings) +FileCache::getOrSet( + const Key & key, + size_t offset, + size_t size, + size_t file_size, + const CreateFileSegmentSettings & settings) { ProfileEventTimeIncrement watch(ProfileEvents::FilesystemCacheGetOrSetMicroseconds); @@ -645,27 +655,14 @@ bool FileCache::tryReserve(FileSegment & file_segment, const size_t size) if (segment_metadata->releasable()) { - auto segment = segment_metadata->file_segment; - if (segment->state() == FileSegment::State::DOWNLOADED) - { - const auto & key = segment->key(); + const auto & key = segment_metadata->file_segment->key(); + auto it = to_delete.find(key); + if (it == to_delete.end()) + it = to_delete.emplace(key, locked_key.getKeyMetadata()).first; + it->second.add(segment_metadata); - auto it = to_delete.find(key); - if (it == to_delete.end()) - it = to_delete.emplace(key, locked_key.getKeyMetadata()).first; - it->second.add(segment_metadata); - - freeable_space += segment_metadata->size(); - ++freeable_count; - - return PriorityIterationResult::CONTINUE; - } - - ProfileEvents::increment(ProfileEvents::FilesystemCacheEvictedFileSegments); - ProfileEvents::increment(ProfileEvents::FilesystemCacheEvictedBytes, segment->getDownloadedSize(false)); - - locked_key.removeFileSegment(segment->offset(), segment->lock()); - return PriorityIterationResult::REMOVE_AND_CONTINUE; + freeable_space += segment_metadata->size(); + ++freeable_count; } return PriorityIterationResult::CONTINUE; }; @@ -757,12 +754,14 @@ bool FileCache::tryReserve(FileSegment & file_segment, const size_t size) chassert(candidate->releasable()); const auto * segment = candidate->file_segment.get(); + auto queue_it = segment->getQueueIterator(); + chassert(queue_it); ProfileEvents::increment(ProfileEvents::FilesystemCacheEvictedFileSegments); ProfileEvents::increment(ProfileEvents::FilesystemCacheEvictedBytes, segment->range().size()); locked_key->removeFileSegment(segment->offset(), segment->lock()); - segment->getQueueIterator()->remove(cache_lock); + queue_it->remove(cache_lock); if (query_context) query_context->remove(current_key, segment->offset(), cache_lock); @@ -807,6 +806,13 @@ bool FileCache::tryReserve(FileSegment & file_segment, const size_t size) return true; } +void FileCache::removeKey(const Key & key) +{ + assertInitialized(); + auto locked_key = metadata.lockKeyMetadata(key, CacheMetadata::KeyNotFoundPolicy::THROW); + locked_key->removeAll(); +} + void FileCache::removeKeyIfExists(const Key & key) { assertInitialized(); @@ -819,7 +825,14 @@ void FileCache::removeKeyIfExists(const Key & key) /// But if we have multiple replicated zero-copy tables on the same server /// it became possible to start removing something from cache when it is used /// by other "zero-copy" tables. That is why it's not an error. - locked_key->removeAllReleasable(); + locked_key->removeAll(/* if_releasable */true); +} + +void FileCache::removeFileSegment(const Key & key, size_t offset) +{ + assertInitialized(); + auto locked_key = metadata.lockKeyMetadata(key, CacheMetadata::KeyNotFoundPolicy::THROW); + locked_key->removeFileSegment(offset); } void FileCache::removePathIfExists(const String & path) @@ -831,22 +844,12 @@ void FileCache::removeAllReleasable() { assertInitialized(); - auto lock = lockCache(); - - main_priority->iterate([&](LockedKey & locked_key, const FileSegmentMetadataPtr & segment_metadata) - { - if (segment_metadata->releasable()) - { - auto file_segment = segment_metadata->file_segment; - locked_key.removeFileSegment(file_segment->offset(), file_segment->lock()); - return PriorityIterationResult::REMOVE_AND_CONTINUE; - } - return PriorityIterationResult::CONTINUE; - }, lock); + metadata.iterate([](LockedKey & locked_key) { locked_key.removeAll(/* if_releasable */true); }); if (stash) { /// Remove all access information. + auto lock = lockCache(); stash->records.clear(); stash->queue->removeAll(lock); } @@ -916,7 +919,7 @@ void FileCache::loadMetadata() continue; } - const auto key = Key(unhexUInt(key_directory.filename().string().data())); + const auto key = Key::fromKeyString(key_directory.filename().string()); auto locked_key = metadata.lockKeyMetadata(key, CacheMetadata::KeyNotFoundPolicy::CREATE_EMPTY, /* is_initial_load */true); for (fs::directory_iterator offset_it{key_directory}; offset_it != fs::directory_iterator(); ++offset_it) @@ -1025,6 +1028,11 @@ void FileCache::deactivateBackgroundOperations() { if (cleanup_task) cleanup_task->deactivate(); + + metadata.cancelDownload(); + for (auto & thread : download_threads) + if (thread.joinable()) + thread.join(); } void FileCache::cleanup() @@ -1036,10 +1044,6 @@ void FileCache::cleanupThreadFunc() { try { -#ifdef ABORT_ON_LOGICAL_ERROR - assertCacheCorrectness(); -#endif - cleanup(); } catch (...) @@ -1070,7 +1074,7 @@ FileSegmentsHolderPtr FileCache::getSnapshot() FileSegmentsHolderPtr FileCache::getSnapshot(const Key & key) { FileSegments file_segments; - auto locked_key = metadata.lockKeyMetadata(key, CacheMetadata::KeyNotFoundPolicy::THROW); + auto locked_key = metadata.lockKeyMetadata(key, CacheMetadata::KeyNotFoundPolicy::THROW_LOGICAL); for (const auto & [_, file_segment_metadata] : *locked_key->getKeyMetadata()) file_segments.push_back(FileSegment::getSnapshot(file_segment_metadata->file_segment)); return std::make_unique(std::move(file_segments)); diff --git a/src/Interpreters/Cache/FileCache.h b/src/Interpreters/Cache/FileCache.h index a93ef669898..2e6a5094758 100644 --- a/src/Interpreters/Cache/FileCache.h +++ b/src/Interpreters/Cache/FileCache.h @@ -83,13 +83,19 @@ public: FileSegmentsHolderPtr set(const Key & key, size_t offset, size_t size, const CreateFileSegmentSettings & settings); - /// Remove files by `key`. Removes files which might be used at the moment. + /// Remove file segment by `key` and `offset`. Throws if file segment does not exist. + void removeFileSegment(const Key & key, size_t offset); + + /// Remove files by `key`. Throws if key does not exist. + void removeKey(const Key & key); + + /// Remove files by `key`. void removeKeyIfExists(const Key & key); - /// Removes files by `path`. Removes files which might be used at the moment. + /// Removes files by `path`. void removePathIfExists(const String & path); - /// Remove files by `key`. Will not remove files which are used at the moment. + /// Remove files by `key`. void removeAllReleasable(); std::vector tryGetCachePaths(const Key & key); @@ -136,6 +142,8 @@ private: const size_t max_file_segment_size; const size_t bypass_cache_threshold = 0; const size_t delayed_cleanup_interval_ms; + const size_t boundary_alignment; + const size_t background_download_threads; Poco::Logger * log; @@ -180,9 +188,9 @@ private: */ BackgroundSchedulePool::TaskHolder cleanup_task; - void assertInitialized() const; + std::vector download_threads; - size_t boundary_alignment; + void assertInitialized() const; void assertCacheCorrectness(); diff --git a/src/Interpreters/Cache/FileCacheKey.cpp b/src/Interpreters/Cache/FileCacheKey.cpp index f97cdc058aa..772fcd600bf 100644 --- a/src/Interpreters/Cache/FileCacheKey.cpp +++ b/src/Interpreters/Cache/FileCacheKey.cpp @@ -28,4 +28,9 @@ FileCacheKey FileCacheKey::random() return FileCacheKey(UUIDHelpers::generateV4().toUnderType()); } +FileCacheKey FileCacheKey::fromKeyString(const std::string & key_str) +{ + return FileCacheKey(unhexUInt(key_str.data())); +} + } diff --git a/src/Interpreters/Cache/FileCacheKey.h b/src/Interpreters/Cache/FileCacheKey.h index bab8359732c..e788cd5e7cd 100644 --- a/src/Interpreters/Cache/FileCacheKey.h +++ b/src/Interpreters/Cache/FileCacheKey.h @@ -21,6 +21,8 @@ struct FileCacheKey static FileCacheKey random(); bool operator==(const FileCacheKey & other) const { return key == other.key; } + + static FileCacheKey fromKeyString(const std::string & key_str); }; using FileCacheKeyAndOffset = std::pair; diff --git a/src/Interpreters/Cache/FileCacheSettings.cpp b/src/Interpreters/Cache/FileCacheSettings.cpp index dbe3e8ced02..455e9b44d0b 100644 --- a/src/Interpreters/Cache/FileCacheSettings.cpp +++ b/src/Interpreters/Cache/FileCacheSettings.cpp @@ -31,10 +31,9 @@ void FileCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration & throw Exception(ErrorCodes::BAD_ARGUMENTS, "Disk Cache requires non-empty `path` field (cache base path) in config"); max_elements = config.getUInt64(config_prefix + ".max_elements", FILECACHE_DEFAULT_MAX_ELEMENTS); + if (config.has(config_prefix + ".max_file_segment_size")) max_file_segment_size = parseWithSizeSuffix(config.getString(config_prefix + ".max_file_segment_size")); - else - max_file_segment_size = FILECACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE; cache_on_write_operations = config.getUInt64(config_prefix + ".cache_on_write_operations", false); enable_filesystem_query_cache_limit = config.getUInt64(config_prefix + ".enable_filesystem_query_cache_limit", false); @@ -44,10 +43,12 @@ void FileCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration & if (config.has(config_prefix + ".bypass_cache_threashold")) bypass_cache_threashold = parseWithSizeSuffix(config.getString(config_prefix + ".bypass_cache_threashold")); - else - bypass_cache_threashold = FILECACHE_BYPASS_THRESHOLD; - boundary_alignment = config.getUInt64(config_prefix + ".boundary_alignment", DBMS_DEFAULT_BUFFER_SIZE); + if (config.has(config_prefix + ".boundary_alignment")) + boundary_alignment = parseWithSizeSuffix(config.getString(config_prefix + ".boundary_alignment")); + + if (config.has(config_prefix + ".background_download_threads")) + background_download_threads = config.getUInt(config_prefix + ".background_download_threads"); delayed_cleanup_interval_ms = config.getUInt64(config_prefix + ".delayed_cleanup_interval_ms", FILECACHE_DELAYED_CLEANUP_INTERVAL_MS); } diff --git a/src/Interpreters/Cache/FileCacheSettings.h b/src/Interpreters/Cache/FileCacheSettings.h index cddb8fd459b..e56d6fcc54d 100644 --- a/src/Interpreters/Cache/FileCacheSettings.h +++ b/src/Interpreters/Cache/FileCacheSettings.h @@ -26,7 +26,8 @@ struct FileCacheSettings size_t bypass_cache_threashold = FILECACHE_BYPASS_THRESHOLD; size_t delayed_cleanup_interval_ms = FILECACHE_DELAYED_CLEANUP_INTERVAL_MS; - size_t boundary_alignment = DBMS_DEFAULT_BUFFER_SIZE; + size_t boundary_alignment = FILECACHE_DEFAULT_FILE_SEGMENT_ALIGNMENT; + size_t background_download_threads = FILECACHE_DEFAULT_BACKGROUND_DOWNLOAD_THREADS; void loadFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix); }; diff --git a/src/Interpreters/Cache/FileCache_fwd.h b/src/Interpreters/Cache/FileCache_fwd.h index 01f518d0c4e..4d4a1c1429c 100644 --- a/src/Interpreters/Cache/FileCache_fwd.h +++ b/src/Interpreters/Cache/FileCache_fwd.h @@ -4,7 +4,9 @@ namespace DB { -static constexpr int FILECACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE = 8 * 1024 * 1024; +static constexpr int FILECACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE = 32 * 1024 * 1024; /// 32Mi +static constexpr int FILECACHE_DEFAULT_FILE_SEGMENT_ALIGNMENT = 4 * 1024 * 1024; /// 4Mi +static constexpr int FILECACHE_DEFAULT_BACKGROUND_DOWNLOAD_THREADS = 2; static constexpr int FILECACHE_DEFAULT_MAX_ELEMENTS = 10000000; static constexpr int FILECACHE_DEFAULT_HITS_THRESHOLD = 0; static constexpr size_t FILECACHE_BYPASS_THRESHOLD = 256 * 1024 * 1024; diff --git a/src/Interpreters/Cache/FileSegment.cpp b/src/Interpreters/Cache/FileSegment.cpp index 95592fc7c12..d191b2a803c 100644 --- a/src/Interpreters/Cache/FileSegment.cpp +++ b/src/Interpreters/Cache/FileSegment.cpp @@ -237,8 +237,10 @@ void FileSegment::resetDownloadingStateUnlocked(const FileSegmentGuard::Lock & l /// range().size() can equal 0 in case of write-though cache. if (!is_unbound && current_downloaded_size != 0 && current_downloaded_size == range().size()) setDownloadedUnlocked(lock); - else + else if (current_downloaded_size) setDownloadState(State::PARTIALLY_DOWNLOADED, lock); + else + setDownloadState(State::EMPTY, lock); } void FileSegment::resetDownloader() @@ -303,22 +305,13 @@ void FileSegment::resetRemoteFileReader() FileSegment::RemoteFileReaderPtr FileSegment::extractRemoteFileReader() { - auto locked_key = lockKeyMetadata(false); - if (!locked_key) + auto lock = lockFileSegment(); + if (remote_file_reader && (download_state == State::DOWNLOADED + || download_state == State::PARTIALLY_DOWNLOADED_NO_CONTINUATION)) { - assert(isDetached()); return std::move(remote_file_reader); } - - auto segment_lock = lockFileSegment(); - - assert(download_state != State::DETACHED); - - bool is_last_holder = locked_key->isLastOwnerOfFileSegment(offset()); - if (!downloader_id.empty() || !is_last_holder) - return nullptr; - - return std::move(remote_file_reader); + return nullptr; } void FileSegment::setRemoteFileReader(RemoteFileReaderPtr remote_file_reader_) @@ -635,21 +628,9 @@ void FileSegment::complete() resetDownloaderUnlocked(segment_lock); } - if (is_downloader || is_last_holder) - { - if (cache_writer) - { - cache_writer->finalize(); - cache_writer.reset(); - } - remote_file_reader.reset(); - } - if (segment_kind == FileSegmentKind::Temporary && is_last_holder) { LOG_TEST(log, "Removing temporary file segment: {}", getInfoForLogUnlocked(segment_lock)); - detach(segment_lock, *locked_key); - setDownloadState(State::DETACHED, segment_lock); locked_key->removeFileSegment(offset(), segment_lock); return; } @@ -661,6 +642,7 @@ void FileSegment::complete() chassert(current_downloaded_size == range().size()); chassert(current_downloaded_size == fs::file_size(getPathInLocalCache())); chassert(!cache_writer); + chassert(!remote_file_reader); break; } case State::DOWNLOADING: @@ -669,7 +651,33 @@ void FileSegment::complete() break; } case State::EMPTY: + { + if (is_last_holder) + locked_key->removeFileSegment(offset(), segment_lock); + break; + } case State::PARTIALLY_DOWNLOADED: + { + chassert(current_downloaded_size > 0); + + if (is_last_holder) + { + if (remote_file_reader) + { + LOG_TEST( + log, "Submitting file segment for background download " + "(having {}/{})", downloaded_size, range().size()); + + locked_key->addToDownloadQueue(offset(), segment_lock); /// Finish download in background. + } + else + { + locked_key->shrinkFileSegmentToDownloadedSize(offset(), segment_lock); + setDetachedState(segment_lock); /// See comment below. + } + } + break; + } case State::PARTIALLY_DOWNLOADED_NO_CONTINUATION: { chassert(current_downloaded_size != range().size()); @@ -678,7 +686,6 @@ void FileSegment::complete() { if (current_downloaded_size == 0) { - LOG_TEST(log, "Remove file segment {} (nothing downloaded)", range().toString()); locked_key->removeFileSegment(offset(), segment_lock); } else @@ -700,9 +707,8 @@ void FileSegment::complete() /// We mark current file segment with state DETACHED, even though the data is still in cache /// (but a separate file segment) because is_last_holder is satisfied, so it does not matter. + setDetachedState(segment_lock); } - - setDetachedState(segment_lock); } break; } @@ -798,7 +804,6 @@ bool FileSegment::assertCorrectnessUnlocked(const FileSegmentGuard::Lock &) cons } chassert(reserved_size >= downloaded_size); - chassert((reserved_size == 0) || queue_iterator); check_iterator(queue_iterator); } @@ -872,6 +877,9 @@ void FileSegment::setDetachedState(const FileSegmentGuard::Lock & lock) setDownloadState(State::DETACHED, lock); key_metadata.reset(); cache = nullptr; + queue_iterator = nullptr; + cache_writer.reset(); + remote_file_reader.reset(); } void FileSegment::detach(const FileSegmentGuard::Lock & lock, const LockedKey &) @@ -890,7 +898,7 @@ void FileSegment::use() if (!cache) { - chassert(isCompleted(true)); + chassert(isDetached()); return; } @@ -898,7 +906,7 @@ void FileSegment::use() if (it) { auto cache_lock = cache->lockCache(); - it->use(cache_lock); + hits_count = it->use(cache_lock); } } diff --git a/src/Interpreters/Cache/FileSegment.h b/src/Interpreters/Cache/FileSegment.h index c583ba4c90c..44d95816915 100644 --- a/src/Interpreters/Cache/FileSegment.h +++ b/src/Interpreters/Cache/FileSegment.h @@ -156,6 +156,8 @@ public: String getPathInLocalCache() const; + int getFlagsForLocalRead() const { return O_RDONLY | O_CLOEXEC; } + /** * ========== Methods for _any_ file segment's owner ======================== */ @@ -175,8 +177,6 @@ public: size_t getRefCount() const { return ref_count; } - void incrementHitsCount() { ++hits_count; } - size_t getCurrentWriteOffset(bool sync) const; size_t getFirstNonDownloadedOffset(bool sync) const; diff --git a/src/Interpreters/Cache/IFileCachePriority.h b/src/Interpreters/Cache/IFileCachePriority.h index 10f63fce21d..34c49653ab8 100644 --- a/src/Interpreters/Cache/IFileCachePriority.h +++ b/src/Interpreters/Cache/IFileCachePriority.h @@ -44,7 +44,7 @@ public: virtual size_t use(const CacheGuard::Lock &) = 0; - virtual std::shared_ptr remove(const CacheGuard::Lock &) = 0; + virtual void remove(const CacheGuard::Lock &) = 0; virtual const Entry & getEntry() const = 0; diff --git a/src/Interpreters/Cache/LRUFileCachePriority.cpp b/src/Interpreters/Cache/LRUFileCachePriority.cpp index 6f142c0cc6d..18862e154da 100644 --- a/src/Interpreters/Cache/LRUFileCachePriority.cpp +++ b/src/Interpreters/Cache/LRUFileCachePriority.cpp @@ -166,15 +166,17 @@ void LRUFileCachePriority::iterate(IterateFunc && func, const CacheGuard::Lock & } } -LRUFileCachePriority::Iterator -LRUFileCachePriority::LRUFileCacheIterator::remove(const CacheGuard::Lock &) +void LRUFileCachePriority::LRUFileCacheIterator::remove(const CacheGuard::Lock &) { - return std::make_shared( - cache_priority, cache_priority->remove(queue_iter)); + checkUsable(); + cache_priority->remove(queue_iter); + queue_iter = LRUQueueIterator{}; } void LRUFileCachePriority::LRUFileCacheIterator::invalidate() { + checkUsable(); + LOG_TEST( cache_priority->log, "Invalidating entry in LRU queue. Key: {}, offset: {}, previous size: {}", @@ -187,6 +189,8 @@ void LRUFileCachePriority::LRUFileCacheIterator::invalidate() void LRUFileCachePriority::LRUFileCacheIterator::updateSize(int64_t size) { + checkUsable(); + LOG_TEST( cache_priority->log, "Update size with {} in LRU queue for key: {}, offset: {}, previous size: {}", @@ -198,8 +202,15 @@ void LRUFileCachePriority::LRUFileCacheIterator::updateSize(int64_t size) size_t LRUFileCachePriority::LRUFileCacheIterator::use(const CacheGuard::Lock &) { + checkUsable(); cache_priority->queue.splice(cache_priority->queue.end(), cache_priority->queue, queue_iter); return ++queue_iter->hits; } +void LRUFileCachePriority::LRUFileCacheIterator::checkUsable() const +{ + if (queue_iter == LRUQueueIterator{}) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to use invalid iterator"); +} + } diff --git a/src/Interpreters/Cache/LRUFileCachePriority.h b/src/Interpreters/Cache/LRUFileCachePriority.h index 8dc4eb0a016..e0d7d45062a 100644 --- a/src/Interpreters/Cache/LRUFileCachePriority.h +++ b/src/Interpreters/Cache/LRUFileCachePriority.h @@ -60,13 +60,15 @@ public: size_t use(const CacheGuard::Lock &) override; - Iterator remove(const CacheGuard::Lock &) override; + void remove(const CacheGuard::Lock &) override; void invalidate() override; void updateSize(int64_t size) override; private: + void checkUsable() const; + LRUFileCachePriority * cache_priority; mutable LRUFileCachePriority::LRUQueueIterator queue_iter; }; diff --git a/src/Interpreters/Cache/Metadata.cpp b/src/Interpreters/Cache/Metadata.cpp index 8c8524f7fa7..0a2d58432e4 100644 --- a/src/Interpreters/Cache/Metadata.cpp +++ b/src/Interpreters/Cache/Metadata.cpp @@ -1,12 +1,18 @@ #include #include #include +#include "Common/Exception.h" #include #include #include namespace fs = std::filesystem; +namespace CurrentMetrics +{ + extern const Metric FilesystemCacheDownloadQueueElements; +} + namespace ProfileEvents { extern const Event FilesystemCacheLockKeyMicroseconds; @@ -19,6 +25,7 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int BAD_ARGUMENTS; } FileSegmentMetadata::FileSegmentMetadata(FileSegmentPtr && file_segment_) @@ -53,11 +60,13 @@ KeyMetadata::KeyMetadata( const Key & key_, const std::string & key_path_, CleanupQueue & cleanup_queue_, + DownloadQueue & download_queue_, Poco::Logger * log_, bool created_base_directory_) : key(key_) , key_path(key_path_) , cleanup_queue(cleanup_queue_) + , download_queue(download_queue_) , created_base_directory(created_base_directory_) , log(log_) { @@ -134,6 +143,7 @@ private: CacheMetadata::CacheMetadata(const std::string & path_) : path(path_) , cleanup_queue(std::make_unique()) + , download_queue(std::make_unique()) , log(&Poco::Logger::get("CacheMetadata")) { } @@ -147,7 +157,6 @@ String CacheMetadata::getFileNameForFileSegment(size_t offset, FileSegmentKind s file_suffix = "_temporary"; break; case FileSegmentKind::Regular: - file_suffix = ""; break; } return std::to_string(offset) + file_suffix; @@ -183,13 +192,15 @@ LockedKeyPtr CacheMetadata::lockKeyMetadata( if (it == end()) { if (key_not_found_policy == KeyNotFoundPolicy::THROW) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "No such key `{}` in cache", key); + else if (key_not_found_policy == KeyNotFoundPolicy::THROW_LOGICAL) throw Exception(ErrorCodes::LOGICAL_ERROR, "No such key `{}` in cache", key); else if (key_not_found_policy == KeyNotFoundPolicy::RETURN_NULL) return nullptr; it = emplace( key, std::make_shared( - key, getPathForKey(key), *cleanup_queue, log, is_initial_load)).first; + key, getPathForKey(key), *cleanup_queue, *download_queue, log, is_initial_load)).first; } key_metadata = it->second; @@ -207,6 +218,8 @@ LockedKeyPtr CacheMetadata::lockKeyMetadata( return locked_metadata; if (key_not_found_policy == KeyNotFoundPolicy::THROW) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "No such key `{}` in cache", key); + else if (key_not_found_policy == KeyNotFoundPolicy::THROW_LOGICAL) throw Exception(ErrorCodes::LOGICAL_ERROR, "No such key `{}` in cache", key); if (key_not_found_policy == KeyNotFoundPolicy::RETURN_NULL) @@ -280,6 +293,7 @@ void CacheMetadata::doCleanup() continue; } + chassert(it->second->empty()); locked_metadata->markAsRemoved(); erase(it); LOG_DEBUG(log, "Key {} is removed from metadata", cleanup_key); @@ -320,6 +334,199 @@ void CacheMetadata::doCleanup() } } +class DownloadQueue +{ +friend struct CacheMetadata; +public: + void add(FileSegmentPtr file_segment) + { + { + std::lock_guard lock(mutex); + queue.emplace(file_segment->key(), file_segment->offset(), file_segment); + } + + CurrentMetrics::add(CurrentMetrics::FilesystemCacheDownloadQueueElements); + cv.notify_one(); + } + +private: + void cancel() + { + { + std::lock_guard lock(mutex); + cancelled = true; + } + cv.notify_all(); + } + + std::mutex mutex; + std::condition_variable cv; + bool cancelled = false; + + struct DownloadInfo + { + CacheMetadata::Key key; + size_t offset; + /// We keep weak pointer to file segment + /// instead of just getting it from file_segment_metadata, + /// because file segment at key:offset count be removed and added back to metadata + /// before we actually started background download. + std::weak_ptr file_segment; + }; + std::queue queue; +}; + +void CacheMetadata::downloadThreadFunc() +{ + std::optional> memory; + while (true) + { + Key key; + size_t offset; + std::weak_ptr file_segment_weak; + + { + std::unique_lock lock(download_queue->mutex); + + if (download_queue->cancelled) + return; + + if (download_queue->queue.empty()) + { + download_queue->cv.wait(lock); + continue; + } + + auto entry = download_queue->queue.front(); + key = entry.key; + offset = entry.offset; + file_segment_weak = entry.file_segment; + + download_queue->queue.pop(); + } + + CurrentMetrics::sub(CurrentMetrics::FilesystemCacheDownloadQueueElements); + + FileSegmentsHolderPtr holder; + try + { + { + auto locked_key = lockKeyMetadata(key, KeyNotFoundPolicy::RETURN_NULL); + if (!locked_key) + continue; + + auto file_segment_metadata = locked_key->tryGetByOffset(offset); + if (!file_segment_metadata || file_segment_metadata->evicting()) + continue; + + auto file_segment = file_segment_weak.lock(); + + if (!file_segment + || file_segment != file_segment_metadata->file_segment + || file_segment->state() != FileSegment::State::PARTIALLY_DOWNLOADED) + continue; + + holder = std::make_unique(FileSegments{file_segment}); + } + + downloadImpl(holder->front(), memory); + } + catch (...) + { + if (holder) + { + const auto & file_segment = holder->front(); + LOG_ERROR( + log, "Error during background download of {}:{} ({}): {}", + file_segment.key(), file_segment.offset(), + file_segment.getInfoForLog(), getCurrentExceptionMessage(true)); + } + else + { + tryLogCurrentException(__PRETTY_FUNCTION__); + chassert(false); + } + } + } +} + +void CacheMetadata::downloadImpl(FileSegment & file_segment, std::optional> & memory) +{ + chassert(file_segment.assertCorrectness()); + + if (file_segment.getOrSetDownloader() != FileSegment::getCallerId()) + return; + + if (file_segment.getDownloadedSize(false) == file_segment.range().size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "File segment is already fully downloaded"); + + LOG_TEST( + log, "Downloading {} bytes for file segment {}", + file_segment.range().size() - file_segment.getDownloadedSize(false), file_segment.getInfoForLog()); + + auto reader = file_segment.getRemoteFileReader(); + + if (!reader) + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, "No reader. " + "File segment should not have been submitted for background download ({})", + file_segment.getInfoForLog()); + } + + /// If remote_fs_read_method == 'threadpool', + /// reader itself never owns/allocates the buffer. + if (reader->internalBuffer().empty()) + { + if (!memory) + memory.emplace(DBMS_DEFAULT_BUFFER_SIZE); + reader->set(memory->data(), memory->size()); + } + + size_t offset = file_segment.getCurrentWriteOffset(false); + if (offset != static_cast(reader->getPosition())) + reader->seek(offset, SEEK_SET); + + while (!reader->eof()) + { + auto size = reader->available(); + + if (!file_segment.reserve(size)) + { + LOG_TEST( + log, "Failed to reserve space during background download " + "for {}:{} (downloaded size: {}/{})", + file_segment.key(), file_segment.offset(), + file_segment.getDownloadedSize(false), file_segment.range().size()); + return; + } + + try + { + file_segment.write(reader->position(), size, offset); + offset += size; + reader->position() += size; + } + catch (ErrnoException & e) + { + int code = e.getErrno(); + if (code == /* No space left on device */28 || code == /* Quota exceeded */122) + { + LOG_INFO(log, "Insert into cache is skipped due to insufficient disk space. ({})", e.displayText()); + return; + } + throw; + } + } + + LOG_TEST(log, "Downloaded file segment: {}", file_segment.getInfoForLog()); +} + +void CacheMetadata::cancelDownload() +{ + download_queue->cancel(); +} + LockedKey::LockedKey(std::shared_ptr key_metadata_) : key_metadata(key_metadata_) , lock(key_metadata->guard.lock()) @@ -356,11 +563,11 @@ bool LockedKey::isLastOwnerOfFileSegment(size_t offset) const return file_segment_metadata->file_segment.use_count() == 2; } -void LockedKey::removeAllReleasable() +void LockedKey::removeAll(bool if_releasable) { for (auto it = key_metadata->begin(); it != key_metadata->end();) { - if (!it->second->releasable()) + if (if_releasable && !it->second->releasable()) { ++it; continue; @@ -381,34 +588,58 @@ void LockedKey::removeAllReleasable() } } +KeyMetadata::iterator LockedKey::removeFileSegment(size_t offset) +{ + auto it = key_metadata->find(offset); + if (it == key_metadata->end()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "There is no offset {}", offset); + + auto file_segment = it->second->file_segment; + return removeFileSegmentImpl(it, file_segment->lock()); +} + KeyMetadata::iterator LockedKey::removeFileSegment(size_t offset, const FileSegmentGuard::Lock & segment_lock) { auto it = key_metadata->find(offset); if (it == key_metadata->end()) throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no offset {}", offset); + return removeFileSegmentImpl(it, segment_lock); +} + +KeyMetadata::iterator LockedKey::removeFileSegmentImpl(KeyMetadata::iterator it, const FileSegmentGuard::Lock & segment_lock) +{ auto file_segment = it->second->file_segment; LOG_DEBUG( key_metadata->log, "Remove from cache. Key: {}, offset: {}, size: {}", - getKey(), offset, file_segment->reserved_size); + getKey(), file_segment->offset(), file_segment->reserved_size); chassert(file_segment->assertCorrectnessUnlocked(segment_lock)); if (file_segment->queue_iterator) file_segment->queue_iterator->invalidate(); + file_segment->detach(segment_lock, *this); + const auto path = key_metadata->getFileSegmentPath(*file_segment); bool exists = fs::exists(path); if (exists) { fs::remove(path); + + /// Clear OpenedFileCache to avoid reading from incorrect file descriptor. + int flags = file_segment->getFlagsForLocalRead(); + /// Files are created with flags from file_segment->getFlagsForLocalRead() + /// plus optionally O_DIRECT is added, depends on query setting, so remove both. + OpenedFileCache::instance().remove(path, flags); + OpenedFileCache::instance().remove(path, flags | O_DIRECT); + LOG_TEST(key_metadata->log, "Removed file segment at path: {}", path); } else if (file_segment->downloaded_size) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected path {} to exist", path); - file_segment->detach(segment_lock, *this); return key_metadata->erase(it); } @@ -448,6 +679,14 @@ void LockedKey::shrinkFileSegmentToDownloadedSize( chassert(file_segment->assertCorrectnessUnlocked(segment_lock)); } +void LockedKey::addToDownloadQueue(size_t offset, const FileSegmentGuard::Lock &) +{ + auto it = key_metadata->find(offset); + if (it == key_metadata->end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "There is not offset {}", offset); + key_metadata->download_queue.add(it->second->file_segment); +} + std::optional LockedKey::hasIntersectingRange(const FileSegment::Range & range) const { if (key_metadata->empty()) diff --git a/src/Interpreters/Cache/Metadata.h b/src/Interpreters/Cache/Metadata.h index 8ee40aa977f..42d74338e12 100644 --- a/src/Interpreters/Cache/Metadata.h +++ b/src/Interpreters/Cache/Metadata.h @@ -8,8 +8,12 @@ namespace DB { + class CleanupQueue; using CleanupQueuePtr = std::shared_ptr; +class DownloadQueue; +using DownloadQueuePtr = std::shared_ptr; +using FileSegmentsHolderPtr = std::unique_ptr; struct FileSegmentMetadata : private boost::noncopyable @@ -44,6 +48,7 @@ struct KeyMetadata : public std::map, const Key & key_, const std::string & key_path_, CleanupQueue & cleanup_queue_, + DownloadQueue & download_queue_, Poco::Logger * log_, bool created_base_directory_ = false); @@ -70,6 +75,7 @@ private: KeyState key_state = KeyState::ACTIVE; KeyGuard guard; CleanupQueue & cleanup_queue; + DownloadQueue & download_queue; std::atomic created_base_directory = false; Poco::Logger * log; }; @@ -81,7 +87,7 @@ struct CacheMetadata : public std::unordered_map, { public: using Key = FileCacheKey; - using IterateCacheMetadataFunc = std::function; + using IterateCacheMetadataFunc = std::function; explicit CacheMetadata(const std::string & path_); @@ -100,6 +106,7 @@ public: enum class KeyNotFoundPolicy { THROW, + THROW_LOGICAL, CREATE_EMPTY, RETURN_NULL, }; @@ -111,12 +118,19 @@ public: void doCleanup(); + void downloadThreadFunc(); + + void cancelDownload(); + private: CacheMetadataGuard::Lock lockMetadata() const; const std::string path; /// Cache base path mutable CacheMetadataGuard guard; const CleanupQueuePtr cleanup_queue; + const DownloadQueuePtr download_queue; Poco::Logger * log; + + void downloadImpl(FileSegment & file_segment, std::optional> & memory); }; @@ -156,12 +170,15 @@ struct LockedKey : private boost::noncopyable std::shared_ptr getKeyMetadata() const { return key_metadata; } std::shared_ptr getKeyMetadata() { return key_metadata; } - void removeAllReleasable(); + void removeAll(bool if_releasable = true); KeyMetadata::iterator removeFileSegment(size_t offset, const FileSegmentGuard::Lock &); + KeyMetadata::iterator removeFileSegment(size_t offset); void shrinkFileSegmentToDownloadedSize(size_t offset, const FileSegmentGuard::Lock &); + void addToDownloadQueue(size_t offset, const FileSegmentGuard::Lock &); + bool isLastOwnerOfFileSegment(size_t offset) const; std::optional hasIntersectingRange(const FileSegment::Range & range) const; @@ -173,6 +190,8 @@ struct LockedKey : private boost::noncopyable std::string toString() const; private: + KeyMetadata::iterator removeFileSegmentImpl(KeyMetadata::iterator it, const FileSegmentGuard::Lock &); + const std::shared_ptr key_metadata; KeyGuard::Lock lock; /// `lock` must be destructed before `key_metadata`. }; diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 82d5d713d33..6cbb0e58911 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -1,5 +1,7 @@ #pragma once +#ifndef CLICKHOUSE_PROGRAM_STANDALONE_BUILD + #include #include #include @@ -1240,3 +1242,9 @@ struct HTTPContext : public IHTTPContext }; } + +#else + +#include + +#endif diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index 40ef2bd4775..e0b6348ed3c 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -217,6 +217,8 @@ void DatabaseCatalog::shutdownImpl() /// We still hold "databases" (instead of std::move) for Buffer tables to flush data correctly. /// Delay shutdown of temporary and system databases. They will be shutdown last. + /// Because some databases might use them until their shutdown is called, but calling shutdown + /// on temporary database means clearing its set of tables, which will lead to unnecessary errors like "table not found". std::vector databases_with_delayed_shutdown; for (auto & database : current_databases) { @@ -354,7 +356,8 @@ DatabaseAndTable DatabaseCatalog::getTableImpl( auto table = database->tryGetTable(table_id.table_name, context_); if (!table && exception) - exception->emplace(Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} doesn't exist", table_id.getNameForLogs())); + exception->emplace(Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} doesn't exist", table_id.getNameForLogs())); + if (!table) database = nullptr; diff --git a/src/Interpreters/FilesystemCacheLog.cpp b/src/Interpreters/FilesystemCacheLog.cpp index 17f0fda71ec..b660db064d1 100644 --- a/src/Interpreters/FilesystemCacheLog.cpp +++ b/src/Interpreters/FilesystemCacheLog.cpp @@ -40,6 +40,8 @@ NamesAndTypesList FilesystemCacheLogElement::getNamesAndTypes() {"source_file_path", std::make_shared()}, {"file_segment_range", std::make_shared(types)}, {"total_requested_range", std::make_shared(types)}, + {"key", std::make_shared()}, + {"offset", std::make_shared()}, {"size", std::make_shared()}, {"read_type", std::make_shared()}, {"read_from_cache_attempted", std::make_shared()}, @@ -60,6 +62,8 @@ void FilesystemCacheLogElement::appendToBlock(MutableColumns & columns) const columns[i++]->insert(source_file_path); columns[i++]->insert(Tuple{file_segment_range.first, file_segment_range.second}); columns[i++]->insert(Tuple{requested_range.first, requested_range.second}); + columns[i++]->insert(file_segment_key); + columns[i++]->insert(file_segment_offset); columns[i++]->insert(file_segment_size); columns[i++]->insert(typeToString(cache_type)); columns[i++]->insert(read_from_cache_attempted); diff --git a/src/Interpreters/FilesystemCacheLog.h b/src/Interpreters/FilesystemCacheLog.h index 1b22d561c51..d6dd00e5463 100644 --- a/src/Interpreters/FilesystemCacheLog.h +++ b/src/Interpreters/FilesystemCacheLog.h @@ -39,6 +39,8 @@ struct FilesystemCacheLogElement std::pair file_segment_range{}; std::pair requested_range{}; CacheType cache_type{}; + std::string file_segment_key; + size_t file_segment_offset; size_t file_segment_size; bool read_from_cache_attempted; String read_buffer_id; diff --git a/src/Interpreters/GraceHashJoin.cpp b/src/Interpreters/GraceHashJoin.cpp index 197b64865e1..4218a8ea4e1 100644 --- a/src/Interpreters/GraceHashJoin.cpp +++ b/src/Interpreters/GraceHashJoin.cpp @@ -385,11 +385,23 @@ GraceHashJoin::Buckets GraceHashJoin::rehashBuckets(size_t to_size) void GraceHashJoin::addBucket(Buckets & destination) { - auto & left_file = tmp_data->createStream(left_sample_block); - auto & right_file = tmp_data->createStream(prepareRightBlock(right_sample_block)); + // There could be exceptions from createStream, In ci tests + // there is a certain probability of failure in allocating memory, see memory_tracker_fault_probability. + // It may terminate this thread and leave a broken hash_join, and another thread cores when it tries to + // use the broken hash_join. So we print an exception message here to help debug. + try + { + auto & left_file = tmp_data->createStream(left_sample_block); + auto & right_file = tmp_data->createStream(prepareRightBlock(right_sample_block)); - BucketPtr new_bucket = std::make_shared(destination.size(), left_file, right_file, log); - destination.emplace_back(std::move(new_bucket)); + BucketPtr new_bucket = std::make_shared(destination.size(), left_file, right_file, log); + destination.emplace_back(std::move(new_bucket)); + } + catch (...) + { + LOG_ERROR(&Poco::Logger::get("GraceHashJoin"), "Can't create bucket. current buckets size: {}", destination.size()); + throw; + } } void GraceHashJoin::checkTypesOfKeys(const Block & block) const @@ -626,7 +638,11 @@ void GraceHashJoin::addJoinedBlockImpl(Block block) if (current_block.rows() > 0) { std::lock_guard lock(hash_join_mutex); - + auto current_buckets = getCurrentBuckets(); + if (!isPowerOf2(current_buckets.size())) [[unlikely]] + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Broken buckets. its size({}) is not power of 2", current_buckets.size()); + } if (!hash_join) hash_join = makeInMemoryJoin(); @@ -637,11 +653,11 @@ void GraceHashJoin::addJoinedBlockImpl(Block block) current_block = {}; + // Must use the latest buckets snapshot in case that it has been rehashed by other threads. + buckets_snapshot = rehashBuckets(current_buckets.size() * 2); auto right_blocks = hash_join->releaseJoinedBlocks(/* restructure */ false); hash_join = nullptr; - buckets_snapshot = rehashBuckets(buckets_snapshot.size() * 2); - { Blocks current_blocks; current_blocks.reserve(right_blocks.size()); diff --git a/src/Interpreters/InterpreterDescribeCacheQuery.cpp b/src/Interpreters/InterpreterDescribeCacheQuery.cpp index 1d2c567d16f..7822ecdb8be 100644 --- a/src/Interpreters/InterpreterDescribeCacheQuery.cpp +++ b/src/Interpreters/InterpreterDescribeCacheQuery.cpp @@ -19,11 +19,15 @@ static Block getSampleBlock() ColumnWithTypeAndName{std::make_shared(), "max_size"}, ColumnWithTypeAndName{std::make_shared(), "max_elements"}, ColumnWithTypeAndName{std::make_shared(), "max_file_segment_size"}, + ColumnWithTypeAndName{std::make_shared(), "boundary_alignment"}, ColumnWithTypeAndName{std::make_shared>(), "cache_on_write_operations"}, ColumnWithTypeAndName{std::make_shared>(), "cache_hits_threshold"}, ColumnWithTypeAndName{std::make_shared(), "current_size"}, ColumnWithTypeAndName{std::make_shared(), "current_elements"}, ColumnWithTypeAndName{std::make_shared(), "path"}, + ColumnWithTypeAndName{std::make_shared>(), "delayed_cleanup_interval_ms"}, + ColumnWithTypeAndName{std::make_shared>(), "background_download_threads"}, + ColumnWithTypeAndName{std::make_shared>(), "enable_bypass_cache_with_threshold"}, }; return Block(columns); } @@ -40,14 +44,19 @@ BlockIO InterpreterDescribeCacheQuery::execute() const auto & settings = cache_data.settings; const auto & cache = cache_data.cache; - res_columns[0]->insert(settings.max_size); - res_columns[1]->insert(settings.max_elements); - res_columns[2]->insert(settings.max_file_segment_size); - res_columns[3]->insert(settings.cache_on_write_operations); - res_columns[4]->insert(settings.cache_hits_threshold); - res_columns[5]->insert(cache->getUsedCacheSize()); - res_columns[6]->insert(cache->getFileSegmentsNum()); - res_columns[7]->insert(cache->getBasePath()); + size_t i = 0; + res_columns[i++]->insert(settings.max_size); + res_columns[i++]->insert(settings.max_elements); + res_columns[i++]->insert(settings.max_file_segment_size); + res_columns[i++]->insert(settings.boundary_alignment); + res_columns[i++]->insert(settings.cache_on_write_operations); + res_columns[i++]->insert(settings.cache_hits_threshold); + res_columns[i++]->insert(cache->getUsedCacheSize()); + res_columns[i++]->insert(cache->getFileSegmentsNum()); + res_columns[i++]->insert(cache->getBasePath()); + res_columns[i++]->insert(settings.delayed_cleanup_interval_ms); + res_columns[i++]->insert(settings.background_download_threads); + res_columns[i++]->insert(settings.enable_bypass_cache_with_threashold); BlockIO res; size_t num_rows = res_columns[0]->size(); diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index f2d011b12d1..e1ff8676bc7 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -370,7 +370,18 @@ BlockIO InterpreterSystemQuery::execute() else { auto cache = FileCacheFactory::instance().getByName(query.filesystem_cache_name).cache; - cache->removeAllReleasable(); + if (query.delete_key.empty()) + { + cache->removeAllReleasable(); + } + else + { + auto key = FileCacheKey::fromKeyString(query.delete_key); + if (query.delete_offset.has_value()) + cache->removeFileSegment(key, query.delete_offset.value()); + else + cache->removeKey(key); + } } break; } diff --git a/src/Interpreters/ThreadStatusExt.cpp b/src/Interpreters/ThreadStatusExt.cpp index 6a4f4576eca..5acfe500b1d 100644 --- a/src/Interpreters/ThreadStatusExt.cpp +++ b/src/Interpreters/ThreadStatusExt.cpp @@ -158,6 +158,17 @@ void CurrentThread::attachQueryForLog(const String & query_) current_thread->attachQueryForLog(query_); } +void ThreadStatus::applyGlobalSettings() +{ + auto global_context_ptr = global_context.lock(); + if (!global_context_ptr) + return; + + const Settings & settings = global_context_ptr->getSettingsRef(); + + DB::Exception::enable_job_stack_trace = settings.enable_job_stack_trace; +} + void ThreadStatus::applyQuerySettings() { auto query_context_ptr = query_context.lock(); @@ -166,6 +177,8 @@ void ThreadStatus::applyQuerySettings() const Settings & settings = query_context_ptr->getSettingsRef(); + DB::Exception::enable_job_stack_trace = settings.enable_job_stack_trace; + query_id_from_query_context = query_context_ptr->getCurrentQueryId(); initQueryProfiler(); @@ -204,6 +217,7 @@ void ThreadStatus::attachToGroupImpl(const ThreadGroupPtr & thread_group_) local_data = thread_group->getSharedData(); + applyGlobalSettings(); applyQuerySettings(); initPerformanceCounters(); } diff --git a/src/Interpreters/tests/gtest_lru_file_cache.cpp b/src/Interpreters/tests/gtest_lru_file_cache.cpp index 3cba1e48e1e..b9d12c8ed42 100644 --- a/src/Interpreters/tests/gtest_lru_file_cache.cpp +++ b/src/Interpreters/tests/gtest_lru_file_cache.cpp @@ -22,6 +22,8 @@ #include #include +#include +#include #include namespace fs = std::filesystem; @@ -475,7 +477,7 @@ TEST_F(FileCacheTest, get) } cv.notify_one(); - file_segment2.wait(file_segment2.range().left); + file_segment2.wait(file_segment2.range().right); file_segment2.complete(); ASSERT_TRUE(file_segment2.state() == State::DOWNLOADED); }); @@ -542,8 +544,8 @@ TEST_F(FileCacheTest, get) cv.notify_one(); file_segment2.wait(file_segment2.range().left); - ASSERT_TRUE(file_segment2.state() == DB::FileSegment::State::PARTIALLY_DOWNLOADED); - ASSERT_TRUE(file_segment2.getOrSetDownloader() == DB::FileSegment::getCallerId()); + ASSERT_EQ(file_segment2.state(), DB::FileSegment::State::EMPTY); + ASSERT_EQ(file_segment2.getOrSetDownloader(), DB::FileSegment::getCallerId()); download(file_segment2); }); @@ -862,3 +864,78 @@ TEST_F(FileCacheTest, temporaryData) ASSERT_LE(file_cache.getUsedCacheSize(), size_used_before_temporary_data); ASSERT_LE(file_cache.getFileSegmentsNum(), segments_used_before_temporary_data); } + +TEST_F(FileCacheTest, CachedReadBuffer) +{ + DB::ThreadStatus thread_status; + + /// To work with cache need query_id and query context. + std::string query_id = "query_id"; + + Poco::XML::DOMParser dom_parser; + std::string xml(R"CONFIG( +)CONFIG"); + Poco::AutoPtr document = dom_parser.parseString(xml); + Poco::AutoPtr config = new Poco::Util::XMLConfiguration(document); + getMutableContext().context->setConfig(config); + + auto query_context = DB::Context::createCopy(getContext().context); + query_context->makeQueryContext(); + query_context->setCurrentQueryId(query_id); + chassert(&DB::CurrentThread::get() == &thread_status); + DB::CurrentThread::QueryScope query_scope_holder(query_context); + + DB::FileCacheSettings settings; + settings.base_path = cache_base_path; + settings.max_file_segment_size = 5; + settings.max_size = 30; + settings.max_elements = 10; + settings.boundary_alignment = 1; + + ReadSettings read_settings; + read_settings.enable_filesystem_cache = true; + read_settings.local_fs_method = LocalFSReadMethod::pread; + + std::string file_path = fs::current_path() / "test"; + auto read_buffer_creator = [&]() + { + return createReadBufferFromFileBase(file_path, read_settings, std::nullopt, std::nullopt); + }; + + auto wb = std::make_unique(file_path, DBMS_DEFAULT_BUFFER_SIZE); + std::string s(30, '*'); + wb->write(s.data(), s.size()); + wb->next(); + wb->finalize(); + + auto cache = std::make_shared(settings); + cache->initialize(); + auto key = cache->createKeyForPath(file_path); + + { + auto cached_buffer = std::make_shared( + file_path, key, cache, read_buffer_creator, read_settings, "test", s.size(), false, false, std::nullopt, nullptr); + + WriteBufferFromOwnString result; + copyData(*cached_buffer, result); + ASSERT_EQ(result.str(), s); + + assertEqual(cache->dumpQueue(), { Range(0, 4), Range(5, 9), Range(10, 14), Range(15, 19), Range(20, 24), Range(25, 29) }); + } + + { + ReadSettings modified_settings{read_settings}; + modified_settings.local_fs_buffer_size = 10; + modified_settings.remote_fs_buffer_size = 10; + + auto cached_buffer = std::make_shared( + file_path, key, cache, read_buffer_creator, modified_settings, "test", s.size(), false, false, std::nullopt, nullptr); + + cached_buffer->next(); + assertEqual(cache->dumpQueue(), { Range(5, 9), Range(10, 14), Range(15, 19), Range(20, 24), Range(25, 29), Range(0, 4) }); + + cached_buffer->position() = cached_buffer->buffer().end(); + cached_buffer->next(); + assertEqual(cache->dumpQueue(), {Range(10, 14), Range(15, 19), Range(20, 24), Range(25, 29), Range(0, 4), Range(5, 9) }); + } +} diff --git a/src/Parsers/ASTSystemQuery.cpp b/src/Parsers/ASTSystemQuery.cpp index a91449ff035..9c5e7bff61e 100644 --- a/src/Parsers/ASTSystemQuery.cpp +++ b/src/Parsers/ASTSystemQuery.cpp @@ -210,7 +210,15 @@ void ASTSystemQuery::formatImpl(const FormatSettings & settings, FormatState &, else if (type == Type::DROP_FILESYSTEM_CACHE) { if (!filesystem_cache_name.empty()) + { settings.ostr << (settings.hilite ? hilite_none : "") << " " << filesystem_cache_name; + if (!delete_key.empty()) + { + settings.ostr << (settings.hilite ? hilite_none : "") << " KEY " << delete_key; + if (delete_offset.has_value()) + settings.ostr << (settings.hilite ? hilite_none : "") << " OFFSET " << delete_offset.value(); + } + } } else if (type == Type::UNFREEZE) { diff --git a/src/Parsers/ASTSystemQuery.h b/src/Parsers/ASTSystemQuery.h index ca4802d9a9b..ebc3e9cd430 100644 --- a/src/Parsers/ASTSystemQuery.h +++ b/src/Parsers/ASTSystemQuery.h @@ -107,6 +107,8 @@ public: UInt64 seconds{}; String filesystem_cache_name; + std::string delete_key; + std::optional delete_offset; String backup_name; diff --git a/src/Parsers/MySQL/tests/gtest_create_parser.cpp b/src/Parsers/MySQL/tests/gtest_create_parser.cpp index 554b3f0a67d..2f65eb6e592 100644 --- a/src/Parsers/MySQL/tests/gtest_create_parser.cpp +++ b/src/Parsers/MySQL/tests/gtest_create_parser.cpp @@ -40,5 +40,5 @@ TEST(CreateTableParser, SS) ASTPtr ast = parseQuery(p_create_query, input.data(), input.data() + input.size(), "", 0, 0); WriteBufferFromOStream buf(std::cerr, 4096); ast->dumpTree(buf); - + buf.finalize(); } diff --git a/src/Parsers/ParserSetQuery.cpp b/src/Parsers/ParserSetQuery.cpp index 4df74c2dd82..727d037112f 100644 --- a/src/Parsers/ParserSetQuery.cpp +++ b/src/Parsers/ParserSetQuery.cpp @@ -215,7 +215,7 @@ bool ParserSetQuery::parseNameValuePair(SettingChange & change, IParser::Pos & p else if (ParserKeyword("FALSE").ignore(pos, expected)) value = std::make_shared(Field(static_cast(0))); /// for SETTINGS disk=disk(type='s3', path='', ...) - else if (function_p.parse(pos, function_ast, expected) && function_ast->as()->name == "disk") + else if (function_p.parse(pos, function_ast, expected) && function_ast->as()->name.starts_with("disk")) { tryGetIdentifierNameInto(name, change.name); change.value = createFieldFromAST(function_ast); @@ -280,7 +280,7 @@ bool ParserSetQuery::parseNameValuePairWithParameterOrDefault( node = std::make_shared(Field(static_cast(1))); else if (ParserKeyword("FALSE").ignore(pos, expected)) node = std::make_shared(Field(static_cast(0))); - else if (function_p.parse(pos, function_ast, expected) && function_ast->as()->name == "disk") + else if (function_p.parse(pos, function_ast, expected) && function_ast->as()->name.starts_with("disk")) { change.name = name; change.value = createFieldFromAST(function_ast); diff --git a/src/Parsers/ParserSystemQuery.cpp b/src/Parsers/ParserSystemQuery.cpp index 48dbe60e241..ef71e994d56 100644 --- a/src/Parsers/ParserSystemQuery.cpp +++ b/src/Parsers/ParserSystemQuery.cpp @@ -405,7 +405,15 @@ bool ParserSystemQuery::parseImpl(IParser::Pos & pos, ASTPtr & node, Expected & ParserLiteral path_parser; ASTPtr ast; if (path_parser.parse(pos, ast, expected)) + { res->filesystem_cache_name = ast->as()->value.safeGet(); + if (ParserKeyword{"KEY"}.ignore(pos, expected) && ParserIdentifier().parse(pos, ast, expected)) + { + res->delete_key = ast->as()->name(); + if (ParserKeyword{"OFFSET"}.ignore(pos, expected) && ParserLiteral().parse(pos, ast, expected)) + res->delete_offset = ast->as()->value.safeGet(); + } + } if (!parseQueryWithOnCluster(res, pos, expected)) return false; break; diff --git a/src/Parsers/isDiskFunction.cpp b/src/Parsers/isDiskFunction.cpp index e60229cb3f7..5ba626a8b2c 100644 --- a/src/Parsers/isDiskFunction.cpp +++ b/src/Parsers/isDiskFunction.cpp @@ -10,7 +10,7 @@ bool isDiskFunction(ASTPtr ast) return false; const auto * function = ast->as(); - return function && function->name == "disk" && function->arguments->as(); + return function && function->name.starts_with("disk") && function->arguments->as(); } } diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.h b/src/Processors/Formats/Impl/RegexpRowInputFormat.h index d6696ffe751..2469774aaf9 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.h +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include #include #include @@ -28,14 +27,14 @@ public: /// Return true if row was successfully parsed and row fields were extracted. bool parseRow(PeekableReadBuffer & buf); - re2_st::StringPiece getField(size_t index) { return matched_fields[index]; } + std::string_view getField(size_t index) { return matched_fields[index]; } size_t getMatchedFieldsSize() const { return matched_fields.size(); } size_t getNumberOfGroups() const { return regexp.NumberOfCapturingGroups(); } private: const re2_st::RE2 regexp; // The vector of fields extracted from line using regexp. - std::vector matched_fields; + std::vector matched_fields; // These two vectors are needed to use RE2::FullMatchN (function for extracting fields). std::vector re2_arguments; std::vector re2_arguments_ptrs; diff --git a/src/Processors/QueryPlan/Optimizations/Optimizations.h b/src/Processors/QueryPlan/Optimizations/Optimizations.h index 2b934ec440b..bc47413cbb5 100644 --- a/src/Processors/QueryPlan/Optimizations/Optimizations.h +++ b/src/Processors/QueryPlan/Optimizations/Optimizations.h @@ -14,6 +14,9 @@ namespace QueryPlanOptimizations void optimizeTreeFirstPass(const QueryPlanOptimizationSettings & settings, QueryPlan::Node & root, QueryPlan::Nodes & nodes); /// Second pass is used to apply read-in-order and attach a predicate to PK. void optimizeTreeSecondPass(const QueryPlanOptimizationSettings & optimization_settings, QueryPlan::Node & root, QueryPlan::Nodes & nodes); +/// Third pass is used to apply filters such as key conditions and skip indexes to the storages that support them. +/// After that it add CreateSetsStep for the subqueries that has not be used in the filters. +void optimizeTreeThirdPass(QueryPlan::Node & root, QueryPlan::Nodes & nodes); /// Optimization (first pass) is a function applied to QueryPlan::Node. /// It can read and update subtree of specified node. diff --git a/src/Processors/QueryPlan/Optimizations/optimizePrimaryKeyCondition.cpp b/src/Processors/QueryPlan/Optimizations/optimizePrimaryKeyCondition.cpp index 5ef786ff975..e98386a6ee9 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizePrimaryKeyCondition.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizePrimaryKeyCondition.cpp @@ -28,8 +28,6 @@ void optimizePrimaryKeyCondition(const Stack & stack) else break; } - - source_step_with_filter->onAddFilterFinish(); } } diff --git a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp index d620170e29c..e788918703e 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp @@ -167,7 +167,6 @@ void optimizeTreeSecondPass(const QueryPlanOptimizationSettings & optimization_s optimizePrewhere(stack, nodes); optimizePrimaryKeyCondition(stack); enableMemoryBoundMerging(*stack.back().node, nodes); - addPlansForSets(*stack.back().node, nodes); stack.pop_back(); } @@ -178,5 +177,35 @@ void optimizeTreeSecondPass(const QueryPlanOptimizationSettings & optimization_s "No projection is used when optimize_use_projections = 1 and force_optimize_projection = 1"); } +void optimizeTreeThirdPass(QueryPlan::Node & root, QueryPlan::Nodes & nodes) +{ + Stack stack; + stack.push_back({.node = &root}); + + while (!stack.empty()) + { + /// NOTE: frame cannot be safely used after stack was modified. + auto & frame = stack.back(); + + /// Traverse all children first. + if (frame.next_child < frame.node->children.size()) + { + auto next_frame = Frame{.node = frame.node->children[frame.next_child]}; + ++frame.next_child; + stack.push_back(next_frame); + continue; + } + + if (auto * source_step_with_filter = dynamic_cast(frame.node->step.get())) + { + source_step_with_filter->applyFilters(); + } + + addPlansForSets(*frame.node, nodes); + + stack.pop_back(); + } +} + } } diff --git a/src/Processors/QueryPlan/QueryPlan.cpp b/src/Processors/QueryPlan/QueryPlan.cpp index cb732e58855..687260441ff 100644 --- a/src/Processors/QueryPlan/QueryPlan.cpp +++ b/src/Processors/QueryPlan/QueryPlan.cpp @@ -482,6 +482,7 @@ void QueryPlan::optimize(const QueryPlanOptimizationSettings & optimization_sett QueryPlanOptimizations::optimizeTreeFirstPass(optimization_settings, *root, nodes); QueryPlanOptimizations::optimizeTreeSecondPass(optimization_settings, *root, nodes); + QueryPlanOptimizations::optimizeTreeThirdPass(*root, nodes); updateDataStreams(*root); } diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 1e21d13e2b1..e7bf1ce2edf 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -1299,13 +1299,10 @@ static void buildIndexes( indexes->skip_indexes = std::move(skip_indexes); } -void ReadFromMergeTree::onAddFilterFinish() +void ReadFromMergeTree::applyFilters() { - if (!filter_nodes.nodes.empty()) - { - auto filter_actions_dag = buildFilterDAG(context, prewhere_info, filter_nodes, query_info); - buildIndexes(indexes, filter_actions_dag, data, context, query_info, metadata_for_reading); - } + auto filter_actions_dag = buildFilterDAG(context, prewhere_info, filter_nodes, query_info); + buildIndexes(indexes, filter_actions_dag, data, context, query_info, metadata_for_reading); } MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead( diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.h b/src/Processors/QueryPlan/ReadFromMergeTree.h index 06021cb1c5b..3e3edd4dc5c 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.h +++ b/src/Processors/QueryPlan/ReadFromMergeTree.h @@ -226,7 +226,7 @@ public: size_t getNumStreams() const { return requested_num_streams; } bool isParallelReadingEnabled() const { return read_task_callback != std::nullopt; } - void onAddFilterFinish() override; + void applyFilters() override; private: static MergeTreeDataSelectAnalysisResultPtr selectRangesToReadImpl( diff --git a/src/Processors/QueryPlan/SourceStepWithFilter.h b/src/Processors/QueryPlan/SourceStepWithFilter.h index 34b6e3c6a7b..dbaff06fde6 100644 --- a/src/Processors/QueryPlan/SourceStepWithFilter.h +++ b/src/Processors/QueryPlan/SourceStepWithFilter.h @@ -37,7 +37,8 @@ public: filter_dags.push_back(std::move(filter_dag)); } - virtual void onAddFilterFinish() {} + /// Apply filters that can optimize reading from storage. + virtual void applyFilters() {} protected: std::vector filter_dags; diff --git a/src/Processors/Transforms/ExpressionTransform.cpp b/src/Processors/Transforms/ExpressionTransform.cpp index 49988932947..0d3341b000c 100644 --- a/src/Processors/Transforms/ExpressionTransform.cpp +++ b/src/Processors/Transforms/ExpressionTransform.cpp @@ -39,7 +39,6 @@ void ConvertingTransform::onConsume(Chunk chunk) expression->execute(block, num_rows); chunk.setColumns(block.getColumns(), num_rows); - chunk.setChunkInfo(chunk.getChunkInfo()); cur_chunk = std::move(chunk); } diff --git a/src/Processors/Transforms/buildPushingToViewsChain.cpp b/src/Processors/Transforms/buildPushingToViewsChain.cpp index 43085690519..7f7f9058f1b 100644 --- a/src/Processors/Transforms/buildPushingToViewsChain.cpp +++ b/src/Processors/Transforms/buildPushingToViewsChain.cpp @@ -282,7 +282,7 @@ Chain buildPushingToViewsChain( auto * original_thread = current_thread; SCOPE_EXIT({ current_thread = original_thread; }); - std::unique_ptr view_thread_status_ptr = std::make_unique(); + std::unique_ptr view_thread_status_ptr = std::make_unique(/*check_current_thread_on_destruction=*/ false); /// Copy of a ThreadStatus should be internal. view_thread_status_ptr->setInternalThread(); view_thread_status_ptr->attachToGroup(running_group); diff --git a/src/QueryPipeline/RemoteQueryExecutor.cpp b/src/QueryPipeline/RemoteQueryExecutor.cpp index 233dfe13339..cd6f65b7b43 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.cpp +++ b/src/QueryPipeline/RemoteQueryExecutor.cpp @@ -434,11 +434,13 @@ RemoteQueryExecutor::ReadResult RemoteQueryExecutor::processPacket(Packet packet switch (packet.type) { case Protocol::Server::MergeTreeReadTaskRequest: - processMergeTreeReadTaskRequest(packet.request); + chassert(packet.request.has_value()); + processMergeTreeReadTaskRequest(packet.request.value()); return ReadResult(ReadResult::Type::ParallelReplicasToken); case Protocol::Server::MergeTreeAllRangesAnnounecement: - processMergeTreeInitialReadAnnounecement(packet.announcement); + chassert(packet.announcement.has_value()); + processMergeTreeInitialReadAnnounecement(packet.announcement.value()); return ReadResult(ReadResult::Type::ParallelReplicasToken); case Protocol::Server::ReadTaskRequest: diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index fe98ae5f69e..42459340c57 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -44,6 +44,8 @@ #include #include +#include + #include #include @@ -1163,8 +1165,8 @@ void PredefinedQueryHandler::customizeContext(HTTPServerRequest & request, Conte { int num_captures = compiled_regex->NumberOfCapturingGroups() + 1; - re2::StringPiece matches[num_captures]; - re2::StringPiece input(begin, end - begin); + std::string_view matches[num_captures]; + std::string_view input(begin, end - begin); if (compiled_regex->Match(input, 0, end - begin, re2::RE2::Anchor::ANCHOR_BOTH, matches, num_captures)) { for (const auto & [capturing_name, capturing_index] : compiled_regex->NamedCapturingGroups()) diff --git a/src/Server/HTTPHandlerRequestFilter.h b/src/Server/HTTPHandlerRequestFilter.h index c6bcdb211e1..25cbb950871 100644 --- a/src/Server/HTTPHandlerRequestFilter.h +++ b/src/Server/HTTPHandlerRequestFilter.h @@ -6,7 +6,6 @@ #include #include -#include #include #include @@ -26,9 +25,8 @@ static inline bool checkRegexExpression(std::string_view match_str, const Compil { int num_captures = compiled_regex->NumberOfCapturingGroups() + 1; - re2::StringPiece matches[num_captures]; - re2::StringPiece match_input(match_str.data(), match_str.size()); - return compiled_regex->Match(match_input, 0, match_str.size(), re2::RE2::Anchor::ANCHOR_BOTH, matches, num_captures); + std::string_view matches[num_captures]; + return compiled_regex->Match({match_str.data(), match_str.size()}, 0, match_str.size(), re2::RE2::Anchor::ANCHOR_BOTH, matches, num_captures); } static inline bool checkExpression(std::string_view match_str, const std::pair & expression) diff --git a/src/Server/TCPHandler.h b/src/Server/TCPHandler.h index 629738fa16f..235f634afec 100644 --- a/src/Server/TCPHandler.h +++ b/src/Server/TCPHandler.h @@ -83,8 +83,6 @@ struct QueryState NOT_CANCELLED }; - static std::string cancellationStatusToName(CancellationStatus status); - /// Is request cancelled CancellationStatus cancellation_status = CancellationStatus::NOT_CANCELLED; bool is_connection_closed = false; diff --git a/src/Storages/Distributed/DistributedAsyncInsertHeader.cpp b/src/Storages/Distributed/DistributedAsyncInsertHeader.cpp index 018c1d863bb..a8ed89e66f1 100644 --- a/src/Storages/Distributed/DistributedAsyncInsertHeader.cpp +++ b/src/Storages/Distributed/DistributedAsyncInsertHeader.cpp @@ -39,9 +39,8 @@ DistributedAsyncInsertHeader DistributedAsyncInsertHeader::read(ReadBufferFromFi if (expected_checksum != calculated_checksum) { throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH, - "Checksum of extra info doesn't match: corrupted data. Reference: {}{}. Actual: {}{}.", - getHexUIntLowercase(expected_checksum.first), getHexUIntLowercase(expected_checksum.second), - getHexUIntLowercase(calculated_checksum.first), getHexUIntLowercase(calculated_checksum.second)); + "Checksum of extra info doesn't match: corrupted data. Reference: {}. Actual: {}.", + getHexUIntLowercase(expected_checksum), getHexUIntLowercase(calculated_checksum)); } /// Read the parts of the header. diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp index 30776a8bc50..b73e2cca314 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp @@ -400,7 +400,7 @@ void DataPartStorageOnDiskBase::backup( if (it != checksums.files.end()) { file_size = it->second.file_size; - file_hash = {it->second.file_hash.first, it->second.file_hash.second}; + file_hash = it->second.file_hash; } BackupEntryPtr backup_entry = std::make_unique(disk, filepath_on_disk, copy_encrypted, file_size, file_hash); @@ -415,41 +415,41 @@ void DataPartStorageOnDiskBase::backup( MutableDataPartStoragePtr DataPartStorageOnDiskBase::freeze( const std::string & to, const std::string & dir_path, - bool make_source_readonly, std::function save_metadata_callback, - bool copy_instead_of_hardlink, - const NameSet & files_to_copy_instead_of_hardlinks, - DiskTransactionPtr external_transaction) const + const ClonePartParams & params) const { auto disk = volume->getDisk(); - if (external_transaction) - external_transaction->createDirectories(to); + if (params.external_transaction) + params.external_transaction->createDirectories(to); else disk->createDirectories(to); - localBackup(disk, getRelativePath(), fs::path(to) / dir_path, make_source_readonly, {}, copy_instead_of_hardlink, files_to_copy_instead_of_hardlinks, external_transaction); + localBackup(disk, getRelativePath(), fs::path(to) / dir_path, params.make_source_readonly, {}, params.copy_instead_of_hardlink, + params.files_to_copy_instead_of_hardlinks, params.external_transaction); if (save_metadata_callback) save_metadata_callback(disk); - if (external_transaction) + if (params.external_transaction) { - external_transaction->removeFileIfExists(fs::path(to) / dir_path / "delete-on-destroy.txt"); - external_transaction->removeFileIfExists(fs::path(to) / dir_path / "txn_version.txt"); - external_transaction->removeFileIfExists(fs::path(to) / dir_path / IMergeTreeDataPart::METADATA_VERSION_FILE_NAME); + params.external_transaction->removeFileIfExists(fs::path(to) / dir_path / "delete-on-destroy.txt"); + params.external_transaction->removeFileIfExists(fs::path(to) / dir_path / "txn_version.txt"); + if (!params.keep_metadata_version) + params.external_transaction->removeFileIfExists(fs::path(to) / dir_path / IMergeTreeDataPart::METADATA_VERSION_FILE_NAME); } else { disk->removeFileIfExists(fs::path(to) / dir_path / "delete-on-destroy.txt"); disk->removeFileIfExists(fs::path(to) / dir_path / "txn_version.txt"); - disk->removeFileIfExists(fs::path(to) / dir_path / IMergeTreeDataPart::METADATA_VERSION_FILE_NAME); + if (!params.keep_metadata_version) + disk->removeFileIfExists(fs::path(to) / dir_path / IMergeTreeDataPart::METADATA_VERSION_FILE_NAME); } auto single_disk_volume = std::make_shared(disk->getName(), disk, 0); /// Do not initialize storage in case of DETACH because part may be broken. bool to_detached = dir_path.starts_with("detached/"); - return create(single_disk_volume, to, dir_path, /*initialize=*/ !to_detached && !external_transaction); + return create(single_disk_volume, to, dir_path, /*initialize=*/ !to_detached && !params.external_transaction); } MutableDataPartStoragePtr DataPartStorageOnDiskBase::clonePart( diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h index 043953eb20c..5f7dcc3fd32 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h @@ -62,11 +62,8 @@ public: MutableDataPartStoragePtr freeze( const std::string & to, const std::string & dir_path, - bool make_source_readonly, std::function save_metadata_callback, - bool copy_instead_of_hardlink, - const NameSet & files_to_copy_instead_of_hardlinks, - DiskTransactionPtr external_transaction) const override; + const ClonePartParams & params) const override; MutableDataPartStoragePtr clonePart( const std::string & to, diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskFull.cpp b/src/Storages/MergeTree/DataPartStorageOnDiskFull.cpp index e62e5a5d6b2..20b6c5a919e 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskFull.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDiskFull.cpp @@ -80,6 +80,11 @@ DataPartStorageIteratorPtr DataPartStorageOnDiskFull::iterate() const volume->getDisk()->iterateDirectory(fs::path(root_path) / part_dir)); } +Poco::Timestamp DataPartStorageOnDiskFull::getFileLastModified(const String & file_name) const +{ + return volume->getDisk()->getLastModified(fs::path(root_path) / part_dir / file_name); +} + size_t DataPartStorageOnDiskFull::getFileSize(const String & file_name) const { return volume->getDisk()->getFileSize(fs::path(root_path) / part_dir / file_name); diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskFull.h b/src/Storages/MergeTree/DataPartStorageOnDiskFull.h index 2ceb392e80f..5d70404fcfa 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskFull.h +++ b/src/Storages/MergeTree/DataPartStorageOnDiskFull.h @@ -20,6 +20,7 @@ public: bool isDirectory(const std::string & name) const override; DataPartStorageIteratorPtr iterate() const override; + Poco::Timestamp getFileLastModified(const String & file_name) const override; size_t getFileSize(const std::string & file_name) const override; UInt32 getRefCount(const std::string & file_name) const override; std::string getRemotePath(const std::string & file_name) const override; diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h index 9d6c5d0dcba..8dbf5caa168 100644 --- a/src/Storages/MergeTree/IDataPartStorage.h +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -63,6 +63,9 @@ using DiskPtr = std::shared_ptr; class ISyncGuard; using SyncGuardPtr = std::unique_ptr; +class MergeTreeTransaction; +using MergeTreeTransactionPtr = std::shared_ptr; + class IBackupEntry; using BackupEntryPtr = std::shared_ptr; using BackupEntries = std::vector>; @@ -72,6 +75,17 @@ struct WriteSettings; class TemporaryFileOnDisk; + +struct HardlinkedFiles +{ + /// Shared table uuid where hardlinks live + std::string source_table_shared_id; + /// Hardlinked from part + std::string source_part_name; + /// Hardlinked files list + NameSet hardlinks_from_source_part; +}; + /// This is an abstraction of storage for data part files. /// Ideally, it is assumed to contain read-only methods from IDisk. /// It is not fulfilled now, but let's try our best. @@ -108,6 +122,7 @@ public: virtual DataPartStorageIteratorPtr iterate() const = 0; /// Get metadata for a file inside path dir. + virtual Poco::Timestamp getFileLastModified(const std::string & file_name) const = 0; virtual size_t getFileSize(const std::string & file_name) const = 0; virtual UInt32 getRefCount(const std::string & file_name) const = 0; @@ -220,14 +235,24 @@ public: /// If `external_transaction` is provided, the disk operations (creating directories, hardlinking, /// etc) won't be applied immediately; instead, they'll be added to external_transaction, which the /// caller then needs to commit. + + struct ClonePartParams + { + MergeTreeTransactionPtr txn = NO_TRANSACTION_PTR; + HardlinkedFiles * hardlinked_files = nullptr; + bool copy_instead_of_hardlink = false; + NameSet files_to_copy_instead_of_hardlinks; + bool keep_metadata_version = false; + bool make_source_readonly = false; + DiskTransactionPtr external_transaction = nullptr; + std::optional metadata_version_to_write = std::nullopt; + }; + virtual std::shared_ptr freeze( const std::string & to, const std::string & dir_path, - bool make_source_readonly, std::function save_metadata_callback, - bool copy_instead_of_hardlink, - const NameSet & files_to_copy_instead_of_hardlinks, - DiskTransactionPtr external_transaction = nullptr) const = 0; + const ClonePartParams & params) const = 0; /// Make a full copy of a data part into 'to/dir_path' (possibly to a different disk). virtual std::shared_ptr clonePart( diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index cf2d1b19326..b9591864869 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1362,6 +1362,10 @@ void IMergeTreeDataPart::loadColumns(bool require) else { loaded_metadata_version = metadata_snapshot->getMetadataVersion(); + old_part_with_no_metadata_version_on_disk = true; + if (storage.supportsReplication()) + LOG_WARNING(storage.log, "Part {} doesn't have metadata version on disk, setting it to {}. " + "It's okay if the part was created by an old version of ClickHouse", name, loaded_metadata_version); } setColumns(loaded_columns, infos, loaded_metadata_version); @@ -1764,12 +1768,6 @@ void IMergeTreeDataPart::renameToDetached(const String & prefix) DataPartStoragePtr IMergeTreeDataPart::makeCloneInDetached(const String & prefix, const StorageMetadataPtr & /*metadata_snapshot*/) const { - auto storage_settings = storage.getSettings(); - - /// In case of zero-copy replication we copy directory instead of hardlinks - /// because hardlinks tracking doesn't work for detached parts. - bool copy_instead_of_hardlink = isStoredOnRemoteDiskWithZeroCopySupport() && storage.supportsReplication() && storage_settings->allow_remote_fs_zero_copy_replication; - /// Avoid unneeded duplicates of broken parts if we try to detach the same broken part multiple times. /// Otherwise it may pollute detached/ with dirs with _tryN suffix and we will fail to remove broken part after 10 attempts. bool broken = !prefix.empty(); @@ -1777,13 +1775,19 @@ DataPartStoragePtr IMergeTreeDataPart::makeCloneInDetached(const String & prefix if (!maybe_path_in_detached) return nullptr; + /// In case of zero-copy replication we copy directory instead of hardlinks + /// because hardlinks tracking doesn't work for detached parts. + auto storage_settings = storage.getSettings(); + IDataPartStorage::ClonePartParams params + { + .copy_instead_of_hardlink = isStoredOnRemoteDiskWithZeroCopySupport() && storage.supportsReplication() && storage_settings->allow_remote_fs_zero_copy_replication, + .make_source_readonly = true + }; return getDataPartStorage().freeze( storage.relative_data_path, *maybe_path_in_detached, - /*make_source_readonly=*/ true, /*save_metadata_callback=*/ {}, - copy_instead_of_hardlink, - /*files_to_copy_instead_of_hardlinks=*/ {}); + params); } MutableDataPartStoragePtr IMergeTreeDataPart::makeCloneOnDisk(const DiskPtr & disk, const String & directory_name) const diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index fd73d802579..3427ac2db68 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -116,6 +116,8 @@ public: /// Otherwise return information about column size on disk. ColumnSize getColumnSize(const String & column_name) const; + virtual std::optional getColumnModificationTime(const String & column_name) const = 0; + /// NOTE: Returns zeros if secondary indexes are not found in checksums. /// Otherwise return information about secondary index size on disk. IndexSize getSecondaryIndexSize(const String & secondary_index_name) const; @@ -249,6 +251,9 @@ public: /// Flag for keep S3 data when zero-copy replication over S3 turned on. mutable bool force_keep_shared_data = false; + /// Some old parts don't have metadata version, so we set it to the current table's version when loading the part + bool old_part_with_no_metadata_version_on_disk = false; + using TTLInfo = MergeTreeDataPartTTLInfo; using TTLInfos = MergeTreeDataPartTTLInfos; diff --git a/src/Storages/MergeTree/MergeFromLogEntryTask.cpp b/src/Storages/MergeTree/MergeFromLogEntryTask.cpp index f29d37312f9..17582e7df98 100644 --- a/src/Storages/MergeTree/MergeFromLogEntryTask.cpp +++ b/src/Storages/MergeTree/MergeFromLogEntryTask.cpp @@ -392,7 +392,7 @@ bool MergeFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWrite /** With `ZSESSIONEXPIRED` or `ZOPERATIONTIMEOUT`, we can inadvertently roll back local changes to the parts. * This is not a problem, because in this case the merge will remain in the queue, and we will try again. */ - storage.merge_selecting_task->schedule(); + finish_callback = [storage_ptr = &storage]() { storage_ptr->merge_selecting_task->schedule(); }; ProfileEvents::increment(ProfileEvents::ReplicatedPartMerges); write_part_log({}); diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp index b569fa73a73..d4f8d1140a2 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp @@ -182,7 +182,6 @@ void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(Stora } catch (...) { - printExceptionWithRespectToAbort(log); pending.remove(id); } diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index e9c3a7f66ae..fa9bfd38a23 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -4529,9 +4529,8 @@ MergeTreeData::DataPartPtr MergeTreeData::getActiveContainingPart( } -void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy) +void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy, DataPartsLock &) { - auto lock = lockParts(); for (auto original_active_part : getDataPartsStateRange(DataPartState::Active)) // NOLINT (copy is intended) { if (part_copy->name == original_active_part->name) @@ -4587,6 +4586,12 @@ MergeTreeData::DataPartPtr MergeTreeData::getActiveContainingPart(const String & return getActiveContainingPart(part_info); } +MergeTreeData::DataPartPtr MergeTreeData::getActiveContainingPart(const String & part_name, DataPartsLock & lock) const +{ + auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version); + return getActiveContainingPart(part_info, DataPartState::Active, lock); +} + MergeTreeData::DataPartsVector MergeTreeData::getVisibleDataPartsVectorInPartition(ContextPtr local_context, const String & partition_id) const { return getVisibleDataPartsVectorInPartition(local_context->getCurrentTransaction().get(), partition_id); @@ -4707,12 +4712,19 @@ MergeTreeData::DataPartPtr MergeTreeData::getPartIfExistsUnlocked(const MergeTre return nullptr; } -static void loadPartAndFixMetadataImpl(MergeTreeData::MutableDataPartPtr part) +static void loadPartAndFixMetadataImpl(MergeTreeData::MutableDataPartPtr part, ContextPtr local_context, int32_t metadata_version, bool sync) { /// Remove metadata version file and take it from table. /// Currently we cannot attach parts with different schema, so /// we can assume that it's equal to table's current schema. part->removeMetadataVersion(); + { + auto out_metadata = part->getDataPartStorage().writeFile(IMergeTreeDataPart::METADATA_VERSION_FILE_NAME, 4096, local_context->getWriteSettings()); + writeText(metadata_version, *out_metadata); + out_metadata->finalize(); + if (sync) + out_metadata->sync(); + } part->loadColumnsChecksumsIndexes(false, true); part->modification_time = part->getDataPartStorage().getLastModified().epochTime(); @@ -5844,7 +5856,7 @@ MergeTreeData::MutableDataPartsVector MergeTreeData::tryLoadPartsToAttach(const .withPartFormatFromDisk() .build(); - loadPartAndFixMetadataImpl(part); + loadPartAndFixMetadataImpl(part, local_context, getInMemoryMetadataPtr()->getMetadataVersion(), getSettings()->fsync_after_insert); loaded_parts.push_back(part); } @@ -7184,7 +7196,10 @@ QueryProcessingStage::Enum MergeTreeData::getQueryProcessingStage( if (query_context->canUseParallelReplicasOnInitiator() && to_stage >= QueryProcessingStage::WithMergeableState) { if (!canUseParallelReplicasBasedOnPKAnalysis(query_context, storage_snapshot, query_info)) + { + query_info.parallel_replicas_disabled = true; return QueryProcessingStage::Enum::FetchColumns; + } /// ReplicatedMergeTree if (supportsReplication()) @@ -7289,10 +7304,7 @@ std::pair MergeTreeData::cloneAn const String & tmp_part_prefix, const MergeTreePartInfo & dst_part_info, const StorageMetadataPtr & metadata_snapshot, - const MergeTreeTransactionPtr & txn, - HardlinkedFiles * hardlinked_files, - bool copy_instead_of_hardlink, - const NameSet & files_to_copy_instead_of_hardlinks) + const IDataPartStorage::ClonePartParams & params) { /// Check that the storage policy contains the disk where the src_part is located. bool does_storage_policy_allow_same_disk = false; @@ -7343,16 +7355,24 @@ std::pair MergeTreeData::cloneAn } String with_copy; - if (copy_instead_of_hardlink) + if (params.copy_instead_of_hardlink) with_copy = " (copying data)"; auto dst_part_storage = src_part_storage->freeze( relative_data_path, tmp_dst_part_name, - /*make_source_readonly=*/ false, /*save_metadata_callback=*/ {}, - copy_instead_of_hardlink, - files_to_copy_instead_of_hardlinks); + params); + + if (params.metadata_version_to_write.has_value()) + { + chassert(!params.keep_metadata_version); + auto out_metadata = dst_part_storage->writeFile(IMergeTreeDataPart::METADATA_VERSION_FILE_NAME, 4096, getContext()->getWriteSettings()); + writeText(metadata_snapshot->getMetadataVersion(), *out_metadata); + out_metadata->finalize(); + if (getSettings()->fsync_after_insert) + out_metadata->sync(); + } LOG_DEBUG(log, "Clone{} part {} to {}{}", src_flushed_tmp_part ? " flushed" : "", @@ -7364,18 +7384,18 @@ std::pair MergeTreeData::cloneAn .withPartFormatFromDisk() .build(); - if (!copy_instead_of_hardlink && hardlinked_files) + if (!params.copy_instead_of_hardlink && params.hardlinked_files) { - hardlinked_files->source_part_name = src_part->name; - hardlinked_files->source_table_shared_id = src_part->storage.getTableSharedID(); + params.hardlinked_files->source_part_name = src_part->name; + params.hardlinked_files->source_table_shared_id = src_part->storage.getTableSharedID(); for (auto it = src_part->getDataPartStorage().iterate(); it->isValid(); it->next()) { - if (!files_to_copy_instead_of_hardlinks.contains(it->name()) + if (!params.files_to_copy_instead_of_hardlinks.contains(it->name()) && it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME_DEPRECATED && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME) { - hardlinked_files->hardlinks_from_source_part.insert(it->name()); + params.hardlinked_files->hardlinks_from_source_part.insert(it->name()); } } @@ -7386,18 +7406,18 @@ std::pair MergeTreeData::cloneAn for (auto it = projection_storage.iterate(); it->isValid(); it->next()) { auto file_name_with_projection_prefix = fs::path(projection_storage.getPartDirectory()) / it->name(); - if (!files_to_copy_instead_of_hardlinks.contains(file_name_with_projection_prefix) + if (!params.files_to_copy_instead_of_hardlinks.contains(file_name_with_projection_prefix) && it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME_DEPRECATED && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME) { - hardlinked_files->hardlinks_from_source_part.insert(file_name_with_projection_prefix); + params.hardlinked_files->hardlinks_from_source_part.insert(file_name_with_projection_prefix); } } } } /// We should write version metadata on part creation to distinguish it from parts that were created without transaction. - TransactionID tid = txn ? txn->tid : Tx::PrehistoricTID; + TransactionID tid = params.txn ? params.txn->tid : Tx::PrehistoricTID; dst_data_part->version.setCreationTID(tid, nullptr); dst_data_part->storeVersionMetadata(); @@ -7579,13 +7599,15 @@ PartitionCommandsResultInfo MergeTreeData::freezePartitionsByMatcher( createAndStoreFreezeMetadata(disk, part, fs::path(backup_part_path) / part->getDataPartStorage().getPartDirectory()); }; + IDataPartStorage::ClonePartParams params + { + .make_source_readonly = true + }; auto new_storage = data_part_storage->freeze( backup_part_path, part->getDataPartStorage().getPartDirectory(), - /*make_source_readonly=*/ true, callback, - /*copy_instead_of_hardlink=*/ false, - /*files_to_copy_instead_of_hardlinks=*/ {}); + params); part->is_frozen.store(true, std::memory_order_relaxed); result.push_back(PartitionCommandResultInfo{ diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index b27392b355b..8c379af193d 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -504,12 +504,13 @@ public: /// Returns a part in Active state with the given name or a part containing it. If there is no such part, returns nullptr. DataPartPtr getActiveContainingPart(const String & part_name) const; + DataPartPtr getActiveContainingPart(const String & part_name, DataPartsLock & lock) const; DataPartPtr getActiveContainingPart(const MergeTreePartInfo & part_info) const; DataPartPtr getActiveContainingPart(const MergeTreePartInfo & part_info, DataPartState state, DataPartsLock & lock) const; /// Swap part with it's identical copy (possible with another path on another disk). /// If original part is not active or doesn't exist exception will be thrown. - void swapActivePart(MergeTreeData::DataPartPtr part_copy); + void swapActivePart(MergeTreeData::DataPartPtr part_copy, DataPartsLock &); /// Returns all parts in specified partition DataPartsVector getVisibleDataPartsVectorInPartition(MergeTreeTransaction * txn, const String & partition_id, DataPartsLock * acquired_lock = nullptr) const; @@ -828,21 +829,10 @@ public: MergeTreeData & checkStructureAndGetMergeTreeData(const StoragePtr & source_table, const StorageMetadataPtr & src_snapshot, const StorageMetadataPtr & my_snapshot) const; MergeTreeData & checkStructureAndGetMergeTreeData(IStorage & source_table, const StorageMetadataPtr & src_snapshot, const StorageMetadataPtr & my_snapshot) const; - struct HardlinkedFiles - { - /// Shared table uuid where hardlinks live - std::string source_table_shared_id; - /// Hardlinked from part - std::string source_part_name; - /// Hardlinked files list - NameSet hardlinks_from_source_part; - }; - std::pair cloneAndLoadDataPartOnSameDisk( const MergeTreeData::DataPartPtr & src_part, const String & tmp_part_prefix, const MergeTreePartInfo & dst_part_info, const StorageMetadataPtr & metadata_snapshot, - const MergeTreeTransactionPtr & txn, HardlinkedFiles * hardlinked_files, - bool copy_instead_of_hardlink, const NameSet & files_to_copy_instead_of_hardlinks); + const IDataPartStorage::ClonePartParams & params); virtual std::vector getMutationsStatus() const = 0; diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index 29a1574b66e..c0acc401506 100644 --- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -239,8 +239,9 @@ MergeTreeDataMergerMutator::PartitionIdsHint MergeTreeDataMergerMutator::getPart if (!best_partition_id_to_optimize.empty()) res.emplace(std::move(best_partition_id_to_optimize)); - LOG_TRACE(log, "Checked {} partitions, found {} partitions with parts that may be merged: {}", - all_partition_ids.size(), res.size(), fmt::join(res, ", ")); + LOG_TRACE(log, "Checked {} partitions, found {} partitions with parts that may be merged: [{}]" + "(max_total_size_to_merge={}, merge_with_ttl_allowed{})", + all_partition_ids.size(), res.size(), fmt::join(res, ", "), max_total_size_to_merge, merge_with_ttl_allowed); return res; } diff --git a/src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp b/src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp index 78f68ea72fe..6628cd68eaf 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp @@ -154,9 +154,9 @@ bool MergeTreeDataPartChecksums::readV2(ReadBuffer & in) assertString("\n\tsize: ", in); readText(sum.file_size, in); assertString("\n\thash: ", in); - readText(sum.file_hash.first, in); + readText(sum.file_hash.low64, in); assertString(" ", in); - readText(sum.file_hash.second, in); + readText(sum.file_hash.high64, in); assertString("\n\tcompressed: ", in); readText(sum.is_compressed, in); if (sum.is_compressed) @@ -164,9 +164,9 @@ bool MergeTreeDataPartChecksums::readV2(ReadBuffer & in) assertString("\n\tuncompressed size: ", in); readText(sum.uncompressed_size, in); assertString("\n\tuncompressed hash: ", in); - readText(sum.uncompressed_hash.first, in); + readText(sum.uncompressed_hash.low64, in); assertString(" ", in); - readText(sum.uncompressed_hash.second, in); + readText(sum.uncompressed_hash.high64, in); } assertChar('\n', in); @@ -307,19 +307,7 @@ static void updateHash(SipHash & hash, const std::string & data) /// Hash is the same as MinimalisticDataPartChecksums::hash_of_all_files String MergeTreeDataPartChecksums::getTotalChecksumHex() const { - SipHash hash_of_all_files; - - for (const auto & [name, checksum] : files) - { - updateHash(hash_of_all_files, name); - hash_of_all_files.update(checksum.file_hash); - } - - UInt64 lo; - UInt64 hi; - hash_of_all_files.get128(lo, hi); - - return getHexUIntUppercase(hi) + getHexUIntUppercase(lo); + return getHexUIntUppercase(getTotalChecksumUInt128()); } MergeTreeDataPartChecksums::Checksum::uint128 MergeTreeDataPartChecksums::getTotalChecksumUInt128() const diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp index 07e20f16a9f..9c47608e364 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp @@ -144,6 +144,11 @@ bool MergeTreeDataPartCompact::hasColumnFiles(const NameAndTypePair & column) co return (bin_checksum != checksums.files.end() && mrk_checksum != checksums.files.end()); } +std::optional MergeTreeDataPartCompact::getColumnModificationTime(const String & /* column_name */) const +{ + return getDataPartStorage().getFileLastModified(DATA_FILE_NAME_WITH_EXTENSION).epochTime(); +} + void MergeTreeDataPartCompact::checkConsistency(bool require_part_metadata) const { checkConsistencyBase(); diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.h b/src/Storages/MergeTree/MergeTreeDataPartCompact.h index b115692a7cf..08764eedb43 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.h @@ -55,6 +55,8 @@ public: bool hasColumnFiles(const NameAndTypePair & column) const override; + std::optional getColumnModificationTime(const String & column_name) const override; + String getFileNameForColumn(const NameAndTypePair & /* column */) const override { return DATA_FILE_NAME; } ~MergeTreeDataPartCompact() override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartInMemory.h b/src/Storages/MergeTree/MergeTreeDataPartInMemory.h index db7244d8e99..2698b69b38e 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartInMemory.h +++ b/src/Storages/MergeTree/MergeTreeDataPartInMemory.h @@ -43,6 +43,7 @@ public: String getFileNameForColumn(const NameAndTypePair & /* column */) const override { return ""; } void renameTo(const String & new_relative_path, bool remove_new_dir_if_exists) override; DataPartStoragePtr makeCloneInDetached(const String & prefix, const StorageMetadataPtr & metadata_snapshot) const override; + std::optional getColumnModificationTime(const String & /* column_name */) const override { return {}; } MutableDataPartStoragePtr flushToDisk(const String & new_relative_path, const StorageMetadataPtr & metadata_snapshot) const; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index f44cbdd8628..2d886e2058b 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -260,6 +260,18 @@ bool MergeTreeDataPartWide::hasColumnFiles(const NameAndTypePair & column) const return res; } +std::optional MergeTreeDataPartWide::getColumnModificationTime(const String & column_name) const +{ + try + { + return getDataPartStorage().getFileLastModified(column_name + DATA_FILE_EXTENSION).epochTime(); + } + catch (const fs::filesystem_error &) + { + return {}; + } +} + String MergeTreeDataPartWide::getFileNameForColumn(const NameAndTypePair & column) const { String filename; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.h b/src/Storages/MergeTree/MergeTreeDataPartWide.h index 5ee497b9b21..0b2ffeb4b18 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.h @@ -54,6 +54,8 @@ public: bool hasColumnFiles(const NameAndTypePair & column) const override; + std::optional getColumnModificationTime(const String & column_name) const override; + protected: static void loadIndexGranularityImpl( MergeTreeIndexGranularity & index_granularity_, MergeTreeIndexGranularityInfo & index_granularity_info_, diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.h b/src/Storages/MergeTree/MergeTreeDataWriter.h index ed7151886f5..795453b2afa 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.h +++ b/src/Storages/MergeTree/MergeTreeDataWriter.h @@ -80,6 +80,11 @@ public: */ TemporaryPart writeTempPart(BlockWithPartition & block, const StorageMetadataPtr & metadata_snapshot, ContextPtr context); + MergeTreeData::MergingParams::Mode getMergingMode() const + { + return data.merging_params.mode; + } + TemporaryPart writeTempPartWithoutPrefix(BlockWithPartition & block, const StorageMetadataPtr & metadata_snapshot, int64_t block_number, ContextPtr context); /// For insertion. diff --git a/src/Storages/MergeTree/MergeTreeMarksLoader.cpp b/src/Storages/MergeTree/MergeTreeMarksLoader.cpp index 9a5576f0ad2..5c722eec380 100644 --- a/src/Storages/MergeTree/MergeTreeMarksLoader.cpp +++ b/src/Storages/MergeTree/MergeTreeMarksLoader.cpp @@ -135,6 +135,7 @@ MarkCache::MappedPtr MergeTreeMarksLoader::loadMarksImpl() if (!index_granularity_info.mark_type.adaptive) { /// Read directly to marks. + chassert(expected_uncompressed_size == plain_marks.size() * sizeof(MarkInCompressedFile)); reader->readStrict(reinterpret_cast(plain_marks.data()), expected_uncompressed_size); if (!reader->eof()) @@ -148,23 +149,25 @@ MarkCache::MappedPtr MergeTreeMarksLoader::loadMarksImpl() } else { - size_t i = 0; - size_t granularity; - while (!reader->eof()) + for (size_t i = 0; i < marks_count; ++i) { + if (reader->eof()) + throw Exception( + ErrorCodes::CANNOT_READ_ALL_DATA, + "Cannot read all marks from file {}, marks expected {} (bytes size {}), marks read {} (bytes size {})", + mrk_path, marks_count, expected_uncompressed_size, i, reader->count()); + + size_t granularity; reader->readStrict( reinterpret_cast(plain_marks.data() + i * columns_in_mark), columns_in_mark * sizeof(MarkInCompressedFile)); readIntBinary(granularity, *reader); - ++i; } - if (i * mark_size != expected_uncompressed_size) - { + if (!reader->eof()) throw Exception( ErrorCodes::CANNOT_READ_ALL_DATA, - "Cannot read all marks from file {}, marks expected {} (bytes size {}), marks read {} (bytes size {})", - mrk_path, marks_count, expected_uncompressed_size, i, reader->count()); - } + "Too many marks in file {}, marks expected {} (bytes size {})", + mrk_path, marks_count, expected_uncompressed_size); } auto res = std::make_shared(plain_marks); diff --git a/src/Storages/MergeTree/MergeTreePartsMover.cpp b/src/Storages/MergeTree/MergeTreePartsMover.cpp index 8fa4ac6c78a..a8f34ba4cec 100644 --- a/src/Storages/MergeTree/MergeTreePartsMover.cpp +++ b/src/Storages/MergeTree/MergeTreePartsMover.cpp @@ -263,7 +263,10 @@ void MergeTreePartsMover::swapClonedPart(TemporaryClonedPart & cloned_part) cons if (moves_blocker.isCancelled()) throw Exception(ErrorCodes::ABORTED, "Cancelled moving parts."); - auto active_part = data->getActiveContainingPart(cloned_part.part->name); + /// `getActiveContainingPart` and `swapActivePart` are called under the same lock + /// to prevent part becoming inactive between calls + auto part_lock = data->lockParts(); + auto active_part = data->getActiveContainingPart(cloned_part.part->name, part_lock); /// It's ok, because we don't block moving parts for merges or mutations if (!active_part || active_part->name != cloned_part.part->name) @@ -284,7 +287,7 @@ void MergeTreePartsMover::swapClonedPart(TemporaryClonedPart & cloned_part) cons cloned_part.part->renameTo(active_part->name, false); /// TODO what happen if server goes down here? - data->swapActivePart(cloned_part.part); + data->swapActivePart(cloned_part.part, part_lock); LOG_TRACE(log, "Part {} was moved to {}", cloned_part.part->name, cloned_part.part->getDataPartStorage().getFullPath()); diff --git a/src/Storages/MergeTree/MergeTreeReadPool.cpp b/src/Storages/MergeTree/MergeTreeReadPool.cpp index ba8c2c6385f..2ab90189f9d 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace ProfileEvents @@ -433,8 +434,12 @@ MergeTreeReadTaskPtr MergeTreeReadPoolParallelReplicas::getTask(size_t thread) if (buffered_ranges.empty()) { - auto result = extension.callback(ParallelReadRequest{ - .replica_num = extension.number_of_current_replica, .min_number_of_marks = min_marks_for_concurrent_read * threads}); + auto result = extension.callback(ParallelReadRequest( + CoordinationMode::Default, + extension.number_of_current_replica, + min_marks_for_concurrent_read * threads, + /// For Default coordination mode we don't need to pass part names. + RangesInDataPartsDescription{})); if (!result || result->finish) { @@ -529,12 +534,12 @@ MarkRanges MergeTreeInOrderReadPoolParallelReplicas::getNewTask(RangesInDataPart if (no_more_tasks) return {}; - auto response = extension.callback(ParallelReadRequest{ - .mode = mode, - .replica_num = extension.number_of_current_replica, - .min_number_of_marks = min_marks_for_concurrent_read * request.size(), - .description = request, - }); + auto response = extension.callback(ParallelReadRequest( + mode, + extension.number_of_current_replica, + min_marks_for_concurrent_read * request.size(), + request + )); if (!response || response->description.empty() || response->finish) { diff --git a/src/Storages/MergeTree/MergeTreeReadPool.h b/src/Storages/MergeTree/MergeTreeReadPool.h index 21273904e00..68d5438cb3d 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.h +++ b/src/Storages/MergeTree/MergeTreeReadPool.h @@ -193,10 +193,11 @@ public: predict_block_size_bytes, column_names, virtual_column_names, prewhere_info, actions_settings, reader_settings, per_part_params); - extension.all_callback({ - .description = parts_ranges.getDescriptions(), - .replica_num = extension.number_of_current_replica - }); + extension.all_callback(InitialAllRangesAnnouncement( + CoordinationMode::Default, + parts_ranges.getDescriptions(), + extension.number_of_current_replica + )); } ~MergeTreeReadPoolParallelReplicas() override; @@ -253,10 +254,11 @@ public: for (const auto & part : parts_ranges) buffered_tasks.push_back({part.data_part->info, MarkRanges{}}); - extension.all_callback({ - .description = parts_ranges.getDescriptions(), - .replica_num = extension.number_of_current_replica - }); + extension.all_callback(InitialAllRangesAnnouncement( + mode, + parts_ranges.getDescriptions(), + extension.number_of_current_replica + )); } MarkRanges getNewTask(RangesInDataPartDescription description); diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 25a4579c73e..1620ba98d58 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -110,6 +110,9 @@ static void collectColumns(const RPNBuilderTreeNode & node, const NameSet & colu if (node.isConstant()) return; + if (node.isSubqueryOrSet()) + return; + if (!node.isFunction()) { auto column_name = node.getColumnName(); diff --git a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp index 4c2def34e8a..ba55fb400ca 100644 --- a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp +++ b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp @@ -267,7 +267,7 @@ bool MutateFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWrit /** With `ZSESSIONEXPIRED` or `ZOPERATIONTIMEOUT`, we can inadvertently roll back local changes to the parts. * This is not a problem, because in this case the entry will remain in the queue, and we will try again. */ - storage.merge_selecting_task->schedule(); + finish_callback = [storage_ptr = &storage]() { storage_ptr->merge_selecting_task->schedule(); }; ProfileEvents::increment(ProfileEvents::ReplicatedPartMutations); write_part_log({}); diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index b98b0844ee7..f4a071b8f27 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -153,20 +153,22 @@ static void splitAndModifyMutationCommands( /// But we don't know for sure what happened. auto part_metadata_version = part->getMetadataVersion(); auto table_metadata_version = metadata_snapshot->getMetadataVersion(); - /// StorageMergeTree does not have metadata version - if (table_metadata_version <= part_metadata_version && part->storage.supportsReplication()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Part {} with metadata version {} contains column {} that is absent " - "in table {} with metadata version {}", - part->name, part_metadata_version, column.name, - part->storage.getStorageID().getNameForLogs(), table_metadata_version); - if (part_metadata_version < table_metadata_version) + bool allow_equal_versions = part_metadata_version == table_metadata_version && part->old_part_with_no_metadata_version_on_disk; + if (part_metadata_version < table_metadata_version || allow_equal_versions) { LOG_WARNING(log, "Ignoring column {} from part {} with metadata version {} because there is no such column " "in table {} with metadata version {}. Assuming the column was dropped", column.name, part->name, part_metadata_version, part->storage.getStorageID().getNameForLogs(), table_metadata_version); continue; } + + /// StorageMergeTree does not have metadata version + if (part->storage.supportsReplication()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Part {} with metadata version {} contains column {} that is absent " + "in table {} with metadata version {}", + part->name, part_metadata_version, column.name, + part->storage.getStorageID().getNameForLogs(), table_metadata_version); } for_interpreter.emplace_back( @@ -850,7 +852,7 @@ struct MutationContext MergeTreeTransactionPtr txn; - MergeTreeData::HardlinkedFiles hardlinked_files; + HardlinkedFiles hardlinked_files; bool need_prefix = true; @@ -1801,7 +1803,12 @@ bool MutateTask::prepare() if (ctx->need_prefix) prefix = "tmp_clone_"; - auto [part, lock] = ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, prefix, ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn, &ctx->hardlinked_files, false, files_to_copy_instead_of_hardlinks); + IDataPartStorage::ClonePartParams clone_params + { + .txn = ctx->txn, .hardlinked_files = &ctx->hardlinked_files, + .files_to_copy_instead_of_hardlinks = std::move(files_to_copy_instead_of_hardlinks), .keep_metadata_version = true + }; + auto [part, lock] = ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, prefix, ctx->future_part->part_info, ctx->metadata_snapshot, clone_params); part->getDataPartStorage().beginTransaction(); ctx->temporary_directory_lock = std::move(lock); @@ -1930,7 +1937,7 @@ bool MutateTask::prepare() return true; } -const MergeTreeData::HardlinkedFiles & MutateTask::getHardlinkedFiles() const +const HardlinkedFiles & MutateTask::getHardlinkedFiles() const { return ctx->hardlinked_files; } diff --git a/src/Storages/MergeTree/MutateTask.h b/src/Storages/MergeTree/MutateTask.h index 54ad996ad4c..dc21df018d7 100644 --- a/src/Storages/MergeTree/MutateTask.h +++ b/src/Storages/MergeTree/MutateTask.h @@ -45,7 +45,7 @@ public: return promise.get_future(); } - const MergeTreeData::HardlinkedFiles & getHardlinkedFiles() const; + const HardlinkedFiles & getHardlinkedFiles() const; private: diff --git a/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp b/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp index 2814d13cff0..57cd91cc995 100644 --- a/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp +++ b/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp @@ -102,7 +102,6 @@ public: explicit DefaultCoordinator(size_t replicas_count_) : ParallelReplicasReadingCoordinator::ImplInterface(replicas_count_) - , announcements(replicas_count_) , reading_state(replicas_count_) { } @@ -119,7 +118,6 @@ public: PartitionToBlockRanges partitions; size_t sent_initial_requests{0}; - std::vector announcements; Parts all_parts_to_read; /// Contains only parts which we haven't started to read from diff --git a/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp b/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp index 7c0aedf699b..bb6462b3058 100644 --- a/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp +++ b/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp @@ -250,8 +250,8 @@ std::unordered_map PartMetadataManagerWit ErrorCodes::CORRUPTED_DATA, "Checksums doesn't match in part {} for {}. Expected: {}. Found {}.", part->name, file_path, - getHexUIntUppercase(disk_checksum.first) + getHexUIntUppercase(disk_checksum.second), - getHexUIntUppercase(cache_checksums[i].first) + getHexUIntUppercase(cache_checksums[i].second)); + getHexUIntUppercase(disk_checksum), + getHexUIntUppercase(cache_checksums[i])); disk_checksums.push_back(disk_checksum); continue; @@ -287,8 +287,8 @@ std::unordered_map PartMetadataManagerWit ErrorCodes::CORRUPTED_DATA, "Checksums doesn't match in projection part {} {}. Expected: {}. Found {}.", part->name, proj_name, - getHexUIntUppercase(disk_checksum.first) + getHexUIntUppercase(disk_checksum.second), - getHexUIntUppercase(cache_checksums[i].first) + getHexUIntUppercase(cache_checksums[i].second)); + getHexUIntUppercase(disk_checksum), + getHexUIntUppercase(cache_checksums[i])); disk_checksums.push_back(disk_checksum); } return results; diff --git a/src/Storages/MergeTree/RPNBuilder.cpp b/src/Storages/MergeTree/RPNBuilder.cpp index 0b42111be45..a0c96c13d59 100644 --- a/src/Storages/MergeTree/RPNBuilder.cpp +++ b/src/Storages/MergeTree/RPNBuilder.cpp @@ -181,6 +181,21 @@ bool RPNBuilderTreeNode::isConstant() const } } +bool RPNBuilderTreeNode::isSubqueryOrSet() const +{ + if (ast_node) + { + return + typeid_cast(ast_node) || + typeid_cast(ast_node); + } + else + { + const auto * node_without_alias = getNodeWithoutAlias(dag_node); + return node_without_alias->result_type->getTypeId() == TypeIndex::Set; + } +} + ColumnWithTypeAndName RPNBuilderTreeNode::getConstantColumn() const { if (!isConstant()) diff --git a/src/Storages/MergeTree/RPNBuilder.h b/src/Storages/MergeTree/RPNBuilder.h index 6f624d93cd6..9eeb6deefd5 100644 --- a/src/Storages/MergeTree/RPNBuilder.h +++ b/src/Storages/MergeTree/RPNBuilder.h @@ -98,6 +98,8 @@ public: /// Is node constant bool isConstant() const; + bool isSubqueryOrSet() const; + /** Get constant as constant column. * Node must be constant before calling these method, otherwise logical exception is thrown. */ diff --git a/src/Storages/MergeTree/ReplicatedMergeMutateTaskBase.h b/src/Storages/MergeTree/ReplicatedMergeMutateTaskBase.h index a7bf1290274..1e7f9834245 100644 --- a/src/Storages/MergeTree/ReplicatedMergeMutateTaskBase.h +++ b/src/Storages/MergeTree/ReplicatedMergeMutateTaskBase.h @@ -21,10 +21,10 @@ public: StorageReplicatedMergeTree & storage_, ReplicatedMergeTreeQueue::SelectedEntryPtr & selected_entry_, IExecutableTask::TaskResultCallback & task_result_callback_) - : selected_entry(selected_entry_) + : storage(storage_) + , selected_entry(selected_entry_) , entry(*selected_entry->log_entry) , log(log_) - , storage(storage_) /// This is needed to ask an asssignee to assign a new merge/mutate operation /// It takes bool argument and true means that current task is successfully executed. , task_result_callback(task_result_callback_) @@ -52,13 +52,18 @@ protected: /// Will execute a part of inner MergeTask or MutateTask virtual bool executeInnerTask() = 0; + StorageReplicatedMergeTree & storage; + + /// A callback to reschedule merge_selecting_task after destroying merge_mutate_entry + /// The order is important, because merge_selecting_task may rely on the number of entries in MergeList + scope_guard finish_callback; + /// This is important not to execute the same mutation in parallel /// selected_entry is a RAII class, so the time of living must be the same as for the whole task ReplicatedMergeTreeQueue::SelectedEntryPtr selected_entry; ReplicatedMergeTreeLogEntry & entry; MergeList::EntryPtr merge_mutate_entry{nullptr}; Poco::Logger * log; - StorageReplicatedMergeTree & storage; /// ProfileEvents for current part will be stored here ProfileEvents::Counters profile_counters; ContextMutablePtr task_context; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index 3ba3048b812..792843cbe18 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -1448,6 +1448,15 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry( LOG_TRACE(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.alter_version, head_alter); return false; } + + auto database_name = storage.getStorageID().database_name; + auto database = DatabaseCatalog::instance().getDatabase(database_name); + if (!database->canExecuteReplicatedMetadataAlter()) + { + LOG_TRACE(LogToStr(out_postpone_reason, log), "Cannot execute alter metadata {} with version {} " + "because database {} cannot process metadata alters now", entry.znode_name, entry.alter_version, database_name); + return false; + } } /// If this MUTATE_PART is part of alter modify/drop query, than we have to execute them one by one diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index c93077fb4fb..4128654a632 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include #include @@ -54,6 +56,9 @@ struct ReplicatedMergeTreeSinkImpl::DelayedChunk UInt64 elapsed_ns; BlockIDsType block_id; BlockWithPartition block_with_partition; + /// Some merging algorithms can mofidy the block which loses the information about the async insert offsets + /// when preprocessing or filtering data for asnyc inserts deduplication we want to use the initial, unmerged block + std::optional unmerged_block_with_partition; std::unordered_map> block_id_to_offset_idx; ProfileEvents::Counters part_counters; @@ -63,12 +68,14 @@ struct ReplicatedMergeTreeSinkImpl::DelayedChunk UInt64 elapsed_ns_, BlockIDsType && block_id_, BlockWithPartition && block_, + std::optional && unmerged_block_with_partition_, ProfileEvents::Counters && part_counters_) : log(log_), temp_part(std::move(temp_part_)), elapsed_ns(elapsed_ns_), block_id(std::move(block_id_)), block_with_partition(std::move(block_)), + unmerged_block_with_partition(std::move(unmerged_block_with_partition_)), part_counters(std::move(part_counters_)) { initBlockIDMap(); @@ -113,6 +120,7 @@ struct ReplicatedMergeTreeSinkImpl::DelayedChunk { if constexpr (async_insert) { + auto * current_block_with_partition = unmerged_block_with_partition.has_value() ? &unmerged_block_with_partition.value() : &block_with_partition; std::vector offset_idx; for (const auto & raw_path : block_paths) { @@ -127,14 +135,14 @@ struct ReplicatedMergeTreeSinkImpl::DelayedChunk } std::sort(offset_idx.begin(), offset_idx.end()); - auto & offsets = block_with_partition.offsets; + auto & offsets = current_block_with_partition->offsets; size_t idx = 0, remove_count = 0; auto it = offset_idx.begin(); std::vector new_offsets; std::vector new_block_ids; /// construct filter - size_t rows = block_with_partition.block.rows(); + size_t rows = current_block_with_partition->block.rows(); auto filter_col = ColumnUInt8::create(rows, 1u); ColumnUInt8::Container & vec = filter_col->getData(); UInt8 * pos = vec.data(); @@ -162,18 +170,21 @@ struct ReplicatedMergeTreeSinkImpl::DelayedChunk LOG_TRACE(log, "New block IDs: {}, new offsets: {}, size: {}", toString(new_block_ids), toString(new_offsets), new_offsets.size()); - block_with_partition.offsets = std::move(new_offsets); + current_block_with_partition->offsets = std::move(new_offsets); block_id = std::move(new_block_ids); - auto cols = block_with_partition.block.getColumns(); + auto cols = current_block_with_partition->block.getColumns(); for (auto & col : cols) { col = col->filter(vec, rows - remove_count); } - block_with_partition.block.setColumns(cols); + current_block_with_partition->block.setColumns(cols); - LOG_TRACE(log, "New block rows {}", block_with_partition.block.rows()); + LOG_TRACE(log, "New block rows {}", current_block_with_partition->block.rows()); initBlockIDMap(); + + if (unmerged_block_with_partition.has_value()) + block_with_partition.block = unmerged_block_with_partition->block; } else { @@ -202,7 +213,7 @@ std::vector testSelfDeduplicate(std::vector data, std::vector::DelayedChunk::Partition part( - &Poco::Logger::get("testSelfDeduplicate"), MergeTreeDataWriter::TemporaryPart(), 0, std::move(hashes), std::move(block1), std::move(profile_counters)); + &Poco::Logger::get("testSelfDeduplicate"), MergeTreeDataWriter::TemporaryPart(), 0, std::move(hashes), std::move(block1), std::nullopt, std::move(profile_counters)); part.filterSelfDuplicate(); @@ -235,8 +246,10 @@ namespace { SipHash hash; for (size_t i = start; i < offset; ++i) + { for (const auto & col : cols) col->updateHashWithValue(i, hash); + } union { char bytes[16]; @@ -432,8 +445,18 @@ void ReplicatedMergeTreeSinkImpl::consume(Chunk chunk) ProfileEvents::Counters part_counters; auto profile_events_scope = std::make_unique(&part_counters); - /// Write part to the filesystem under temporary name. Calculate a checksum. + /// Some merging algorithms can mofidy the block which loses the information about the async insert offsets + /// when preprocessing or filtering data for asnyc inserts deduplication we want to use the initial, unmerged block + std::optional unmerged_block; + if constexpr (async_insert) + { + /// we copy everything but offsets which we move because they are only used by async insert + if (settings.optimize_on_insert && storage.writer.getMergingMode() != MergeTreeData::MergingParams::Mode::Ordinary) + unmerged_block.emplace(Block(current_block.block), Row(current_block.partition), std::move(current_block.offsets)); + } + + /// Write part to the filesystem under temporary name. Calculate a checksum. auto temp_part = storage.writer.writeTempPart(current_block, metadata_snapshot, context); /// If optimize_on_insert setting is true, current_block could become empty after merge @@ -446,31 +469,35 @@ void ReplicatedMergeTreeSinkImpl::consume(Chunk chunk) if constexpr (async_insert) { /// TODO consider insert_deduplication_token - block_id = getHashesForBlocks(current_block, temp_part.part->info.partition_id); + block_id = getHashesForBlocks(unmerged_block.has_value() ? *unmerged_block : current_block, temp_part.part->info.partition_id); LOG_TRACE(log, "async insert part, part id {}, block id {}, offsets {}, size {}", temp_part.part->info.partition_id, toString(block_id), toString(current_block.offsets), current_block.offsets.size()); } - else if (deduplicate) - { - String block_dedup_token; - - /// We add the hash from the data and partition identifier to deduplication ID. - /// That is, do not insert the same data to the same partition twice. - - const String & dedup_token = settings.insert_deduplication_token; - if (!dedup_token.empty()) - { - /// multiple blocks can be inserted within the same insert query - /// an ordinal number is added to dedup token to generate a distinctive block id for each block - block_dedup_token = fmt::format("{}_{}", dedup_token, chunk_dedup_seqnum); - ++chunk_dedup_seqnum; - } - - block_id = temp_part.part->getZeroLevelPartBlockID(block_dedup_token); - LOG_DEBUG(log, "Wrote block with ID '{}', {} rows{}", block_id, current_block.block.rows(), quorumLogMessage(replicas_num)); - } else { - LOG_DEBUG(log, "Wrote block with {} rows{}", current_block.block.rows(), quorumLogMessage(replicas_num)); + + if (deduplicate) + { + String block_dedup_token; + + /// We add the hash from the data and partition identifier to deduplication ID. + /// That is, do not insert the same data to the same partition twice. + + const String & dedup_token = settings.insert_deduplication_token; + if (!dedup_token.empty()) + { + /// multiple blocks can be inserted within the same insert query + /// an ordinal number is added to dedup token to generate a distinctive block id for each block + block_dedup_token = fmt::format("{}_{}", dedup_token, chunk_dedup_seqnum); + ++chunk_dedup_seqnum; + } + + block_id = temp_part.part->getZeroLevelPartBlockID(block_dedup_token); + LOG_DEBUG(log, "Wrote block with ID '{}', {} rows{}", block_id, current_block.block.rows(), quorumLogMessage(replicas_num)); + } + else + { + LOG_DEBUG(log, "Wrote block with {} rows{}", current_block.block.rows(), quorumLogMessage(replicas_num)); + } } profile_events_scope.reset(); @@ -501,6 +528,7 @@ void ReplicatedMergeTreeSinkImpl::consume(Chunk chunk) elapsed_ns, std::move(block_id), std::move(current_block), + std::move(unmerged_block), std::move(part_counters) /// profile_events_scope must be reset here. )); } @@ -603,7 +631,6 @@ template void ReplicatedMergeTreeSinkImpl::writeExistingPart(MergeTreeData::MutableDataPartPtr & part) { /// NOTE: No delay in this case. That's Ok. - auto origin_zookeeper = storage.getZooKeeper(); assertSessionIsNotExpired(origin_zookeeper); auto zookeeper = std::make_shared(origin_zookeeper); diff --git a/src/Storages/MergeTree/RequestResponse.cpp b/src/Storages/MergeTree/RequestResponse.cpp index 05930d5a4c4..2ce0e20dcd2 100644 --- a/src/Storages/MergeTree/RequestResponse.cpp +++ b/src/Storages/MergeTree/RequestResponse.cpp @@ -51,7 +51,7 @@ String ParallelReadRequest::describe() const return result; } -void ParallelReadRequest::deserialize(ReadBuffer & in) +ParallelReadRequest ParallelReadRequest::deserialize(ReadBuffer & in) { UInt64 version; readIntBinary(version, in); @@ -60,12 +60,24 @@ void ParallelReadRequest::deserialize(ReadBuffer & in) "from replicas differ. Got: {}, supported version: {}", version, DBMS_PARALLEL_REPLICAS_PROTOCOL_VERSION); + CoordinationMode mode; + size_t replica_num; + size_t min_number_of_marks; + RangesInDataPartsDescription description; + uint8_t mode_candidate; readIntBinary(mode_candidate, in); mode = validateAndGet(mode_candidate); readIntBinary(replica_num, in); readIntBinary(min_number_of_marks, in); description.deserialize(in); + + return ParallelReadRequest( + mode, + replica_num, + min_number_of_marks, + std::move(description) + ); } void ParallelReadRequest::merge(ParallelReadRequest & other) @@ -125,7 +137,7 @@ String InitialAllRangesAnnouncement::describe() return result; } -void InitialAllRangesAnnouncement::deserialize(ReadBuffer & in) +InitialAllRangesAnnouncement InitialAllRangesAnnouncement::deserialize(ReadBuffer & in) { UInt64 version; readIntBinary(version, in); @@ -134,11 +146,21 @@ void InitialAllRangesAnnouncement::deserialize(ReadBuffer & in) "from replicas differ. Got: {}, supported version: {}", version, DBMS_PARALLEL_REPLICAS_PROTOCOL_VERSION); + CoordinationMode mode; + RangesInDataPartsDescription description; + size_t replica_num; + uint8_t mode_candidate; readIntBinary(mode_candidate, in); mode = validateAndGet(mode_candidate); description.deserialize(in); readIntBinary(replica_num, in); + + return InitialAllRangesAnnouncement { + mode, + description, + replica_num + }; } } diff --git a/src/Storages/MergeTree/RequestResponse.h b/src/Storages/MergeTree/RequestResponse.h index 7e5563c0553..3a5bfde6c20 100644 --- a/src/Storages/MergeTree/RequestResponse.h +++ b/src/Storages/MergeTree/RequestResponse.h @@ -40,21 +40,40 @@ struct PartBlockRange } }; +/// ParallelReadRequest is used by remote replicas during parallel read +/// to signal an initiator that they need more marks to read. struct ParallelReadRequest { + /// No default constructor, you must initialize all fields at once. + + ParallelReadRequest( + CoordinationMode mode_, + size_t replica_num_, + size_t min_number_of_marks_, + RangesInDataPartsDescription description_) + : mode(mode_) + , replica_num(replica_num_) + , min_number_of_marks(min_number_of_marks_) + , description(std::move(description_)) + {} + CoordinationMode mode; size_t replica_num; size_t min_number_of_marks; - - /// Extension for ordered mode + /// Extension for Ordered (InOrder or ReverseOrder) mode + /// Contains only data part names without mark ranges. RangesInDataPartsDescription description; void serialize(WriteBuffer & out) const; String describe() const; - void deserialize(ReadBuffer & in); + static ParallelReadRequest deserialize(ReadBuffer & in); void merge(ParallelReadRequest & other); }; +/// ParallelReadResponse is used by an initiator to tell +/// remote replicas about what to read during parallel reading. +/// Additionally contains information whether there are more available +/// marks to read (whether it is the last packet or not). struct ParallelReadResponse { bool finish{false}; @@ -66,15 +85,30 @@ struct ParallelReadResponse }; +/// The set of parts (their names) along with ranges to read which is sent back +/// to the initiator by remote replicas during parallel reading. +/// Additionally contains an identifier (replica_num) plus +/// the reading algorithm chosen (Default, InOrder or ReverseOrder). struct InitialAllRangesAnnouncement { + /// No default constructor, you must initialize all fields at once. + + InitialAllRangesAnnouncement( + CoordinationMode mode_, + RangesInDataPartsDescription description_, + size_t replica_num_) + : mode(mode_) + , description(description_) + , replica_num(replica_num_) + {} + CoordinationMode mode; RangesInDataPartsDescription description; size_t replica_num; void serialize(WriteBuffer & out) const; String describe(); - void deserialize(ReadBuffer & in); + static InitialAllRangesAnnouncement deserialize(ReadBuffer & in); }; diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index 8fbc64b7a24..13d6909fd52 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -255,6 +255,8 @@ struct SelectQueryInfo Block minmax_count_projection_block; MergeTreeDataSelectAnalysisResultPtr merge_tree_select_result_ptr; + bool parallel_replicas_disabled = false; + bool is_parameterized_view = false; NameToNameMap parameterized_view_values; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index c02c96f62be..4653d81594f 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -209,7 +209,9 @@ void StorageMergeTree::read( size_t max_block_size, size_t num_streams) { - if (local_context->canUseParallelReplicasOnInitiator() && local_context->getSettingsRef().parallel_replicas_for_non_replicated_merge_tree) + if (!query_info.parallel_replicas_disabled && + local_context->canUseParallelReplicasOnInitiator() && + local_context->getSettingsRef().parallel_replicas_for_non_replicated_merge_tree) { auto table_id = getStorageID(); @@ -240,7 +242,10 @@ void StorageMergeTree::read( } else { - const bool enable_parallel_reading = local_context->canUseParallelReplicasOnFollower() && local_context->getSettingsRef().parallel_replicas_for_non_replicated_merge_tree; + const bool enable_parallel_reading = + !query_info.parallel_replicas_disabled && + local_context->canUseParallelReplicasOnFollower() && + local_context->getSettingsRef().parallel_replicas_for_non_replicated_merge_tree; if (auto plan = reader.read( column_names, storage_snapshot, query_info, @@ -929,44 +934,70 @@ MergeMutateSelectedEntryPtr StorageMergeTree::selectPartsToMerge( SelectPartsDecision select_decision = SelectPartsDecision::CANNOT_SELECT; - if (!canEnqueueBackgroundTask()) + auto is_background_memory_usage_ok = [](String * disable_reason) -> bool { - if (out_disable_reason) - *out_disable_reason = fmt::format("Current background tasks memory usage ({}) is more than the limit ({})", + if (canEnqueueBackgroundTask()) + return true; + if (disable_reason) + *disable_reason = fmt::format("Current background tasks memory usage ({}) is more than the limit ({})", formatReadableSizeWithBinarySuffix(background_memory_tracker.get()), formatReadableSizeWithBinarySuffix(background_memory_tracker.getSoftLimit())); - } - else if (partition_id.empty()) - { - UInt64 max_source_parts_size = merger_mutator.getMaxSourcePartsSizeForMerge(); - bool merge_with_ttl_allowed = getTotalMergesWithTTLInMergeList() < data_settings->max_number_of_merges_with_ttl_in_pool; + return false; + }; - /// TTL requirements is much more strict than for regular merge, so - /// if regular not possible, than merge with ttl is not also not - /// possible. - if (max_source_parts_size > 0) + if (partition_id.empty()) + { + if (is_background_memory_usage_ok(out_disable_reason)) { - select_decision = merger_mutator.selectPartsToMerge( - future_part, - aggressive, - max_source_parts_size, - can_merge, - merge_with_ttl_allowed, - txn, - out_disable_reason); + UInt64 max_source_parts_size = merger_mutator.getMaxSourcePartsSizeForMerge(); + bool merge_with_ttl_allowed = getTotalMergesWithTTLInMergeList() < data_settings->max_number_of_merges_with_ttl_in_pool; + + /// TTL requirements is much more strict than for regular merge, so + /// if regular not possible, than merge with ttl is not also not + /// possible. + if (max_source_parts_size > 0) + { + select_decision = merger_mutator.selectPartsToMerge( + future_part, + aggressive, + max_source_parts_size, + can_merge, + merge_with_ttl_allowed, + txn, + out_disable_reason); + } + else if (out_disable_reason) + *out_disable_reason = "Current value of max_source_parts_size is zero"; } - else if (out_disable_reason) - *out_disable_reason = "Current value of max_source_parts_size is zero"; } else { while (true) { - select_decision = merger_mutator.selectAllPartsToMergeWithinPartition( - future_part, can_merge, partition_id, final, metadata_snapshot, txn, out_disable_reason, optimize_skip_merged_partitions); auto timeout_ms = getSettings()->lock_acquire_timeout_for_background_operations.totalMilliseconds(); auto timeout = std::chrono::milliseconds(timeout_ms); + if (!is_background_memory_usage_ok(out_disable_reason)) + { + constexpr auto poll_interval = std::chrono::seconds(1); + Int64 attempts = timeout / poll_interval; + bool ok = false; + for (Int64 i = 0; i < attempts; ++i) + { + std::this_thread::sleep_for(poll_interval); + if (is_background_memory_usage_ok(out_disable_reason)) + { + ok = true; + break; + } + } + if (!ok) + break; + } + + select_decision = merger_mutator.selectAllPartsToMergeWithinPartition( + future_part, can_merge, partition_id, final, metadata_snapshot, txn, out_disable_reason, optimize_skip_merged_partitions); + /// If final - we will wait for currently processing merges to finish and continue. if (final && select_decision != SelectPartsDecision::SELECTED @@ -1932,7 +1963,8 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con Int64 temp_index = insert_increment.get(); MergeTreePartInfo dst_part_info(partition_id, temp_index, temp_index, src_part->info.level); - auto [dst_part, part_lock] = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, my_metadata_snapshot, local_context->getCurrentTransaction(), {}, false, {}); + IDataPartStorage::ClonePartParams clone_params{.txn = local_context->getCurrentTransaction()}; + auto [dst_part, part_lock] = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, my_metadata_snapshot, clone_params); dst_parts.emplace_back(std::move(dst_part)); dst_parts_locks.emplace_back(std::move(part_lock)); } @@ -2030,7 +2062,8 @@ void StorageMergeTree::movePartitionToTable(const StoragePtr & dest_table, const Int64 temp_index = insert_increment.get(); MergeTreePartInfo dst_part_info(partition_id, temp_index, temp_index, src_part->info.level); - auto [dst_part, part_lock] = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, local_context->getCurrentTransaction(), {}, false, {}); + IDataPartStorage::ClonePartParams clone_params{.txn = local_context->getCurrentTransaction()}; + auto [dst_part, part_lock] = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, clone_params); dst_parts.emplace_back(std::move(dst_part)); dst_parts_locks.emplace_back(std::move(part_lock)); } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index b1ba06c77f9..dac9e6923a5 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -1578,7 +1578,7 @@ void StorageReplicatedMergeTree::checkPartChecksumsAndAddCommitOps(const zkutil: } MergeTreeData::DataPartsVector StorageReplicatedMergeTree::checkPartChecksumsAndCommit(Transaction & transaction, - const MutableDataPartPtr & part, std::optional hardlinked_files, bool replace_zero_copy_lock) + const MutableDataPartPtr & part, std::optional hardlinked_files, bool replace_zero_copy_lock) { auto zookeeper = getZooKeeper(); @@ -2183,7 +2183,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) /// A replica that will be used to fetch part String replica; - MergeTreeData::HardlinkedFiles hardlinked_files; + HardlinkedFiles hardlinked_files; scope_guard temporary_part_lock; }; @@ -2433,17 +2433,26 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) static const String TMP_PREFIX = "tmp_replace_from_"; - std::vector hardlinked_files_for_parts; - auto obtain_part = [&] (PartDescriptionPtr & part_desc) { - if (part_desc->src_table_part) + /// Fetches with zero-copy-replication are cheap, but cloneAndLoadDataPartOnSameDisk will do full copy. + /// It's okay to check the setting for current table and disk for the source table, because src and dst part are on the same disk. + bool prefer_fetch_from_other_replica = !part_desc->replica.empty() && storage_settings_ptr->allow_remote_fs_zero_copy_replication + && part_desc->src_table_part && part_desc->src_table_part->isStoredOnRemoteDiskWithZeroCopySupport(); + + if (part_desc->src_table_part && !prefer_fetch_from_other_replica) { if (part_desc->checksum_hex != part_desc->src_table_part->checksums.getTotalChecksumHex()) throw Exception(ErrorCodes::UNFINISHED, "Checksums of {} is suddenly changed", part_desc->src_table_part->name); + bool zero_copy_enabled = dynamic_cast(source_table.get())->getSettings()->allow_remote_fs_zero_copy_replication; + IDataPartStorage::ClonePartParams clone_params + { + .copy_instead_of_hardlink = zero_copy_enabled && part_desc->src_table_part->isStoredOnRemoteDiskWithZeroCopySupport(), + .metadata_version_to_write = metadata_snapshot->getMetadataVersion() + }; auto [res_part, temporary_part_lock] = cloneAndLoadDataPartOnSameDisk( - part_desc->src_table_part, TMP_PREFIX + "clone_", part_desc->new_part_info, metadata_snapshot, NO_TRANSACTION_PTR, &part_desc->hardlinked_files, false, {}); + part_desc->src_table_part, TMP_PREFIX + "clone_", part_desc->new_part_info, metadata_snapshot, clone_params); part_desc->res_part = std::move(res_part); part_desc->temporary_part_lock = std::move(temporary_part_lock); } @@ -2891,8 +2900,7 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo desired_checksums = MinimalisticDataPartChecksums::deserializeFrom(desired_checksums_str); } - const auto [lo, hi] = desired_checksums.hash_of_all_files; - log_entry.part_checksum = getHexUIntUppercase(hi) + getHexUIntUppercase(lo); + log_entry.part_checksum = getHexUIntUppercase(desired_checksums.hash_of_all_files); } else { @@ -4270,6 +4278,11 @@ bool StorageReplicatedMergeTree::fetchPart( profile_events_scope.getSnapshot()); }; + auto is_zero_copy_part = [&settings_ptr](const auto & data_part) + { + return settings_ptr->allow_remote_fs_zero_copy_replication && data_part->isStoredOnRemoteDiskWithZeroCopySupport(); + }; + DataPartPtr part_to_clone; { /// If the desired part is a result of a part mutation, try to find the source part and compare @@ -4281,7 +4294,7 @@ bool StorageReplicatedMergeTree::fetchPart( auto source_part = getActiveContainingPart(covered_part_info); /// Fetch for zero-copy replication is cheap and straightforward, so we don't use local clone here - if (source_part && (!settings_ptr->allow_remote_fs_zero_copy_replication || !source_part->getDataPartStorage().supportZeroCopyReplication())) + if (source_part && !is_zero_copy_part(source_part)) { auto source_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksums( source_part->getColumns(), source_part->checksums); @@ -4330,14 +4343,15 @@ bool StorageReplicatedMergeTree::fetchPart( InterserverCredentialsPtr credentials; std::optional tagger_ptr; std::function get_part; - MergeTreeData::HardlinkedFiles hardlinked_files; scope_guard part_directory_lock; if (part_to_clone) { get_part = [&, part_to_clone]() { - auto [cloned_part, lock] = cloneAndLoadDataPartOnSameDisk(part_to_clone, "tmp_clone_", part_info, metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files, false, {}); + chassert(!is_zero_copy_part(part_to_clone)); + IDataPartStorage::ClonePartParams clone_params{ .keep_metadata_version = true }; + auto [cloned_part, lock] = cloneAndLoadDataPartOnSameDisk(part_to_clone, "tmp_clone_", part_info, metadata_snapshot, clone_params); part_directory_lock = std::move(lock); return cloned_part; }; @@ -4387,7 +4401,8 @@ bool StorageReplicatedMergeTree::fetchPart( Transaction transaction(*this, NO_TRANSACTION_RAW); renameTempPartAndReplace(part, transaction); - replaced_parts = checkPartChecksumsAndCommit(transaction, part, hardlinked_files, !part_to_clone); + chassert(!part_to_clone || !is_zero_copy_part(part)); + replaced_parts = checkPartChecksumsAndCommit(transaction, part, /*hardlinked_files*/ {}, /*replace_zero_copy_lock*/ true); /** If a quorum is tracked for this part, you must update it. * If you do not have time, in case of losing the session, when you restart the server - see the `ReplicatedMergeTreeRestartingThread::updateQuorumIfWeHavePart` method. @@ -7255,7 +7270,6 @@ void StorageReplicatedMergeTree::replacePartitionFrom( assert(replace == !LogEntry::ReplaceRangeEntry::isMovePartitionOrAttachFrom(drop_range)); String drop_range_fake_part_name = getPartNamePossiblyFake(format_version, drop_range); - std::vector hardlinked_files_for_parts; for (const auto & src_part : src_all_parts) { @@ -7286,19 +7300,21 @@ void StorageReplicatedMergeTree::replacePartitionFrom( UInt64 index = lock->getNumber(); MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level); - MergeTreeData::HardlinkedFiles hardlinked_files; - bool copy_instead_of_hardlink = storage_settings_ptr->allow_remote_fs_zero_copy_replication - && src_part->isStoredOnRemoteDiskWithZeroCopySupport(); - - auto [dst_part, part_lock] = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files, copy_instead_of_hardlink, {}); + bool zero_copy_enabled = storage_settings_ptr->allow_remote_fs_zero_copy_replication + || dynamic_cast(source_table.get())->getSettings()->allow_remote_fs_zero_copy_replication; + IDataPartStorage::ClonePartParams clone_params + { + .copy_instead_of_hardlink = zero_copy_enabled && src_part->isStoredOnRemoteDiskWithZeroCopySupport(), + .metadata_version_to_write = metadata_snapshot->getMetadataVersion() + }; + auto [dst_part, part_lock] = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, metadata_snapshot, clone_params); src_parts.emplace_back(src_part); dst_parts.emplace_back(dst_part); dst_parts_locks.emplace_back(std::move(part_lock)); ephemeral_locks.emplace_back(std::move(*lock)); block_id_paths.emplace_back(block_id_path); part_checksums.emplace_back(hash_hex); - hardlinked_files_for_parts.emplace_back(hardlinked_files); } ReplicatedMergeTreeLogEntryData entry; @@ -7359,8 +7375,8 @@ void StorageReplicatedMergeTree::replacePartitionFrom( renameTempPartAndReplaceUnlocked(part, transaction, data_parts_lock); } - for (size_t i = 0; i < dst_parts.size(); ++i) - lockSharedData(*dst_parts[i], false, hardlinked_files_for_parts[i]); + for (const auto & dst_part : dst_parts) + lockSharedData(*dst_part, false, /*hardlinked_files*/ {}); Coordination::Error code = zookeeper->tryMulti(ops, op_results); if (code == Coordination::Error::ZOK) @@ -7501,7 +7517,6 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta String dest_alter_partition_version_path = dest_table_storage->zookeeper_path + "/alter_partition_version"; Coordination::Stat dest_alter_partition_version_stat; zookeeper->get(dest_alter_partition_version_path, &dest_alter_partition_version_stat); - std::vector hardlinked_files_for_parts; std::vector temporary_parts_locks; for (const auto & src_part : src_all_parts) @@ -7524,12 +7539,14 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta UInt64 index = lock->getNumber(); MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level); - MergeTreeData::HardlinkedFiles hardlinked_files; - - bool copy_instead_of_hardlink = storage_settings_ptr->allow_remote_fs_zero_copy_replication - && src_part->isStoredOnRemoteDiskWithZeroCopySupport(); - - auto [dst_part, dst_part_lock] = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files, copy_instead_of_hardlink, {}); + bool zero_copy_enabled = storage_settings_ptr->allow_remote_fs_zero_copy_replication + || dynamic_cast(dest_table.get())->getSettings()->allow_remote_fs_zero_copy_replication; + IDataPartStorage::ClonePartParams clone_params + { + .copy_instead_of_hardlink = zero_copy_enabled && src_part->isStoredOnRemoteDiskWithZeroCopySupport(), + .metadata_version_to_write = dest_metadata_snapshot->getMetadataVersion() + }; + auto [dst_part, dst_part_lock] = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, clone_params); src_parts.emplace_back(src_part); dst_parts.emplace_back(dst_part); @@ -7537,7 +7554,6 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta ephemeral_locks.emplace_back(std::move(*lock)); block_id_paths.emplace_back(block_id_path); part_checksums.emplace_back(hash_hex); - hardlinked_files_for_parts.emplace_back(hardlinked_files); } ReplicatedMergeTreeLogEntryData entry_delete; @@ -7605,8 +7621,8 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta for (auto & part : dst_parts) dest_table_storage->renameTempPartAndReplaceUnlocked(part, transaction, dest_data_parts_lock); - for (size_t i = 0; i < dst_parts.size(); ++i) - dest_table_storage->lockSharedData(*dst_parts[i], false, hardlinked_files_for_parts[i]); + for (const auto & dst_part : dst_parts) + dest_table_storage->lockSharedData(*dst_part, false, /*hardlinked_files*/ {}); Coordination::Error code = zookeeper->tryMulti(ops, op_results); if (code == Coordination::Error::ZBADVERSION) diff --git a/src/Storages/System/StorageSystemContributors.generated.cpp b/src/Storages/System/StorageSystemContributors.generated.cpp index f83ee3197fe..f84c554afc0 100644 --- a/src/Storages/System/StorageSystemContributors.generated.cpp +++ b/src/Storages/System/StorageSystemContributors.generated.cpp @@ -226,6 +226,7 @@ const char * auto_contributors[] { "Carbyn", "Carlos Rodríguez Hernández", "Caspian", + "Chang Chen", "Chao Ma", "Chao Wang", "CheSema", @@ -291,6 +292,7 @@ const char * auto_contributors[] { "Dmitry Belyavtsev", "Dmitry Bilunov", "Dmitry Galuza", + "Dmitry Kardymon", "Dmitry Krylov", "Dmitry Luhtionov", "Dmitry Moskowski", @@ -408,6 +410,7 @@ const char * auto_contributors[] { "HeenaBansal2009", "Hiroaki Nakamura", "Hongbin", + "Hongbin Ma", "Hosun Lee", "HuFuwang", "Hui Wang", @@ -491,6 +494,7 @@ const char * auto_contributors[] { "Josh Taylor", "João Figueiredo", "Julian Gilyadov", + "Julian Maicher", "Julian Zhou", "Julio Jimenez", "Junfu Wu", @@ -917,6 +921,7 @@ const char * auto_contributors[] { "Thom O'Connor", "Thomas Berdy", "Thomas Casteleyn", + "Thomas Panetti", "Tian Xinhui", "Tiaonmmn", "Tigran Khudaverdyan", @@ -1022,6 +1027,7 @@ const char * auto_contributors[] { "Y Lu", "Yakko Majuri", "Yakov Olkhovskiy", + "YalalovSM", "Yangkuan Liu", "Yatian Xu", "Yatsishin Ilya", @@ -1300,6 +1306,7 @@ const char * auto_contributors[] { "kashwy", "keenwolf", "kevin wan", + "kevinyhzou", "kgurjev", "khamadiev", "kigerzhang", @@ -1545,6 +1552,7 @@ const char * auto_contributors[] { "tiger.yan", "tison", "topvisor", + "tpanetti", "turbo jason", "tyrionhuang", "ubuntu", @@ -1673,6 +1681,7 @@ const char * auto_contributors[] { "董海镔", "谢磊", "贾顺名(Jarvis)", + "郭小龙", "陈小玉", "靳阳", "黄朝晖", diff --git a/src/Storages/System/StorageSystemParts.cpp b/src/Storages/System/StorageSystemParts.cpp index 86ecb336b51..7399bd789a7 100644 --- a/src/Storages/System/StorageSystemParts.cpp +++ b/src/Storages/System/StorageSystemParts.cpp @@ -252,17 +252,17 @@ void StorageSystemParts::processNextStorage( if (columns_mask[src_index++]) { auto checksum = helper.hash_of_all_files; - columns[res_index++]->insert(getHexUIntLowercase(checksum.first) + getHexUIntLowercase(checksum.second)); + columns[res_index++]->insert(getHexUIntLowercase(checksum)); } if (columns_mask[src_index++]) { auto checksum = helper.hash_of_uncompressed_files; - columns[res_index++]->insert(getHexUIntLowercase(checksum.first) + getHexUIntLowercase(checksum.second)); + columns[res_index++]->insert(getHexUIntLowercase(checksum)); } if (columns_mask[src_index++]) { auto checksum = helper.uncompressed_hash_of_compressed_files; - columns[res_index++]->insert(getHexUIntLowercase(checksum.first) + getHexUIntLowercase(checksum.second)); + columns[res_index++]->insert(getHexUIntLowercase(checksum)); } } diff --git a/src/Storages/System/StorageSystemPartsColumns.cpp b/src/Storages/System/StorageSystemPartsColumns.cpp index 00b958b015f..67c8d06e432 100644 --- a/src/Storages/System/StorageSystemPartsColumns.cpp +++ b/src/Storages/System/StorageSystemPartsColumns.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -62,6 +63,8 @@ StorageSystemPartsColumns::StorageSystemPartsColumns(const StorageID & table_id_ {"column_data_compressed_bytes", std::make_shared()}, {"column_data_uncompressed_bytes", std::make_shared()}, {"column_marks_bytes", std::make_shared()}, + {"column_modification_time", std::make_shared(std::make_shared())}, + {"serialization_kind", std::make_shared()}, {"subcolumns.names", std::make_shared(std::make_shared())}, {"subcolumns.types", std::make_shared(std::make_shared())}, @@ -235,6 +238,13 @@ void StorageSystemPartsColumns::processNextStorage( columns[res_index++]->insert(column_size.data_uncompressed); if (columns_mask[src_index++]) columns[res_index++]->insert(column_size.marks); + if (columns_mask[src_index++]) + { + if (auto column_modification_time = part->getColumnModificationTime(column.name)) + columns[res_index++]->insert(UInt64(column_modification_time.value())); + else + columns[res_index++]->insertDefault(); + } auto serialization = part->getSerialization(column.name); if (columns_mask[src_index++]) diff --git a/src/Storages/System/StorageSystemProjectionParts.cpp b/src/Storages/System/StorageSystemProjectionParts.cpp index d2c6c3ef287..213865a8d61 100644 --- a/src/Storages/System/StorageSystemProjectionParts.cpp +++ b/src/Storages/System/StorageSystemProjectionParts.cpp @@ -221,17 +221,17 @@ void StorageSystemProjectionParts::processNextStorage( if (columns_mask[src_index++]) { auto checksum = helper.hash_of_all_files; - columns[res_index++]->insert(getHexUIntLowercase(checksum.first) + getHexUIntLowercase(checksum.second)); + columns[res_index++]->insert(getHexUIntLowercase(checksum)); } if (columns_mask[src_index++]) { auto checksum = helper.hash_of_uncompressed_files; - columns[res_index++]->insert(getHexUIntLowercase(checksum.first) + getHexUIntLowercase(checksum.second)); + columns[res_index++]->insert(getHexUIntLowercase(checksum)); } if (columns_mask[src_index++]) { auto checksum = helper.uncompressed_hash_of_compressed_files; - columns[res_index++]->insert(getHexUIntLowercase(checksum.first) + getHexUIntLowercase(checksum.second)); + columns[res_index++]->insert(getHexUIntLowercase(checksum)); } } diff --git a/src/Storages/System/StorageSystemProjectionPartsColumns.cpp b/src/Storages/System/StorageSystemProjectionPartsColumns.cpp index a5968597885..06becc6d91c 100644 --- a/src/Storages/System/StorageSystemProjectionPartsColumns.cpp +++ b/src/Storages/System/StorageSystemProjectionPartsColumns.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -66,7 +67,8 @@ StorageSystemProjectionPartsColumns::StorageSystemProjectionPartsColumns(const S {"column_bytes_on_disk", std::make_shared()}, {"column_data_compressed_bytes", std::make_shared()}, {"column_data_uncompressed_bytes", std::make_shared()}, - {"column_marks_bytes", std::make_shared()} + {"column_marks_bytes", std::make_shared()}, + {"column_modification_time", std::make_shared(std::make_shared())}, } ) { @@ -247,6 +249,13 @@ void StorageSystemProjectionPartsColumns::processNextStorage( columns[res_index++]->insert(column_size.data_uncompressed); if (columns_mask[src_index++]) columns[res_index++]->insert(column_size.marks); + if (columns_mask[src_index++]) + { + if (auto column_modification_time = part->getColumnModificationTime(column.name)) + columns[res_index++]->insert(UInt64(column_modification_time.value())); + else + columns[res_index++]->insertDefault(); + } if (has_state_column) columns[res_index++]->insert(part->stateString()); diff --git a/src/Storages/System/StorageSystemZooKeeper.cpp b/src/Storages/System/StorageSystemZooKeeper.cpp index d361a4173c1..4d7f59b8ccd 100644 --- a/src/Storages/System/StorageSystemZooKeeper.cpp +++ b/src/Storages/System/StorageSystemZooKeeper.cpp @@ -184,7 +184,7 @@ public: void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings & settings) override; - void onAddFilterFinish() override; + void applyFilters() override; private: void fillData(MutableColumns & res_columns); @@ -421,7 +421,7 @@ static Paths extractPath(const ActionsDAG::NodeRawConstPtrs & filter_nodes, Cont } -void ReadFromSystemZooKeeper::onAddFilterFinish() +void ReadFromSystemZooKeeper::applyFilters() { paths = extractPath(getFilterNodes().nodes, context, context->getSettingsRef().allow_unrestricted_reads_from_keeper); } diff --git a/src/Storages/System/StorageSystemZooKeeperConnection.cpp b/src/Storages/System/StorageSystemZooKeeperConnection.cpp index 03886e6ef8b..33268d58358 100644 --- a/src/Storages/System/StorageSystemZooKeeperConnection.cpp +++ b/src/Storages/System/StorageSystemZooKeeperConnection.cpp @@ -1,8 +1,11 @@ #include +#include +#include #include #include #include #include +#include #include namespace DB @@ -10,6 +13,13 @@ namespace DB NamesAndTypesList StorageSystemZooKeeperConnection::getNamesAndTypes() { + DataTypeEnum16::Values feature_flags_enum_values; + feature_flags_enum_values.reserve(magic_enum::enum_count()); + for (const auto & [feature_flag, feature_flag_string] : magic_enum::enum_entries()) + feature_flags_enum_values.push_back(std::pair{std::string{feature_flag_string}, static_cast(feature_flag)}); + + auto feature_flags_enum = std::make_shared(std::move(feature_flags_enum_values)); + return { {"name", std::make_shared()}, {"host", std::make_shared()}, @@ -19,7 +29,8 @@ NamesAndTypesList StorageSystemZooKeeperConnection::getNamesAndTypes() {"session_uptime_elapsed_seconds", std::make_shared()}, {"is_expired", std::make_shared()}, {"keeper_api_version", std::make_shared()}, - {"client_id", std::make_shared()} + {"client_id", std::make_shared()}, + {"enabled_feature_flags", std::make_shared(std::move(feature_flags_enum))} }; } @@ -33,9 +44,28 @@ void StorageSystemZooKeeperConnection::fillData(MutableColumns & res_columns, Co res_columns[4]->insert(context->getZooKeeper()->getConnectedTime()); res_columns[5]->insert(context->getZooKeeperSessionUptime()); res_columns[6]->insert(context->getZooKeeper()->expired()); - res_columns[7]->insert(context->getZooKeeper()->getApiVersion()); + res_columns[7]->insert(0); res_columns[8]->insert(context->getZooKeeper()->getClientID()); + const auto add_enabled_feature_flags = [&](const auto & zookeeper) + { + Array enabled_feature_flags; + const auto * feature_flags = zookeeper->getKeeperFeatureFlags(); + if (feature_flags) + { + for (const auto & feature_flag : magic_enum::enum_values()) + { + if (feature_flags->isEnabled(feature_flag)) + { + enabled_feature_flags.push_back(feature_flag); + } + } + } + res_columns[9]->insert(std::move(enabled_feature_flags)); + }; + + add_enabled_feature_flags(context->getZooKeeper()); + for (const auto & elem : context->getAuxiliaryZooKeepers()) { res_columns[0]->insert(elem.first); @@ -45,8 +75,9 @@ void StorageSystemZooKeeperConnection::fillData(MutableColumns & res_columns, Co res_columns[4]->insert(elem.second->getConnectedTime()); res_columns[5]->insert(elem.second->getSessionUptime()); res_columns[6]->insert(elem.second->expired()); - res_columns[7]->insert(elem.second->getApiVersion()); + res_columns[7]->insert(0); res_columns[8]->insert(elem.second->getClientID()); + add_enabled_feature_flags(elem.second); } } diff --git a/tests/analyzer_tech_debt.txt b/tests/analyzer_tech_debt.txt index 0872033aed0..f7cc13dd2e2 100644 --- a/tests/analyzer_tech_debt.txt +++ b/tests/analyzer_tech_debt.txt @@ -36,6 +36,7 @@ 01455_shard_leaf_max_rows_bytes_to_read 01495_subqueries_in_with_statement 01504_rocksdb +01526_client_start_and_exit 01527_dist_sharding_key_dictGet_reload 01528_allow_nondeterministic_optimize_skip_unused_shards 01540_verbatim_partition_pruning @@ -50,6 +51,7 @@ 01624_soft_constraints 01651_bugs_from_15889 01656_test_query_log_factories_info +01676_clickhouse_client_autocomplete 01681_bloom_filter_nullable_column 01700_system_zookeeper_path_in 01710_projection_additional_filters diff --git a/tests/ci/attach_gdb.lib b/tests/ci/attach_gdb.lib index 2df6243f796..e937cf6dba7 100644 --- a/tests/ci/attach_gdb.lib +++ b/tests/ci/attach_gdb.lib @@ -1,5 +1,7 @@ #!/bin/bash +source /usr/share/clickhouse-test/ci/utils.lib + function attach_gdb_to_clickhouse() { # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog @@ -38,5 +40,5 @@ quit gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log & sleep 5 # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s) - time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||: + run_with_retry 60 clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" } diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index 4437ddc52c6..efe149b0aa4 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -7,7 +7,7 @@ from typing import Dict, List, Literal, Optional, Union import logging from github import Github -from github.GithubObject import _NotSetType, NotSet as NotSet # type: ignore +from github.GithubObject import _NotSetType, NotSet as NotSet from github.Commit import Commit from github.CommitStatus import CommitStatus from github.IssueComment import IssueComment diff --git a/tests/ci/fast_test_check.py b/tests/ci/fast_test_check.py index 2849759a3ee..d5198e5c3d7 100644 --- a/tests/ci/fast_test_check.py +++ b/tests/ci/fast_test_check.py @@ -11,6 +11,7 @@ from typing import List, Tuple from github import Github +from build_check import get_release_or_pr from clickhouse_helper import ( ClickHouseHelper, mark_flaky_tests, @@ -31,6 +32,7 @@ from s3_helper import S3Helper from stopwatch import Stopwatch from tee_popen import TeePopen from upload_result_helper import upload_results +from version_helper import get_version_from_repo NAME = "Fast test" @@ -189,6 +191,17 @@ def main(): ch_helper = ClickHouseHelper() mark_flaky_tests(ch_helper, NAME, test_results) + s3_path_prefix = os.path.join( + get_release_or_pr(pr_info, get_version_from_repo())[0], + pr_info.sha, + "fast_tests", + ) + build_urls = s3_helper.upload_build_folder_to_s3( + os.path.join(output_path, "binaries"), + s3_path_prefix, + keep_dirs_in_s3_path=False, + upload_symlinks=False, + ) report_url = upload_results( s3_helper, @@ -197,6 +210,7 @@ def main(): test_results, [run_log_path] + additional_logs, NAME, + build_urls, ) print(f"::notice ::Report url: {report_url}") post_commit_status(commit, state, report_url, description, NAME, pr_info) diff --git a/tests/ci/github_helper.py b/tests/ci/github_helper.py index 834c8247cb8..f7256e523b1 100644 --- a/tests/ci/github_helper.py +++ b/tests/ci/github_helper.py @@ -111,7 +111,7 @@ class GitHub(github.Github): # See https://github.com/PyGithub/PyGithub/issues/2202, # obj._rawData doesn't spend additional API requests # pylint: disable=protected-access - repo_url = issue._rawData["repository_url"] # type: ignore + repo_url = issue._rawData["repository_url"] if repo_url not in repos: repos[repo_url] = issue.repository prs.append( diff --git a/tests/ci/merge_pr.py b/tests/ci/merge_pr.py index 2d6d81a152a..14844ed9b25 100644 --- a/tests/ci/merge_pr.py +++ b/tests/ci/merge_pr.py @@ -154,7 +154,7 @@ def get_workflows_for_head(repo: Repository, head_sha: str) -> List[WorkflowRun] return list( PaginatedList( WorkflowRun, - repo._requester, # type:ignore # pylint:disable=protected-access + repo._requester, # pylint:disable=protected-access f"{repo.url}/actions/runs", {"head_sha": head_sha}, list_item="workflow_runs", diff --git a/tests/ci/stress.py b/tests/ci/stress.py index 1aa483854fc..6d17384c63f 100755 --- a/tests/ci/stress.py +++ b/tests/ci/stress.py @@ -38,6 +38,9 @@ def get_options(i, upgrade_check): client_options.append("join_algorithm='partial_merge'") if join_alg_num % 5 == 2: client_options.append("join_algorithm='full_sorting_merge'") + if join_alg_num % 5 == 3 and not upgrade_check: + # Some crashes are not fixed in 23.2 yet, so ignore the setting in Upgrade check + client_options.append("join_algorithm='grace_hash'") if join_alg_num % 5 == 4: client_options.append("join_algorithm='auto'") client_options.append("max_rows_in_join=1000") diff --git a/tests/ci/stress_tests.lib b/tests/ci/stress_tests.lib index 2b8ac77b952..190f3f39f9e 100644 --- a/tests/ci/stress_tests.lib +++ b/tests/ci/stress_tests.lib @@ -243,7 +243,7 @@ function check_logs_for_critical_errors() # Remove file fatal_messages.txt if it's empty [ -s /test_output/fatal_messages.txt ] || rm /test_output/fatal_messages.txt - rg -Fa "########################################" /test_output/* > /dev/null \ + rg -Faz "########################################" /test_output/* > /dev/null \ && echo -e "Killed by signal (output files)$FAIL" >> /test_output/test_results.tsv function get_gdb_log_context() diff --git a/tests/ci/upload_result_helper.py b/tests/ci/upload_result_helper.py index 150af7aff4a..fbb89ef8078 100644 --- a/tests/ci/upload_result_helper.py +++ b/tests/ci/upload_result_helper.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Dict, List +from typing import Dict, List, Optional import os import logging @@ -58,14 +58,19 @@ def upload_results( test_results: TestResults, additional_files: List[str], check_name: str, + additional_urls: Optional[List[str]] = None, ) -> str: normalized_check_name = check_name.lower() for r in ((" ", "_"), ("(", "_"), (")", "_"), (",", "_"), ("/", "_")): normalized_check_name = normalized_check_name.replace(*r) + + # Preserve additional_urls to not modify the original one + original_additional_urls = additional_urls or [] s3_path_prefix = f"{pr_number}/{commit_sha}/{normalized_check_name}" additional_urls = process_logs( s3_client, additional_files, s3_path_prefix, test_results ) + additional_urls.extend(original_additional_urls) branch_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/commits/master" branch_name = "master" diff --git a/tests/ci/utils.lib b/tests/ci/utils.lib new file mode 100644 index 00000000000..b5ce4ae0d78 --- /dev/null +++ b/tests/ci/utils.lib @@ -0,0 +1,29 @@ +#!/bin/bash + +function run_with_retry() +{ + set +e + + local total_retries="$1" + shift + + local retry=0 + + until [ "$retry" -ge "$total_retries" ] + do + if "$@"; then + set -e + return + else + retry=$((retry + 1)) + sleep 5 + fi + done + + echo "Command '$*' failed after $total_retries retries, exiting" + exit 1 +} + +function fn_exists() { + declare -F "$1" > /dev/null; +} \ No newline at end of file diff --git a/tests/config/config.d/database_replicated.xml b/tests/config/config.d/database_replicated.xml index 9a405f85908..2504a7ca526 100644 --- a/tests/config/config.d/database_replicated.xml +++ b/tests/config/config.d/database_replicated.xml @@ -40,7 +40,7 @@ 10000 30000 1000 - 4000 + 2000 5000 information false diff --git a/tests/config/config.d/keeper_port.xml b/tests/config/config.d/keeper_port.xml index cffd325e968..7db174c5419 100644 --- a/tests/config/config.d/keeper_port.xml +++ b/tests/config/config.d/keeper_port.xml @@ -28,5 +28,9 @@ 9234 + + + 1 + diff --git a/tests/config/config.d/merge_tree_old_dirs_cleanup.xml b/tests/config/config.d/merge_tree_old_dirs_cleanup.xml index 2b8ea63b63d..e6b50724c97 100644 --- a/tests/config/config.d/merge_tree_old_dirs_cleanup.xml +++ b/tests/config/config.d/merge_tree_old_dirs_cleanup.xml @@ -5,4 +5,5 @@ 5 + true diff --git a/tests/config/config.d/named_collection.xml b/tests/config/config.d/named_collection.xml index 2e49c0c596f..5b716a7b8da 100644 --- a/tests/config/config.d/named_collection.xml +++ b/tests/config/config.d/named_collection.xml @@ -32,5 +32,10 @@ testtest auto + + http://localhost:11111/test/ + test + testtest + diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index c77e67062a1..21398790be3 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -624,10 +624,12 @@ class ClickHouseCluster: # if you change packages, don't forget to update them in docker/test/integration/runner/dockerd-entrypoint.sh ( pyspark.sql.SparkSession.builder.appName("spark_test") - .config( - "spark.jars.packages", - "org.apache.hudi:hudi-spark3.3-bundle_2.12:0.13.0,io.delta:delta-core_2.12:2.2.0,org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.1.0", - ) + # The jars are now linked to "$SPARK_HOME/jars" and we don't + # need packages to be downloaded once and once again + # .config( + # "spark.jars.packages", + # "org.apache.hudi:hudi-spark3.3-bundle_2.12:0.13.0,io.delta:delta-core_2.12:2.2.0,org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.1.0", + # ) .master("local") .getOrCreate() .stop() @@ -3054,7 +3056,6 @@ CLICKHOUSE_STAY_ALIVE_COMMAND = "bash -c \"trap 'pkill tail' INT TERM; {} --daem CLICKHOUSE_START_COMMAND ) -# /run/xtables.lock passed inside for correct iptables --wait DOCKER_COMPOSE_TEMPLATE = """ version: '2.3' services: @@ -3066,7 +3067,6 @@ services: - {db_dir}:/var/lib/clickhouse/ - {logs_dir}:/var/log/clickhouse-server/ - /etc/passwd:/etc/passwd:ro - - /run/xtables.lock:/run/xtables.lock:ro {binary_volume} {odbc_bridge_volume} {library_bridge_volume} diff --git a/tests/integration/helpers/network.py b/tests/integration/helpers/network.py index 471aa2bdc2e..2df560708e0 100644 --- a/tests/integration/helpers/network.py +++ b/tests/integration/helpers/network.py @@ -231,6 +231,9 @@ class _NetworkManager: def _ensure_container(self): if self._container is None or self._container_expire_time <= time.time(): + image_name = "clickhouse/integration-helper:" + os.getenv( + "DOCKER_HELPER_TAG", "latest" + ) for i in range(5): if self._container is not None: try: @@ -247,7 +250,7 @@ class _NetworkManager: time.sleep(i) image = subprocess.check_output( - "docker images -q clickhouse/integration-helper 2>/dev/null", shell=True + f"docker images -q {image_name} 2>/dev/null", shell=True ) if not image.strip(): print("No network image helper, will try download") @@ -256,22 +259,18 @@ class _NetworkManager: for i in range(5): try: subprocess.check_call( # STYLE_CHECK_ALLOW_SUBPROCESS_CHECK_CALL - "docker pull clickhouse/integration-helper", shell=True + f"docker pull {image_name}", shell=True ) break except: time.sleep(i) else: - raise Exception("Cannot pull clickhouse/integration-helper image") + raise Exception(f"Cannot pull {image_name} image") self._container = self._docker_client.containers.run( - "clickhouse/integration-helper", + image_name, auto_remove=True, command=("sleep %s" % self.container_exit_timeout), - # /run/xtables.lock passed inside for correct iptables --wait - volumes={ - "/run/xtables.lock": {"bind": "/run/xtables.lock", "mode": "ro"} - }, detach=True, network_mode="host", ) diff --git a/tests/integration/pytest.ini b/tests/integration/pytest.ini index 772c96f7361..e40959bd37b 100644 --- a/tests/integration/pytest.ini +++ b/tests/integration/pytest.ini @@ -19,3 +19,6 @@ markers = long_run: marks tests which run for a long time addopts = -m 'not long_run' +; 'The asyncore module is deprecated' comes from casandra driver +filterwarnings = + ignore:The asyncore module is deprecated:DeprecationWarning diff --git a/tests/integration/runner b/tests/integration/runner index f658bac412b..df52f587eee 100755 --- a/tests/integration/runner +++ b/tests/integration/runner @@ -336,6 +336,8 @@ if __name__ == "__main__": env_tags += "-e {}={} ".format("DOCKER_MYSQL_PHP_CLIENT_TAG", tag) elif image == "clickhouse/postgresql-java-client": env_tags += "-e {}={} ".format("DOCKER_POSTGRESQL_JAVA_CLIENT_TAG", tag) + elif image == "clickhouse/integration-helper": + env_tags += "-e {}={} ".format("DOCKER_HELPER_TAG", tag) elif image == "clickhouse/integration-test": env_tags += "-e {}={} ".format("DOCKER_BASE_TAG", tag) elif image == "clickhouse/kerberized-hadoop": @@ -393,39 +395,27 @@ if __name__ == "__main__": if args.keyword_expression: args.pytest_args += ["-k", args.keyword_expression] - cmd_base = "docker run {net} {tty} --rm --name {name} --privileged \ - --volume={odbc_bridge_bin}:/clickhouse-odbc-bridge --volume={bin}:/clickhouse \ - --volume={library_bridge_bin}:/clickhouse-library-bridge \ - --volume={base_cfg}:/clickhouse-config --volume={cases_dir}:/ClickHouse/tests/integration \ - --volume={src_dir}/Server/grpc_protos:/ClickHouse/src/Server/grpc_protos \ - --volume=/run:/run/host:ro \ - {dockerd_internal_volume} -e DOCKER_CLIENT_TIMEOUT=300 -e COMPOSE_HTTP_TIMEOUT=600 \ - -e XTABLES_LOCKFILE=/run/host/xtables.lock \ - -e PYTHONUNBUFFERED=1 \ - {env_tags} {env_cleanup} -e PYTEST_OPTS='{parallel} {opts} {tests_list} {rand} -vvv' {img}".format( - net=net, - tty=tty, - bin=args.binary, - odbc_bridge_bin=args.odbc_bridge_binary, - library_bridge_bin=args.library_bridge_binary, - base_cfg=args.base_configs_dir, - cases_dir=args.cases_dir, - src_dir=args.src_dir, - env_tags=env_tags, - env_cleanup=env_cleanup, - parallel=parallel_args, - rand=rand_args, - opts=" ".join(args.pytest_args).replace("'", "\\'"), - tests_list=" ".join(args.tests_list), - dockerd_internal_volume=dockerd_internal_volume, - img=DIND_INTEGRATION_TESTS_IMAGE_NAME + ":" + args.docker_image_version, - name=CONTAINER_NAME, + pytest_opts = " ".join(args.pytest_args).replace("'", "\\'") + tests_list = " ".join(args.tests_list) + cmd_base = ( + f"docker run {net} {tty} --rm --name {CONTAINER_NAME} " + "--privileged --dns-search='.' " # since recent dns search leaks from host + f"--volume={args.odbc_bridge_binary}:/clickhouse-odbc-bridge " + f"--volume={args.binary}:/clickhouse " + f"--volume={args.library_bridge_binary}:/clickhouse-library-bridge " + f"--volume={args.base_configs_dir}:/clickhouse-config " + f"--volume={args.cases_dir}:/ClickHouse/tests/integration " + f"--volume={args.src_dir}/Server/grpc_protos:/ClickHouse/src/Server/grpc_protos " + f"--volume=/run:/run/host:ro {dockerd_internal_volume} {env_tags} {env_cleanup} " + "-e DOCKER_CLIENT_TIMEOUT=300 -e COMPOSE_HTTP_TIMEOUT=600 -e PYTHONUNBUFFERED=1 " + f"-e PYTEST_OPTS='{parallel_args} {pytest_opts} {tests_list} {rand_args} -vvv'" + f" {DIND_INTEGRATION_TESTS_IMAGE_NAME}:{args.docker_image_version}" ) cmd = cmd_base + " " + args.command cmd_pre_pull = ( - cmd_base - + " find /compose -name docker_compose_*.yml -exec docker-compose -f '{}' pull \;" + f"{cmd_base} find /compose -name docker_compose_*.yml " + r"-exec docker-compose -f '{}' pull \;" ) containers = subprocess.check_output( diff --git a/tests/integration/test_cgroup_limit/test.py b/tests/integration/test_cgroup_limit/test.py index f6392eca4d7..e77b0f70960 100644 --- a/tests/integration/test_cgroup_limit/test.py +++ b/tests/integration/test_cgroup_limit/test.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import logging import os import math import subprocess @@ -16,18 +17,19 @@ def run_command_in_container(cmd, *args): f"{alternative_binary}:/usr/bin/clickhouse", ) - return subprocess.check_output( - [ - "docker", - "run", - "--rm", - *args, - "ubuntu:20.04", - "sh", - "-c", - cmd, - ] - ) + command = [ + "docker", + "run", + "--rm", + *args, + "ubuntu:22.04", + "sh", + "-c", + cmd, + ] + + logging.debug("Command: %s", " ".join(command)) + return subprocess.check_output(command) def run_with_cpu_limit(cmd, num_cpus, *args): diff --git a/tests/integration/test_disk_over_web_server/test.py b/tests/integration/test_disk_over_web_server/test.py index 719de5e8bef..d62adfb3343 100644 --- a/tests/integration/test_disk_over_web_server/test.py +++ b/tests/integration/test_disk_over_web_server/test.py @@ -45,6 +45,8 @@ def cluster(): f"CREATE TABLE data{i} (id Int32) ENGINE = MergeTree() ORDER BY id SETTINGS storage_policy = 'def', min_bytes_for_wide_part=1;" ) + node.query("SYSTEM STOP MERGES") + for _ in range(10): node.query( f"INSERT INTO data{i} SELECT number FROM numbers(500000 * {i+1})" diff --git a/tests/integration/test_hedged_requests/test.py b/tests/integration/test_hedged_requests/test.py index be6cea80f87..18ea3e50619 100644 --- a/tests/integration/test_hedged_requests/test.py +++ b/tests/integration/test_hedged_requests/test.py @@ -203,6 +203,9 @@ def update_configs( def test_stuck_replica(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + update_configs() cluster.pause_container("node_1") @@ -233,6 +236,9 @@ def test_stuck_replica(started_cluster): def test_long_query(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + update_configs() # Restart to reset pool states. @@ -249,12 +255,18 @@ def test_long_query(started_cluster): def test_send_table_status_sleep(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + update_configs(node_1_sleep_in_send_tables_status=sleep_time) check_query(expected_replica="node_2") check_changing_replica_events(1) def test_send_table_status_sleep2(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + update_configs( node_1_sleep_in_send_tables_status=sleep_time, node_2_sleep_in_send_tables_status=sleep_time, @@ -264,12 +276,18 @@ def test_send_table_status_sleep2(started_cluster): def test_send_data(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + update_configs(node_1_sleep_in_send_data=sleep_time) check_query(expected_replica="node_2") check_changing_replica_events(1) def test_send_data2(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + update_configs( node_1_sleep_in_send_data=sleep_time, node_2_sleep_in_send_data=sleep_time ) @@ -278,6 +296,9 @@ def test_send_data2(started_cluster): def test_combination1(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + update_configs( node_1_sleep_in_send_tables_status=sleep_time, node_2_sleep_in_send_data=sleep_time, @@ -287,6 +308,9 @@ def test_combination1(started_cluster): def test_combination2(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + update_configs( node_1_sleep_in_send_data=sleep_time, node_2_sleep_in_send_tables_status=sleep_time, @@ -296,6 +320,9 @@ def test_combination2(started_cluster): def test_combination3(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + update_configs( node_1_sleep_in_send_data=sleep_time, node_2_sleep_in_send_tables_status=1000, @@ -306,6 +333,9 @@ def test_combination3(started_cluster): def test_combination4(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + update_configs( node_1_sleep_in_send_tables_status=1000, node_1_sleep_in_send_data=sleep_time, @@ -317,6 +347,9 @@ def test_combination4(started_cluster): def test_receive_timeout1(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + # Check the situation when first two replicas get receive timeout # in establishing connection, but the third replica is ok. update_configs( @@ -329,6 +362,9 @@ def test_receive_timeout1(started_cluster): def test_receive_timeout2(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + # Check the situation when first replica get receive timeout # in packet receiving but there are replicas in process of # connection establishing. @@ -342,6 +378,9 @@ def test_receive_timeout2(started_cluster): def test_initial_receive_timeout(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + # Check the situation when replicas don't respond after # receiving query (so, no packets were send to initiator) update_configs( @@ -360,6 +399,9 @@ def test_initial_receive_timeout(started_cluster): def test_async_connect(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + update_configs() NODES["node"].restart_clickhouse() @@ -390,6 +432,9 @@ def test_async_connect(started_cluster): def test_async_query_sending(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + update_configs( node_1_sleep_after_receiving_query=5000, node_2_sleep_after_receiving_query=5000, diff --git a/tests/integration/test_hedged_requests_parallel/test.py b/tests/integration/test_hedged_requests_parallel/test.py index 492b869614f..728697c690d 100644 --- a/tests/integration/test_hedged_requests_parallel/test.py +++ b/tests/integration/test_hedged_requests_parallel/test.py @@ -172,6 +172,9 @@ def update_configs( def test_send_table_status_sleep(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + update_configs( node_1_sleep_in_send_tables_status=sleep_time, node_2_sleep_in_send_tables_status=sleep_time, @@ -181,6 +184,9 @@ def test_send_table_status_sleep(started_cluster): def test_send_data(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + update_configs( node_1_sleep_in_send_data=sleep_time, node_2_sleep_in_send_data=sleep_time ) @@ -189,6 +195,9 @@ def test_send_data(started_cluster): def test_combination1(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + update_configs( node_1_sleep_in_send_tables_status=1000, node_2_sleep_in_send_tables_status=1000, @@ -199,6 +208,9 @@ def test_combination1(started_cluster): def test_combination2(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + update_configs( node_1_sleep_in_send_data=sleep_time, node_2_sleep_in_send_tables_status=1000, @@ -210,6 +222,9 @@ def test_combination2(started_cluster): def test_query_with_no_data_to_sample(started_cluster): + if NODES["node"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + update_configs( node_1_sleep_in_send_data=sleep_time, node_2_sleep_in_send_data=sleep_time ) diff --git a/tests/integration/test_host_regexp_multiple_ptr_records/coredns_config/Corefile b/tests/integration/test_host_regexp_multiple_ptr_records/coredns_config/Corefile index 0dd198441dc..3edf37dafa5 100644 --- a/tests/integration/test_host_regexp_multiple_ptr_records/coredns_config/Corefile +++ b/tests/integration/test_host_regexp_multiple_ptr_records/coredns_config/Corefile @@ -1,6 +1,6 @@ . { hosts /example.com { - reload "200ms" + reload "20ms" fallthrough } forward . 127.0.0.11 diff --git a/tests/integration/test_host_regexp_multiple_ptr_records_concurrent/coredns_config/Corefile b/tests/integration/test_host_regexp_multiple_ptr_records_concurrent/coredns_config/Corefile index 0dd198441dc..3edf37dafa5 100644 --- a/tests/integration/test_host_regexp_multiple_ptr_records_concurrent/coredns_config/Corefile +++ b/tests/integration/test_host_regexp_multiple_ptr_records_concurrent/coredns_config/Corefile @@ -1,6 +1,6 @@ . { hosts /example.com { - reload "200ms" + reload "20ms" fallthrough } forward . 127.0.0.11 diff --git a/tests/integration/test_jemalloc_percpu_arena/test.py b/tests/integration/test_jemalloc_percpu_arena/test.py index 0dccde3776e..8de3dcd7ea2 100755 --- a/tests/integration/test_jemalloc_percpu_arena/test.py +++ b/tests/integration/test_jemalloc_percpu_arena/test.py @@ -26,7 +26,7 @@ def run_command_in_container(cmd, *args): "run", "--rm", *args, - "ubuntu:20.04", + "ubuntu:22.04", "sh", "-c", cmd, diff --git a/tests/integration/test_keeper_disks/__init__.py b/tests/integration/test_keeper_disks/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_keeper_disks/configs/enable_keeper.xml b/tests/integration/test_keeper_disks/configs/enable_keeper.xml new file mode 100644 index 00000000000..5814979229c --- /dev/null +++ b/tests/integration/test_keeper_disks/configs/enable_keeper.xml @@ -0,0 +1,53 @@ + + + + + local + /var/lib/clickhouse/coordination/logs/ + + + s3_plain + http://minio1:9001/root/logs/ + minio + minio123 + + + local + /var/lib/clickhouse/coordination/snapshots/ + + + s3_plain + http://minio1:9001/root/snapshots/ + minio + minio123 + + + + + + false + 9181 + 1 + false + + + 5000 + 10000 + trace + 10 + 10 + 1 + 3 + + + + + + + 1 + node + 9234 + + + + \ No newline at end of file diff --git a/tests/integration/test_keeper_disks/test.py b/tests/integration/test_keeper_disks/test.py new file mode 100644 index 00000000000..11bb215be54 --- /dev/null +++ b/tests/integration/test_keeper_disks/test.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 +import pytest +from helpers.cluster import ClickHouseCluster +import helpers.keeper_utils as keeper_utils +from minio.deleteobjects import DeleteObject + +import os + +CURRENT_TEST_DIR = os.path.dirname(os.path.abspath(__file__)) +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance( + "node", main_configs=["configs/enable_keeper.xml"], stay_alive=True, with_minio=True +) + +from kazoo.client import KazooClient, KazooState + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + yield cluster + + finally: + cluster.shutdown() + + +def get_fake_zk(nodename, timeout=30.0): + _fake_zk_instance = KazooClient( + hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout + ) + _fake_zk_instance.start() + return _fake_zk_instance + + +def stop_zk(zk): + try: + if zk: + zk.stop() + zk.close() + except: + pass + + +def stop_clickhouse(cluster, cleanup_disks): + node.stop_clickhouse() + + if not cleanup_disks: + return + + node.exec_in_container(["rm", "-rf", "/var/lib/clickhouse/coordination/logs"]) + node.exec_in_container(["rm", "-rf", "/var/lib/clickhouse/coordination/snapshots"]) + + s3_objects = list_s3_objects(cluster, prefix="") + if len(s3_objects) == 0: + return + + assert ( + len( + list( + cluster.minio_client.remove_objects( + cluster.minio_bucket, + [DeleteObject(obj) for obj in s3_objects], + ) + ) + ) + == 0 + ) + + +def setup_storage(cluster, storage_config, cleanup_disks): + stop_clickhouse(cluster, cleanup_disks) + node.copy_file_to_container( + os.path.join(CURRENT_TEST_DIR, "configs/enable_keeper.xml"), + "/etc/clickhouse-server/config.d/enable_keeper.xml", + ) + node.replace_in_config( + "/etc/clickhouse-server/config.d/enable_keeper.xml", + "", + storage_config, + ) + node.start_clickhouse() + keeper_utils.wait_until_connected(cluster, node) + + +def setup_local_storage(cluster): + setup_storage( + cluster, + "log_local<\\/log_storage_disk>" + "snapshot_local<\\/snapshot_storage_disk>", + cleanup_disks=True, + ) + + +def list_s3_objects(cluster, prefix=""): + minio = cluster.minio_client + prefix_len = len(prefix) + return [ + obj.object_name[prefix_len:] + for obj in minio.list_objects( + cluster.minio_bucket, prefix=prefix, recursive=True + ) + ] + + +def get_local_files(path): + files = node.exec_in_container(["ls", path]).strip().split("\n") + files.sort() + return files + + +def get_local_logs(): + return get_local_files("/var/lib/clickhouse/coordination/logs") + + +def get_local_snapshots(): + return get_local_files("/var/lib/clickhouse/coordination/snapshots") + + +def test_logs_with_disks(started_cluster): + setup_local_storage(started_cluster) + + node_zk = get_fake_zk("node") + try: + node_zk.create("/test") + for _ in range(30): + node_zk.create("/test/somenode", b"somedata", sequence=True) + + stop_zk(node_zk) + + previous_log_files = get_local_logs() + + setup_storage( + started_cluster, + "log_s3_plain<\\/log_storage_disk>" + "log_local<\\/latest_log_storage_disk>" + "snapshot_local<\\/snapshot_storage_disk>", + cleanup_disks=False, + ) + + # all but the latest log should be on S3 + s3_log_files = list_s3_objects(started_cluster, "logs/") + assert set(s3_log_files) == set(previous_log_files[:-1]) + local_log_files = get_local_logs() + assert len(local_log_files) == 1 + assert local_log_files[0] == previous_log_files[-1] + + previous_log_files = s3_log_files + local_log_files + + node_zk = get_fake_zk("node") + + for _ in range(30): + node_zk.create("/test/somenode", b"somedata", sequence=True) + + stop_zk(node_zk) + + log_files = list_s3_objects(started_cluster, "logs/") + local_log_files = get_local_logs() + assert len(local_log_files) == 1 + + log_files.extend(local_log_files) + assert set(log_files) != previous_log_files + + previous_log_files = log_files + + setup_storage( + started_cluster, + "log_s3_plain<\\/old_log_storage_disk>" + "log_local<\\/log_storage_disk>" + "snapshot_local<\\/snapshot_storage_disk>", + cleanup_disks=False, + ) + + local_log_files = get_local_logs() + assert set(local_log_files) == set(previous_log_files) + + node_zk = get_fake_zk("node") + + for child in node_zk.get_children("/test"): + assert node_zk.get(f"/test/{child}")[0] == b"somedata" + + finally: + stop_zk(node_zk) + + +def test_snapshots_with_disks(started_cluster): + setup_local_storage(started_cluster) + + node_zk = get_fake_zk("node") + try: + node_zk.create("/test2") + for _ in range(30): + node_zk.create("/test2/somenode", b"somedata", sequence=True) + + stop_zk(node_zk) + + snapshot_idx = keeper_utils.send_4lw_cmd(cluster, node, "csnp") + node.wait_for_log_line( + f"Created persistent snapshot {snapshot_idx}", look_behind_lines=1000 + ) + + previous_snapshot_files = get_local_snapshots() + + setup_storage( + started_cluster, + "snapshot_s3_plain<\\/snapshot_storage_disk>" + "snapshot_local<\\/latest_snapshot_storage_disk>" + "log_local<\\/log_storage_disk>", + cleanup_disks=False, + ) + + ## all but the latest log should be on S3 + s3_snapshot_files = list_s3_objects(started_cluster, "snapshots/") + assert set(s3_snapshot_files) == set(previous_snapshot_files[:-1]) + local_snapshot_files = get_local_snapshots() + assert len(local_snapshot_files) == 1 + assert local_snapshot_files[0] == previous_snapshot_files[-1] + + previous_snapshot_files = s3_snapshot_files + local_snapshot_files + + node_zk = get_fake_zk("node") + + for _ in range(30): + node_zk.create("/test2/somenode", b"somedata", sequence=True) + + stop_zk(node_zk) + + snapshot_idx = keeper_utils.send_4lw_cmd(cluster, node, "csnp") + node.wait_for_log_line( + f"Created persistent snapshot {snapshot_idx}", look_behind_lines=1000 + ) + + snapshot_files = list_s3_objects(started_cluster, "snapshots/") + local_snapshot_files = get_local_snapshots() + assert len(local_snapshot_files) == 1 + + snapshot_files.extend(local_snapshot_files) + + previous_snapshot_files = snapshot_files + + setup_storage( + started_cluster, + "snapshot_s3_plain<\\/old_snapshot_storage_disk>" + "snapshot_local<\\/snapshot_storage_disk>" + "log_local<\\/log_storage_disk>", + cleanup_disks=False, + ) + + local_snapshot_files = get_local_snapshots() + assert set(local_snapshot_files) == set(previous_snapshot_files) + + node_zk = get_fake_zk("node") + + for child in node_zk.get_children("/test2"): + assert node_zk.get(f"/test2/{child}")[0] == b"somedata" + + finally: + stop_zk(node_zk) diff --git a/tests/integration/test_keeper_feature_flags_config/__init__.py b/tests/integration/test_keeper_feature_flags_config/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_keeper_feature_flags_config/configs/enable_keeper.xml b/tests/integration/test_keeper_feature_flags_config/configs/enable_keeper.xml new file mode 100644 index 00000000000..53a169c4c3c --- /dev/null +++ b/tests/integration/test_keeper_feature_flags_config/configs/enable_keeper.xml @@ -0,0 +1,31 @@ + + + 9181 + 1 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 10 + 5 + 5000 + 10000 + trace + + + 0 + 0 + 0 + + + + + + + 1 + localhost + 9234 + + + + diff --git a/tests/integration/test_keeper_feature_flags_config/test.py b/tests/integration/test_keeper_feature_flags_config/test.py new file mode 100644 index 00000000000..93ac6cbd3bd --- /dev/null +++ b/tests/integration/test_keeper_feature_flags_config/test.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 + +import pytest +import os +from helpers.cluster import ClickHouseCluster +import helpers.keeper_utils as keeper_utils +from kazoo.client import KazooClient, KazooState + +CURRENT_TEST_DIR = os.path.dirname(os.path.abspath(__file__)) +cluster = ClickHouseCluster(__file__) + +# clickhouse itself will use external zookeeper +node = cluster.add_instance( + "node", + main_configs=["configs/enable_keeper.xml"], + stay_alive=True, +) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + + finally: + cluster.shutdown() + + +def get_connection_zk(nodename, timeout=30.0): + _fake_zk_instance = KazooClient( + hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout + ) + _fake_zk_instance.start() + return _fake_zk_instance + + +def restart_clickhouse(feature_flags=[], expect_fail=True): + node.stop_clickhouse() + node.copy_file_to_container( + os.path.join(CURRENT_TEST_DIR, "configs/enable_keeper.xml"), + "/etc/clickhouse-server/config.d/enable_keeper.xml", + ) + + if len(feature_flags) > 0: + feature_flags_config = "" + + for feature, is_enabled in feature_flags: + feature_flags_config += f"<{feature}>{is_enabled}<\\/{feature}>" + + feature_flags_config += "<\\/feature_flags>" + + node.replace_in_config( + "/etc/clickhouse-server/config.d/enable_keeper.xml", + "", + feature_flags_config, + ) + + node.start_clickhouse(retry_start=not expect_fail) + keeper_utils.wait_until_connected(cluster, node) + + +def test_keeper_feature_flags(started_cluster): + restart_clickhouse() + + def assert_feature_flags(feature_flags): + res = keeper_utils.send_4lw_cmd(started_cluster, node, "ftfl") + + for feature, is_enabled in feature_flags: + node.wait_for_log_line( + f"ZooKeeperClient: Keeper feature flag {feature.upper()}: {'enabled' if is_enabled else 'disabled'}", + look_behind_lines=1000, + ) + + node.wait_for_log_line( + f"KeeperContext: Keeper feature flag {feature.upper()}: {'enabled' if is_enabled else 'disabled'}", + look_behind_lines=1000, + ) + + assert f"{feature}\t{1 if is_enabled else 0}" in res + + assert_feature_flags( + [("filtered_list", 1), ("multi_read", 1), ("check_not_exists", 0)] + ) + + feature_flags = [("multi_read", 0), ("check_not_exists", 1)] + restart_clickhouse(feature_flags) + assert_feature_flags(feature_flags + [("filtered_list", 1)]) + + feature_flags = [("multi_read", 0), ("check_not_exists", 0), ("filtered_list", 0)] + restart_clickhouse(feature_flags) + assert_feature_flags(feature_flags) + + with pytest.raises(Exception): + restart_clickhouse([("invalid_feature", 1)], expect_fail=True) diff --git a/tests/integration/test_keeper_four_word_command/test.py b/tests/integration/test_keeper_four_word_command/test.py index 2098daea5fe..b546d879c75 100644 --- a/tests/integration/test_keeper_four_word_command/test.py +++ b/tests/integration/test_keeper_four_word_command/test.py @@ -183,8 +183,8 @@ def test_cmd_mntr(started_cluster): # contains: # 10 nodes created by test # 3 nodes created by clickhouse "/clickhouse/task_queue/ddl" - # 1 root node, 2 keeper system nodes - assert int(result["zk_znode_count"]) == 13 + # 1 root node, 3 keeper system nodes + assert int(result["zk_znode_count"]) == 14 assert int(result["zk_watch_count"]) == 2 assert int(result["zk_ephemerals_count"]) == 2 assert int(result["zk_approximate_data_size"]) > 0 @@ -252,10 +252,12 @@ def test_cmd_conf(started_cluster): assert result["four_letter_word_allow_list"] == "*" assert result["log_storage_path"] == "/var/lib/clickhouse/coordination/log" + assert result["log_storage_disk"] == "LocalLogDisk" assert ( result["snapshot_storage_path"] == "/var/lib/clickhouse/coordination/snapshots" ) + assert result["snapshot_storage_disk"] == "LocalSnapshotDisk" assert result["session_timeout_ms"] == "30000" assert result["min_session_timeout_ms"] == "10000" @@ -329,7 +331,7 @@ def test_cmd_srvr(started_cluster): assert int(result["Connections"]) == 1 assert int(result["Zxid"]) > 10 assert result["Mode"] == "leader" - assert result["Node count"] == "13" + assert result["Node count"] == "14" finally: destroy_zk_client(zk) @@ -369,7 +371,7 @@ def test_cmd_stat(started_cluster): assert int(result["Connections"]) == 1 assert int(result["Zxid"]) >= 10 assert result["Mode"] == "leader" - assert result["Node count"] == "13" + assert result["Node count"] == "14" # filter connection statistics cons = [n for n in data.split("\n") if "=" in n] diff --git a/tests/integration/test_keeper_restore_from_snapshot/configs/disk_s3_storage.xml b/tests/integration/test_keeper_restore_from_snapshot/configs/disk_s3_storage.xml new file mode 100644 index 00000000000..2ec4bcd77fb --- /dev/null +++ b/tests/integration/test_keeper_restore_from_snapshot/configs/disk_s3_storage.xml @@ -0,0 +1,26 @@ + + + + + s3_plain + http://minio1:9001/root/data/snapshots1/ + minio + minio123 + + + s3_plain + http://minio1:9001/root/data/snapshots2/ + minio + minio123 + + + s3_plain + http://minio1:9001/root/data/snapshots3/ + minio + minio123 + + + + + + diff --git a/tests/integration/test_keeper_restore_from_snapshot/configs/enable_keeper1.xml b/tests/integration/test_keeper_restore_from_snapshot/configs/enable_keeper1.xml index 4ea543e6f31..510913b5dc4 100644 --- a/tests/integration/test_keeper_restore_from_snapshot/configs/enable_keeper1.xml +++ b/tests/integration/test_keeper_restore_from_snapshot/configs/enable_keeper1.xml @@ -2,8 +2,6 @@ 9181 1 - /var/lib/clickhouse/coordination/log - /var/lib/clickhouse/coordination/snapshots 5000 diff --git a/tests/integration/test_keeper_restore_from_snapshot/configs/enable_keeper2.xml b/tests/integration/test_keeper_restore_from_snapshot/configs/enable_keeper2.xml index 4bf3083c1fa..0d50573df01 100644 --- a/tests/integration/test_keeper_restore_from_snapshot/configs/enable_keeper2.xml +++ b/tests/integration/test_keeper_restore_from_snapshot/configs/enable_keeper2.xml @@ -2,8 +2,6 @@ 9181 2 - /var/lib/clickhouse/coordination/log - /var/lib/clickhouse/coordination/snapshots 5000 diff --git a/tests/integration/test_keeper_restore_from_snapshot/configs/enable_keeper3.xml b/tests/integration/test_keeper_restore_from_snapshot/configs/enable_keeper3.xml index b9e2a2d0422..8fa322cb9e7 100644 --- a/tests/integration/test_keeper_restore_from_snapshot/configs/enable_keeper3.xml +++ b/tests/integration/test_keeper_restore_from_snapshot/configs/enable_keeper3.xml @@ -2,8 +2,6 @@ 9181 3 - /var/lib/clickhouse/coordination/log - /var/lib/clickhouse/coordination/snapshots 5000 diff --git a/tests/integration/test_keeper_restore_from_snapshot/configs/keeper1_snapshot_disk.xml b/tests/integration/test_keeper_restore_from_snapshot/configs/keeper1_snapshot_disk.xml new file mode 100644 index 00000000000..01759e2771a --- /dev/null +++ b/tests/integration/test_keeper_restore_from_snapshot/configs/keeper1_snapshot_disk.xml @@ -0,0 +1,5 @@ + + + snapshot_s3_plain1 + + \ No newline at end of file diff --git a/tests/integration/test_keeper_restore_from_snapshot/configs/keeper2_snapshot_disk.xml b/tests/integration/test_keeper_restore_from_snapshot/configs/keeper2_snapshot_disk.xml new file mode 100644 index 00000000000..6c33c5fb300 --- /dev/null +++ b/tests/integration/test_keeper_restore_from_snapshot/configs/keeper2_snapshot_disk.xml @@ -0,0 +1,5 @@ + + + snapshot_s3_plain2 + + \ No newline at end of file diff --git a/tests/integration/test_keeper_restore_from_snapshot/configs/keeper3_snapshot_disk.xml b/tests/integration/test_keeper_restore_from_snapshot/configs/keeper3_snapshot_disk.xml new file mode 100644 index 00000000000..5016ccd581b --- /dev/null +++ b/tests/integration/test_keeper_restore_from_snapshot/configs/keeper3_snapshot_disk.xml @@ -0,0 +1,5 @@ + + + snapshot_s3_plain3 + + \ No newline at end of file diff --git a/tests/integration/test_keeper_restore_from_snapshot/configs/local_storage_path.xml b/tests/integration/test_keeper_restore_from_snapshot/configs/local_storage_path.xml new file mode 100644 index 00000000000..63cb958c88e --- /dev/null +++ b/tests/integration/test_keeper_restore_from_snapshot/configs/local_storage_path.xml @@ -0,0 +1,6 @@ + + + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + \ No newline at end of file diff --git a/tests/integration/test_keeper_restore_from_snapshot/test.py b/tests/integration/test_keeper_restore_from_snapshot/test.py index bc33689dd20..e4d5793bb17 100644 --- a/tests/integration/test_keeper_restore_from_snapshot/test.py +++ b/tests/integration/test_keeper_restore_from_snapshot/test.py @@ -9,13 +9,19 @@ import time cluster = ClickHouseCluster(__file__) node1 = cluster.add_instance( - "node1", main_configs=["configs/enable_keeper1.xml"], stay_alive=True + "node1", + main_configs=["configs/enable_keeper1.xml", "configs/local_storage_path.xml"], + stay_alive=True, ) node2 = cluster.add_instance( - "node2", main_configs=["configs/enable_keeper2.xml"], stay_alive=True + "node2", + main_configs=["configs/enable_keeper2.xml", "configs/local_storage_path.xml"], + stay_alive=True, ) node3 = cluster.add_instance( - "node3", main_configs=["configs/enable_keeper3.xml"], stay_alive=True + "node3", + main_configs=["configs/enable_keeper3.xml", "configs/local_storage_path.xml"], + stay_alive=True, ) from kazoo.client import KazooClient, KazooState diff --git a/tests/integration/test_keeper_restore_from_snapshot/test_disk_s3.py b/tests/integration/test_keeper_restore_from_snapshot/test_disk_s3.py new file mode 100644 index 00000000000..1226df75203 --- /dev/null +++ b/tests/integration/test_keeper_restore_from_snapshot/test_disk_s3.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +import pytest +from helpers.cluster import ClickHouseCluster +import helpers.keeper_utils as keeper_utils +import random +import string +import os +import time + +cluster = ClickHouseCluster(__file__) +node1 = cluster.add_instance( + "node1", + main_configs=[ + "configs/enable_keeper1.xml", + "configs/disk_s3_storage.xml", + "configs/keeper1_snapshot_disk.xml", + ], + stay_alive=True, + with_minio=True, +) +node2 = cluster.add_instance( + "node2", + main_configs=[ + "configs/enable_keeper2.xml", + "configs/disk_s3_storage.xml", + "configs/keeper2_snapshot_disk.xml", + ], + stay_alive=True, + with_minio=True, +) +node3 = cluster.add_instance( + "node3", + main_configs=[ + "configs/enable_keeper3.xml", + "configs/disk_s3_storage.xml", + "configs/keeper3_snapshot_disk.xml", + ], + stay_alive=True, + with_minio=True, +) + +from kazoo.client import KazooClient, KazooState + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + + finally: + cluster.shutdown() + + +def get_fake_zk(nodename, timeout=30.0): + _fake_zk_instance = KazooClient( + hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout + ) + _fake_zk_instance.start() + return _fake_zk_instance + + +def stop_zk(zk): + try: + if zk: + zk.stop() + zk.close() + except: + pass + + +def test_recover_from_snapshot_with_disk_s3(started_cluster): + try: + node1_zk = node2_zk = node3_zk = None + node1_zk = get_fake_zk("node1") + node2_zk = get_fake_zk("node2") + node3_zk = get_fake_zk("node3") + + node1_zk.create("/test_snapshot_multinode_recover", "somedata".encode()) + + node2_zk.sync("/test_snapshot_multinode_recover") + node3_zk.sync("/test_snapshot_multinode_recover") + + assert node1_zk.get("/test_snapshot_multinode_recover")[0] == b"somedata" + assert node2_zk.get("/test_snapshot_multinode_recover")[0] == b"somedata" + assert node3_zk.get("/test_snapshot_multinode_recover")[0] == b"somedata" + + node3.stop_clickhouse(kill=True) + + # at least we will have 2 snapshots + for i in range(435): + node1_zk.create( + "/test_snapshot_multinode_recover" + str(i), + ("somedata" + str(i)).encode(), + ) + + for i in range(435): + if i % 10 == 0: + node1_zk.delete("/test_snapshot_multinode_recover" + str(i)) + + finally: + for zk in [node1_zk, node2_zk, node3_zk]: + stop_zk(zk) + + # stale node should recover from leader's snapshot + # with some sanitizers can start longer than 5 seconds + node3.start_clickhouse(20) + keeper_utils.wait_until_connected(cluster, node3) + print("Restarted") + + try: + node1_zk = node2_zk = node3_zk = None + node1_zk = get_fake_zk("node1") + node2_zk = get_fake_zk("node2") + node3_zk = get_fake_zk("node3") + + node1_zk.sync("/test_snapshot_multinode_recover") + node2_zk.sync("/test_snapshot_multinode_recover") + node3_zk.sync("/test_snapshot_multinode_recover") + + assert node1_zk.get("/test_snapshot_multinode_recover")[0] == b"somedata" + assert node2_zk.get("/test_snapshot_multinode_recover")[0] == b"somedata" + assert node3_zk.get("/test_snapshot_multinode_recover")[0] == b"somedata" + + for i in range(435): + if i % 10 != 0: + assert ( + node1_zk.get("/test_snapshot_multinode_recover" + str(i))[0] + == ("somedata" + str(i)).encode() + ) + assert ( + node2_zk.get("/test_snapshot_multinode_recover" + str(i))[0] + == ("somedata" + str(i)).encode() + ) + assert ( + node3_zk.get("/test_snapshot_multinode_recover" + str(i))[0] + == ("somedata" + str(i)).encode() + ) + else: + assert ( + node1_zk.exists("/test_snapshot_multinode_recover" + str(i)) is None + ) + assert ( + node2_zk.exists("/test_snapshot_multinode_recover" + str(i)) is None + ) + assert ( + node3_zk.exists("/test_snapshot_multinode_recover" + str(i)) is None + ) + finally: + for zk in [node1_zk, node2_zk, node3_zk]: + stop_zk(zk) diff --git a/tests/integration/test_merge_tree_hdfs/test.py b/tests/integration/test_merge_tree_hdfs/test.py index d1a145c00c1..95b63a5c8a3 100644 --- a/tests/integration/test_merge_tree_hdfs/test.py +++ b/tests/integration/test_merge_tree_hdfs/test.py @@ -235,12 +235,7 @@ def test_attach_detach_partition(cluster): assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)" hdfs_objects = fs.listdir("/clickhouse") - assert ( - len(hdfs_objects) - == FILES_OVERHEAD - + FILES_OVERHEAD_PER_PART_WIDE * 2 - - FILES_OVERHEAD_METADATA_VERSION - ) + assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 node.query("ALTER TABLE hdfs_test DROP PARTITION '2020-01-03'") assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(4096)" diff --git a/tests/integration/test_merge_tree_s3/test.py b/tests/integration/test_merge_tree_s3/test.py index 303fff66166..f754bc905bf 100644 --- a/tests/integration/test_merge_tree_s3/test.py +++ b/tests/integration/test_merge_tree_s3/test.py @@ -336,9 +336,7 @@ def test_attach_detach_partition(cluster, node_name): assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( len(list_objects(cluster, "data/")) - == FILES_OVERHEAD - + FILES_OVERHEAD_PER_PART_WIDE * 2 - - FILES_OVERHEAD_METADATA_VERSION + == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("ALTER TABLE s3_test DROP PARTITION '2020-01-03'") diff --git a/tests/integration/test_postgresql_protocol/test.py b/tests/integration/test_postgresql_protocol/test.py index e1d8cbf9bcc..de01bba6862 100644 --- a/tests/integration/test_postgresql_protocol/test.py +++ b/tests/integration/test_postgresql_protocol/test.py @@ -111,7 +111,7 @@ def test_psql_client(started_cluster): def test_python_client(started_cluster): node = cluster.instances["node"] - with pytest.raises(py_psql.InternalError) as exc_info: + with pytest.raises(py_psql.OperationalError) as exc_info: ch = py_psql.connect( host=node.ip_address, port=server_port, @@ -122,9 +122,7 @@ def test_python_client(started_cluster): cur = ch.cursor() cur.execute("select name from tables;") - assert exc_info.value.args == ( - "Query execution failed.\nDB::Exception: Table default.tables doesn't exist\nSSL connection has been closed unexpectedly\n", - ) + assert exc_info.value.args == ("SSL connection has been closed unexpectedly\n",) ch = py_psql.connect( host=node.ip_address, diff --git a/tests/integration/test_secure_socket/test.py b/tests/integration/test_secure_socket/test.py index 2dffbed03d6..123715e5f05 100644 --- a/tests/integration/test_secure_socket/test.py +++ b/tests/integration/test_secure_socket/test.py @@ -58,6 +58,9 @@ def test(started_cluster): config.format(sleep_in_send_data_ms=1000000), ) + if NODES["node1"].is_built_with_thread_sanitizer(): + pytest.skip("Hedged requests don't work under Thread Sanitizer") + attempts = 0 while attempts < 1000: setting = NODES["node2"].http_query( diff --git a/tests/integration/test_ssl_cert_authentication/test.py b/tests/integration/test_ssl_cert_authentication/test.py index b05a6acc16b..ff2de7491e1 100644 --- a/tests/integration/test_ssl_cert_authentication/test.py +++ b/tests/integration/test_ssl_cert_authentication/test.py @@ -2,10 +2,11 @@ import pytest from helpers.client import Client from helpers.cluster import ClickHouseCluster from helpers.ssl_context import WrapSSLContextWithSNI +import urllib.request, urllib.parse import ssl import os.path from os import remove -import urllib3 +import logging # The test cluster is configured with certificate for that host name, see 'server-ext.cnf'. @@ -14,6 +15,7 @@ SSL_HOST = "integration-tests.clickhouse.com" HTTPS_PORT = 8443 # It's important for the node to work at this IP because 'server-cert.pem' requires that (see server-ext.cnf). SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +MAX_RETRY = 5 cluster = ClickHouseCluster(__file__) instance = cluster.add_instance( @@ -164,17 +166,19 @@ def get_ssl_context(cert_name): def execute_query_https( query, user, enable_ssl_auth=True, cert_name=None, password=None ): - url = f"https://{instance.ip_address}:{HTTPS_PORT}/?query={query}" - headers = {"X-ClickHouse-User": user} + url = ( + f"https://{instance.ip_address}:{HTTPS_PORT}/?query={urllib.parse.quote(query)}" + ) + request = urllib.request.Request(url) + request.add_header("X-ClickHouse-User", user) if enable_ssl_auth: - headers["X-ClickHouse-SSL-Certificate-Auth"] = "on" + request.add_header("X-ClickHouse-SSL-Certificate-Auth", "on") if password: - headers["X-ClickHouse-Key"] = password - http_client = urllib3.PoolManager(ssl_context=get_ssl_context(cert_name)) - response = http_client.request("GET", url, headers=headers) - if response.status != 200: - raise Exception(response.status) - return response.data.decode("utf-8") + request.add_header("X-ClickHouse-Key", password) + response = urllib.request.urlopen( + request, context=get_ssl_context(cert_name) + ).read() + return response.decode("utf-8") def test_https(): @@ -198,10 +202,18 @@ def test_https_wrong_cert(): execute_query_https("SELECT currentUser()", user="john", cert_name="client2") assert "403" in str(err.value) + count = 0 # Wrong certificate: self-signed certificate. - with pytest.raises(Exception) as err: - execute_query_https("SELECT currentUser()", user="john", cert_name="wrong") - assert "unknown ca" in str(err.value) + while count <= MAX_RETRY: + with pytest.raises(Exception) as err: + execute_query_https("SELECT currentUser()", user="john", cert_name="wrong") + err_str = str(err.value) + if count < MAX_RETRY and "Broken pipe" in err_str: + count = count + 1 + logging.warning(f"Failed attempt with wrong cert, err: {err_str}") + continue + assert "unknown ca" in err_str + break # No certificate. with pytest.raises(Exception) as err: @@ -291,24 +303,45 @@ def test_https_non_ssl_auth(): == "jane\n" ) + count = 0 # However if we send a certificate it must not be wrong. - with pytest.raises(Exception) as err: - execute_query_https( - "SELECT currentUser()", - user="peter", - enable_ssl_auth=False, - cert_name="wrong", - ) - assert "unknown ca" in str(err.value) - with pytest.raises(Exception) as err: - execute_query_https( - "SELECT currentUser()", - user="jane", - enable_ssl_auth=False, - password="qwe123", - cert_name="wrong", - ) - assert "unknown ca" in str(err.value) + while count <= MAX_RETRY: + with pytest.raises(Exception) as err: + execute_query_https( + "SELECT currentUser()", + user="peter", + enable_ssl_auth=False, + cert_name="wrong", + ) + err_str = str(err.value) + if count < MAX_RETRY and "Broken pipe" in err_str: + count = count + 1 + logging.warning( + f"Failed attempt with wrong cert, user: peter, err: {err_str}" + ) + continue + assert "unknown ca" in err_str + break + + count = 0 + while count <= MAX_RETRY: + with pytest.raises(Exception) as err: + execute_query_https( + "SELECT currentUser()", + user="jane", + enable_ssl_auth=False, + password="qwe123", + cert_name="wrong", + ) + err_str = str(err.value) + if count < MAX_RETRY and "Broken pipe" in err_str: + count = count + 1 + logging.warning( + f"Failed attempt with wrong cert, user: jane, err: {err_str}" + ) + continue + assert "unknown ca" in err_str + break def test_create_user(): diff --git a/tests/integration/test_storage_hudi/test.py b/tests/integration/test_storage_hudi/test.py index de9cde43609..2b77f4d6d61 100644 --- a/tests/integration/test_storage_hudi/test.py +++ b/tests/integration/test_storage_hudi/test.py @@ -79,7 +79,7 @@ def run_query(instance, query, stdin=None, settings=None): def write_hudi_from_df(spark, table_name, df, result_path, mode="overwrite"): - if mode is "overwrite": + if mode == "overwrite": hudi_write_mode = "insert_overwrite" else: hudi_write_mode = "upsert" diff --git a/tests/integration/test_storage_kerberized_hdfs/hdfs_configs/bootstrap.sh b/tests/integration/test_storage_kerberized_hdfs/hdfs_configs/bootstrap.sh index 687ddd8fb46..db6921bc1c8 100755 --- a/tests/integration/test_storage_kerberized_hdfs/hdfs_configs/bootstrap.sh +++ b/tests/integration/test_storage_kerberized_hdfs/hdfs_configs/bootstrap.sh @@ -111,6 +111,23 @@ cat > /usr/local/hadoop/etc/hadoop/hdfs-site.xml << EOF dfs.datanode.http.address 0.0.0.0:1006 + + + dfs.datanode.ipc.address + 0.0.0.0:0 + + + dfs.namenode.secondary.http-address + 0.0.0.0:0 + + + dfs.namenode.backup.address + 0.0.0.0:0 + + + dfs.namenode.backup.http-address + 0.0.0.0:0 +