Merge branch 'master' into add-test-35801

This commit is contained in:
Alexey Milovidov 2023-07-03 19:00:09 +03:00 committed by GitHub
commit 504715c6fd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
170 changed files with 3773 additions and 1120 deletions

View File

@ -121,8 +121,6 @@ jobs:
docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
SonarCloud:
# TODO: Remove if: whenever SonarCloud supports c++23
if: ${{ false }}
runs-on: [self-hosted, builder]
env:
SONAR_SCANNER_VERSION: 4.8.0.2856
@ -159,7 +157,7 @@ jobs:
- name: Set Up Build Tools
run: |
sudo apt-get update
sudo apt-get install -yq git cmake ccache ninja-build python3 yasm
sudo apt-get install -yq git cmake ccache ninja-build python3 yasm nasm
sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)"
- name: Run build-wrapper
run: |
@ -178,4 +176,5 @@ jobs:
--define sonar.cfamily.build-wrapper-output="${{ env.BUILD_WRAPPER_OUT_DIR }}" \
--define sonar.projectKey="ClickHouse_ClickHouse" \
--define sonar.organization="clickhouse-java" \
--define sonar.exclusions="**/*.java,**/*.ts,**/*.js,**/*.css,**/*.sql" \
--define sonar.cfamily.cpp23.enabled=true \
--define sonar.exclusions="**/*.java,**/*.ts,**/*.js,**/*.css,**/*.sql"

View File

@ -23,11 +23,11 @@ curl https://clickhouse.com/ | sh
## Upcoming Events
* [**v23.6 Release Webinar**](https://clickhouse.com/company/events/v23-6-release-call?utm_source=github&utm_medium=social&utm_campaign=release-webinar-2023-06) - Jun 29 - 23.6 is rapidly approaching. Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release.
* [**ClickHouse Meetup in Paris**](https://www.meetup.com/clickhouse-france-user-group/events/294283460) - Jul 4
* [**ClickHouse Meetup in Boston**](https://www.meetup.com/clickhouse-boston-user-group/events/293913596) - Jul 18
* [**ClickHouse Meetup in NYC**](https://www.meetup.com/clickhouse-new-york-user-group/events/293913441) - Jul 19
* [**ClickHouse Meetup in Toronto**](https://www.meetup.com/clickhouse-toronto-user-group/events/294183127) - Jul 20
* [**ClickHouse Meetup in Singapore**](https://www.meetup.com/clickhouse-singapore-meetup-group/events/294428050/) - Jul 27
* [**ClickHouse Meetup in Paris**](https://www.meetup.com/clickhouse-france-user-group/events/294283460) - Sep 12
Also, keep an eye out for upcoming meetups around the world. Somewhere else you want us to be? Please feel free to reach out to tyler <at> clickhouse <dot> com.

View File

@ -13,6 +13,7 @@ The following versions of ClickHouse server are currently being supported with s
| Version | Supported |
|:-|:-|
| 23.6 | ✔️ |
| 23.5 | ✔️ |
| 23.4 | ✔️ |
| 23.3 | ✔️ |

View File

@ -2,11 +2,11 @@
# NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION,
# only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes.
SET(VERSION_REVISION 54475)
SET(VERSION_REVISION 54476)
SET(VERSION_MAJOR 23)
SET(VERSION_MINOR 6)
SET(VERSION_MINOR 7)
SET(VERSION_PATCH 1)
SET(VERSION_GITHASH 2fec796e73efda10a538a03af3205ce8ffa1b2de)
SET(VERSION_DESCRIBE v23.6.1.1-testing)
SET(VERSION_STRING 23.6.1.1)
SET(VERSION_GITHASH d1c7e13d08868cb04d3562dcced704dd577cb1df)
SET(VERSION_DESCRIBE v23.7.1.1-testing)
SET(VERSION_STRING 23.7.1.1)
# end of autochange

2
contrib/libhdfs3 vendored

@ -1 +1 @@
Subproject commit 164b89253fad7991bce77882f01b51ab81d19f3d
Subproject commit 377220ef351ae24994a5fcd2b5fa3930d00c4db0

View File

@ -120,11 +120,12 @@
"docker/test/base": {
"name": "clickhouse/test-base",
"dependent": [
"docker/test/stateless",
"docker/test/integration/base",
"docker/test/fuzzer",
"docker/test/integration/base",
"docker/test/keeper-jepsen",
"docker/test/server-jepsen"
"docker/test/server-jepsen",
"docker/test/sqllogic",
"docker/test/stateless"
]
},
"docker/test/integration/kerberized_hadoop": {

View File

@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \
esac
ARG REPOSITORY="https://s3.amazonaws.com/clickhouse-builds/22.4/31c367d3cd3aefd316778601ff6565119fe36682/package_release"
ARG VERSION="23.5.4.25"
ARG VERSION="23.6.1.1524"
ARG PACKAGES="clickhouse-keeper"
# user/group precreated explicitly with fixed uid/gid on purpose.

View File

@ -33,7 +33,7 @@ RUN arch=${TARGETARCH:-amd64} \
# lts / testing / prestable / etc
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}"
ARG VERSION="23.5.4.25"
ARG VERSION="23.6.1.1524"
ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
# user/group precreated explicitly with fixed uid/gid on purpose.

View File

@ -23,7 +23,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb ${REPO_CHANNEL} main"
ARG VERSION="23.5.4.25"
ARG VERSION="23.6.1.1524"
ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
# set non-empty deb_location_url url to create a docker image

View File

@ -13,6 +13,7 @@ RUN apt-get update --yes \
sqlite3 \
unixodbc \
unixodbc-dev \
odbcinst \
sudo \
&& apt-get clean

View File

@ -18,6 +18,9 @@ ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test
# shellcheck disable=SC1091
source /usr/share/clickhouse-test/ci/attach_gdb.lib || true # FIXME: to not break old builds, clean on 2023-09-01
# shellcheck disable=SC1091
source /usr/share/clickhouse-test/ci/utils.lib || true # FIXME: to not break old builds, clean on 2023-09-01
# install test configs
/usr/share/clickhouse-test/config/install.sh
@ -90,28 +93,20 @@ sleep 5
attach_gdb_to_clickhouse || true # FIXME: to not break old builds, clean on 2023-09-01
function run_with_retry()
{
set +e
function fn_exists() {
declare -F "$1" > /dev/null;
}
# FIXME: to not break old builds, clean on 2023-09-01
function try_run_with_retry() {
local total_retries="$1"
shift
local retry=0
until [ "$retry" -ge "$total_retries" ]
do
if "$@"; then
set -e
return
else
retry=$((retry + 1))
sleep 3
fi
done
echo "Command '$*' failed after $total_retries retries, exiting"
exit 1
if fn_exists run_with_retry; then
run_with_retry "$total_retries" "$@"
else
"$@"
fi
}
function run_tests()
@ -161,9 +156,7 @@ function run_tests()
ADDITIONAL_OPTIONS+=('--report-logs-stats')
clickhouse-test "00001_select_1" > /dev/null ||:
run_with_retry 5 clickhouse-client -q "insert into system.zookeeper (name, path, value) values ('auxiliary_zookeeper2', '/test/chroot/', '')"
try_run_with_retry 10 clickhouse-client -q "insert into system.zookeeper (name, path, value) values ('auxiliary_zookeeper2', '/test/chroot/', '')"
set +e
clickhouse-test --testname --shard --zookeeper --check-zookeeper-session --hung-check --print-time \

View File

@ -189,6 +189,7 @@ rg -Fav -e "Code: 236. DB::Exception: Cancelled merging parts" \
-e "Authentication failed" \
-e "Cannot flush" \
-e "Container already exists" \
-e "doesn't have metadata version on disk" \
clickhouse-server.upgrade.log \
| grep -av -e "_repl_01111_.*Mapping for table with UUID" \
| zgrep -Fa "<Error>" > /test_output/upgrade_error_messages.txt \

View File

@ -0,0 +1,301 @@
---
sidebar_position: 1
sidebar_label: 2023
---
# 2023 Changelog
### ClickHouse release v23.6.1.1524-stable (d1c7e13d088) FIXME as compared to v23.5.1.3174-stable (2fec796e73e)
#### Backward Incompatible Change
* Delete feature `do_not_evict_index_and_mark_files` in the fs cache. This feature was only making things worse. [#51253](https://github.com/ClickHouse/ClickHouse/pull/51253) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Remove ALTER support for experimental LIVE VIEW. [#51287](https://github.com/ClickHouse/ClickHouse/pull/51287) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
#### New Feature
* Add setting `session_timezone`, it is used as default timezone for session when not explicitly specified. [#44149](https://github.com/ClickHouse/ClickHouse/pull/44149) ([Andrey Zvonov](https://github.com/zvonand)).
* Added overlay database engine and representation of a directory as a database This commit adds 4 databases: 1. DatabaseOverlay: Implements the IDatabase interface. Allow to combine multiple databases, such as FileSystem and Memory. Internally, it stores a vector with other database pointers and proxies requests to them in turn until it is executed successfully. 2. DatabaseFilesystem: allows to read-only interact with files stored on the file system. Internally, it uses TableFunctionFile to implicitly load file when a user requests the table. Result of TableFunctionFile call cached inside to provide quick access. 3. DatabaseS3: allows to read-only interact with s3 storage. It uses TableFunctionS3 to implicitly load table from s3 4. DatabaseHDFS: allows to interact with hdfs storage. It uses TableFunctionHDFS to implicitly load table from hdfs. [#48821](https://github.com/ClickHouse/ClickHouse/pull/48821) ([alekseygolub](https://github.com/alekseygolub)).
* Add a new setting named `use_mysql_types_in_show_columns` to alter the `SHOW COLUMNS` SQL statement to display MySQL equivalent types when a client is connected via the MySQL compatibility port. [#49577](https://github.com/ClickHouse/ClickHouse/pull/49577) ([Thomas Panetti](https://github.com/tpanetti)).
* Added option `--rename_files_after_processing <pattern>`. This closes [#34207](https://github.com/ClickHouse/ClickHouse/issues/34207). [#49626](https://github.com/ClickHouse/ClickHouse/pull/49626) ([alekseygolub](https://github.com/alekseygolub)).
* 1. Add `TableFunctionRedis` 3. Add table engine Redis 4. Add `RedisCommon` which contains Redis related tools and types 5. Support `equals` and `in` filter push down into Redis. [#50150](https://github.com/ClickHouse/ClickHouse/pull/50150) ([JackyWoo](https://github.com/JackyWoo)).
* Allow to skip empty files in file/s3/url/hdfs table functions using settings `s3_skip_empty_files`, `hdfs_skip_empty_files`, `engine_file_skip_empty_files`, `engine_url_skip_empty_files`. [#50364](https://github.com/ClickHouse/ClickHouse/pull/50364) ([Kruglov Pavel](https://github.com/Avogar)).
* Clickhouse-client can now be called with a connection instead of "--host", "--port", "--user" etc. [#50689](https://github.com/ClickHouse/ClickHouse/pull/50689) ([Alexey Gerasimchuck](https://github.com/Demilivor)).
* Codec DEFLATE_QPL is now controlled via server setting "enable_deflate_qpl_codec" (default: false) instead of setting "allow_experimental_codecs". This marks QPL_DEFLATE non-experimental. [#50775](https://github.com/ClickHouse/ClickHouse/pull/50775) ([Robert Schulze](https://github.com/rschu1ze)).
#### Performance Improvement
* Improve performance with enabled QueryProfiler using thread-local timer_id instead of global object. [#48778](https://github.com/ClickHouse/ClickHouse/pull/48778) ([Jiebin Sun](https://github.com/jiebinn)).
* Rewrite CapnProto input/output format to improve its performance. Map column names and CapnProto fields case insensitive, fix reading/writing of nested structure fields. [#49752](https://github.com/ClickHouse/ClickHouse/pull/49752) ([Kruglov Pavel](https://github.com/Avogar)).
* Optimize parquet write performance for parallel threads. [#50102](https://github.com/ClickHouse/ClickHouse/pull/50102) ([Hongbin Ma](https://github.com/binmahone)).
* ### Documentation entry for user-facing changes Disable `parallelize_output_from_storages` for processing MATERIALIZED VIEWs and storages with one block only. [#50214](https://github.com/ClickHouse/ClickHouse/pull/50214) ([Azat Khuzhin](https://github.com/azat)).
* Merge PR https://github.com/ClickHouse/ClickHouse/pull/46558 (Avoid processing already sorted data). Avoid block permutation during sort if the block is already sorted. [#50697](https://github.com/ClickHouse/ClickHouse/pull/50697) ([Maksim Kita](https://github.com/kitaisreal)).
* In the earlier PRs ([#50062](https://github.com/ClickHouse/ClickHouse/issues/50062), [#50307](https://github.com/ClickHouse/ClickHouse/issues/50307)), we used to propose an optimization pattern which transforms the predicates with toYear/toYYYYMM into its equivalent but converter-free form. This transformation could bring significant performance impact to some workloads, such as SSB. However, as issue [#50628](https://github.com/ClickHouse/ClickHouse/issues/50628) indicated, these two PRs would introduce some issues which may results in incomplete query results, and as a result, they were reverted by [#50629](https://github.com/ClickHouse/ClickHouse/issues/50629). [#50951](https://github.com/ClickHouse/ClickHouse/pull/50951) ([Zhiguo Zhou](https://github.com/ZhiguoZh)).
* Make multiple list requests to ZooKeeper in parallel to speed up reading from system.zookeeper table. [#51042](https://github.com/ClickHouse/ClickHouse/pull/51042) ([Alexander Gololobov](https://github.com/davenger)).
* Speedup initialization of DateTime lookup tables for time zones. This should reduce startup/connect time of clickhouse client especially in debug build as it is rather heavy. [#51347](https://github.com/ClickHouse/ClickHouse/pull/51347) ([Alexander Gololobov](https://github.com/davenger)).
#### Improvement
* Allow to cast IPv6 to IPv4 address for CIDR ::ffff:0:0/96 (IPv4-mapped addresses). [#49759](https://github.com/ClickHouse/ClickHouse/pull/49759) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Update MongoDB protocol to support MongoDB 5.1 version and newer. Support for the versions with the old protocol (<3.6) is preserved. Closes [#45621](https://github.com/ClickHouse/ClickHouse/issues/45621), [#49879](https://github.com/ClickHouse/ClickHouse/issues/49879). [#50061](https://github.com/ClickHouse/ClickHouse/pull/50061) ([Nikolay Degterinsky](https://github.com/evillique)).
* Improved scheduling of merge selecting and cleanup tasks in `ReplicatedMergeTree`. The tasks will not be executed too frequently when there's nothing to merge or cleanup. Added settings `max_merge_selecting_sleep_ms`, `merge_selecting_sleep_slowdown_factor`, `max_cleanup_delay_period` and `cleanup_thread_preferred_points_per_iteration`. It should close [#31919](https://github.com/ClickHouse/ClickHouse/issues/31919). [#50107](https://github.com/ClickHouse/ClickHouse/pull/50107) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Support parallel replicas with the analyzer. [#50441](https://github.com/ClickHouse/ClickHouse/pull/50441) ([Raúl Marín](https://github.com/Algunenano)).
* Add setting `input_format_max_bytes_to_read_for_schema_inference` to limit the number of bytes to read in schema inference. Closes [#50577](https://github.com/ClickHouse/ClickHouse/issues/50577). [#50592](https://github.com/ClickHouse/ClickHouse/pull/50592) ([Kruglov Pavel](https://github.com/Avogar)).
* Respect setting input_format_as_default in schema inference. [#50602](https://github.com/ClickHouse/ClickHouse/pull/50602) ([Kruglov Pavel](https://github.com/Avogar)).
* Make filter push down through cross join. [#50605](https://github.com/ClickHouse/ClickHouse/pull/50605) ([Han Fei](https://github.com/hanfei1991)).
* Actual lz4 version is used now. [#50621](https://github.com/ClickHouse/ClickHouse/pull/50621) ([Nikita Taranov](https://github.com/nickitat)).
* Allow to skip trailing empty lines in CSV/TSV/CustomSeparated formats via settings `input_format_csv_skip_trailing_empty_lines`, `input_format_tsv_skip_trailing_empty_lines` and `input_format_custom_skip_trailing_empty_lines` (disabled by default). Closes [#49315](https://github.com/ClickHouse/ClickHouse/issues/49315). [#50635](https://github.com/ClickHouse/ClickHouse/pull/50635) ([Kruglov Pavel](https://github.com/Avogar)).
* Functions "toDateOrDefault|OrNull()" and "accuateCast[OrDefault|OrNull]()" now correctly parse numeric arguments. [#50709](https://github.com/ClickHouse/ClickHouse/pull/50709) ([Dmitry Kardymon](https://github.com/kardymonds)).
* Currently, the csv input format can not parse the csv file with whitespace or \t field delimiter, and these delimiters is supported in spark. [#50712](https://github.com/ClickHouse/ClickHouse/pull/50712) ([KevinyhZou](https://github.com/KevinyhZou)).
* Settings `number_of_mutations_to_delay` and `number_of_mutations_to_throw` are enabled by default now with values 500 and 1000 respectively. [#50726](https://github.com/ClickHouse/ClickHouse/pull/50726) ([Anton Popov](https://github.com/CurtizJ)).
* Keeper improvement: add feature flags for Keeper API. Each feature flag can be disabled or enabled by defining it under `keeper_server.feature_flags` config. E.g. to enable `CheckNotExists` request, `keeper_server.feature_flags.check_not_exists` should be set to `1` on Keeper. [#50796](https://github.com/ClickHouse/ClickHouse/pull/50796) ([Antonio Andelic](https://github.com/antonio2368)).
* The dashboard correctly shows missing values. This closes [#50831](https://github.com/ClickHouse/ClickHouse/issues/50831). [#50832](https://github.com/ClickHouse/ClickHouse/pull/50832) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* CGroups metrics related to CPU are replaced with one metric, `CGroupMaxCPU` for better usability. The `Normalized` CPU usage metrics will be normalized to CGroups limits instead of the total number of CPUs when they are set. This closes [#50836](https://github.com/ClickHouse/ClickHouse/issues/50836). [#50835](https://github.com/ClickHouse/ClickHouse/pull/50835) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Relax the thresholds for "too many parts" to be more modern. Return the backpressure during long-running insert queries. [#50856](https://github.com/ClickHouse/ClickHouse/pull/50856) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Added the possibility to use date and time arguments in syslog timestamp format in functions parseDateTimeBestEffort*() and parseDateTime64BestEffort*(). [#50925](https://github.com/ClickHouse/ClickHouse/pull/50925) ([Victor Krasnov](https://github.com/sirvickr)).
* Suggest using `APPEND` or `TRUNCATE` for `INTO OUTFILE` when file exists. [#50950](https://github.com/ClickHouse/ClickHouse/pull/50950) ([alekar](https://github.com/alekar)).
* Add embedded keeper-client to standalone keeper binary. [#50964](https://github.com/ClickHouse/ClickHouse/pull/50964) ([pufit](https://github.com/pufit)).
* Command line parameter "--password" in clickhouse-client can now be specified only once. [#50966](https://github.com/ClickHouse/ClickHouse/pull/50966) ([Alexey Gerasimchuck](https://github.com/Demilivor)).
* Fix data lakes slowness because of synchronous head requests. (Related to Iceberg/Deltalake/Hudi being slow with a lot of files). [#50976](https://github.com/ClickHouse/ClickHouse/pull/50976) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Use `hash_of_all_files` from `system.parts` to check identity of parts during on-cluster backups. [#50997](https://github.com/ClickHouse/ClickHouse/pull/50997) ([Vitaly Baranov](https://github.com/vitlibar)).
* The system table zookeeper_connection connected_time identifies the time when the connection is established (standard format), and session_uptime_elapsed_seconds is added, which labels the duration of the established connection session (in seconds). [#51026](https://github.com/ClickHouse/ClickHouse/pull/51026) ([郭小龙](https://github.com/guoxiaolongzte)).
* Show halves of checksums in `system.parts`, `system.projection_parts` and in error messages in the correct order. [#51040](https://github.com/ClickHouse/ClickHouse/pull/51040) ([Vitaly Baranov](https://github.com/vitlibar)).
* Do not replicate `ALTER PARTITION` queries and mutations through `Replicated` database if it has only one shard and the underlying table is `ReplicatedMergeTree`. [#51049](https://github.com/ClickHouse/ClickHouse/pull/51049) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Improve the progress bar for file/s3/hdfs/url table functions by using chunk size from source data and using incremental total size counting in each thread. Fix the progress bar for *Cluster functions. This closes [#47250](https://github.com/ClickHouse/ClickHouse/issues/47250). [#51088](https://github.com/ClickHouse/ClickHouse/pull/51088) ([Kruglov Pavel](https://github.com/Avogar)).
* Add total_bytes_to_read to Progress packet in TCP protocol for better Progress bar. [#51158](https://github.com/ClickHouse/ClickHouse/pull/51158) ([Kruglov Pavel](https://github.com/Avogar)).
* Better checking of data parts on disks with filesystem cache. [#51164](https://github.com/ClickHouse/ClickHouse/pull/51164) ([Anton Popov](https://github.com/CurtizJ)).
* Disable cache setting `do_not_evict_index_and_mark_files` (Was enabled in `23.5`). [#51222](https://github.com/ClickHouse/ClickHouse/pull/51222) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix sometimes not correct current_elements_num in fs cache. [#51242](https://github.com/ClickHouse/ClickHouse/pull/51242) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Add random sleep before merges/mutations execution to split load more evenly between replicas in case of zero-copy replication. [#51282](https://github.com/ClickHouse/ClickHouse/pull/51282) ([alesapin](https://github.com/alesapin)).
* The function `transform` as well as `CASE` with value matching started to support all data types. This closes [#29730](https://github.com/ClickHouse/ClickHouse/issues/29730). This closes [#32387](https://github.com/ClickHouse/ClickHouse/issues/32387). This closes [#50827](https://github.com/ClickHouse/ClickHouse/issues/50827). This closes [#31336](https://github.com/ClickHouse/ClickHouse/issues/31336). This closes [#40493](https://github.com/ClickHouse/ClickHouse/issues/40493). [#51351](https://github.com/ClickHouse/ClickHouse/pull/51351) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* We have found a bug in LLVM that makes the usage of `compile_expressions` setting unsafe. It is disabled by default. [#51368](https://github.com/ClickHouse/ClickHouse/pull/51368) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Issue [#50220](https://github.com/ClickHouse/ClickHouse/issues/50220) reports a core in `grace_hash` join. We finally reproduce the exception on local, and found that the issue is related to the failure of creating temporary file. Somehow this is triggered in https://github.com/ClickHouse/ClickHouse/pull/49816 https://github.com/ClickHouse/ClickHouse/pull/49483. [#51382](https://github.com/ClickHouse/ClickHouse/pull/51382) ([lgbo](https://github.com/lgbo-ustc)).
#### Build/Testing/Packaging Improvement
* Update contrib/re2 to 2023-06-02. [#50949](https://github.com/ClickHouse/ClickHouse/pull/50949) ([Yuriy Chernyshov](https://github.com/georgthegreat)).
* ClickHouse server will print the list of changed settings on fatal errors. This closes [#51137](https://github.com/ClickHouse/ClickHouse/issues/51137). [#51138](https://github.com/ClickHouse/ClickHouse/pull/51138) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* In https://github.com/ClickHouse/ClickHouse/pull/51143 the fasstests failed, but the status wasn't created because of the chown `file not found`. This addresses it. Decrease the default values for `http-max-field-value-size` and `http_max_field_name_size` to 128K. [#51163](https://github.com/ClickHouse/ClickHouse/pull/51163) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Update Ubuntu version in docker containers. [#51180](https://github.com/ClickHouse/ClickHouse/pull/51180) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Allow building ClickHouse with clang-17. [#51300](https://github.com/ClickHouse/ClickHouse/pull/51300) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* [SQLancer](https://github.com/sqlancer/sqlancer) check is considered stable as bugs that were triggered by it are fixed. Now failures of SQLancer check will be reported as failed check status. [#51340](https://github.com/ClickHouse/ClickHouse/pull/51340) ([Ilya Yatsishin](https://github.com/qoega)).
* Making our CI even better. [#51494](https://github.com/ClickHouse/ClickHouse/pull/51494) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
* Split huge `RUN` in Dockerfile into smaller conditional. Install the necessary tools on demand in the same `RUN` layer, and remove them after that. Upgrade the OS only once at the beginning. Use a modern way to check the signed repository. Downgrade the base repo to ubuntu:20.04 to address the issues on older docker versions. Upgrade golang version to address golang vulnerabilities. [#51504](https://github.com/ClickHouse/ClickHouse/pull/51504) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* This a follow-up for [#51504](https://github.com/ClickHouse/ClickHouse/issues/51504), the cleanup was lost during refactoring. [#51564](https://github.com/ClickHouse/ClickHouse/pull/51564) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
#### Bug Fix (user-visible misbehavior in an official stable release)
* Report loading status for executable dictionaries correctly [#48775](https://github.com/ClickHouse/ClickHouse/pull/48775) ([Anton Kozlov](https://github.com/tonickkozlov)).
* Proper mutation of skip indices and projections [#50104](https://github.com/ClickHouse/ClickHouse/pull/50104) ([Amos Bird](https://github.com/amosbird)).
* Cleanup moving parts [#50489](https://github.com/ClickHouse/ClickHouse/pull/50489) ([vdimir](https://github.com/vdimir)).
* Fix backward compatibility for IP types hashing in aggregate functions [#50551](https://github.com/ClickHouse/ClickHouse/pull/50551) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Fix Log family table return wrong rows count after truncate [#50585](https://github.com/ClickHouse/ClickHouse/pull/50585) ([flynn](https://github.com/ucasfl)).
* Fix bug in `uniqExact` parallel merging [#50590](https://github.com/ClickHouse/ClickHouse/pull/50590) ([Nikita Taranov](https://github.com/nickitat)).
* Revert recent grace hash join changes [#50699](https://github.com/ClickHouse/ClickHouse/pull/50699) ([vdimir](https://github.com/vdimir)).
* Query Cache: Try to fix bad cast from ColumnConst to ColumnVector<char8_t> [#50704](https://github.com/ClickHouse/ClickHouse/pull/50704) ([Robert Schulze](https://github.com/rschu1ze)).
* Do not read all the columns from right GLOBAL JOIN table. [#50721](https://github.com/ClickHouse/ClickHouse/pull/50721) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Avoid storing logs in Keeper containing unknown operation [#50751](https://github.com/ClickHouse/ClickHouse/pull/50751) ([Antonio Andelic](https://github.com/antonio2368)).
* SummingMergeTree support for DateTime64 [#50797](https://github.com/ClickHouse/ClickHouse/pull/50797) ([Jordi Villar](https://github.com/jrdi)).
* Add compat setting for non-const timezones [#50834](https://github.com/ClickHouse/ClickHouse/pull/50834) ([Robert Schulze](https://github.com/rschu1ze)).
* Fix type of LDAP server params hash in cache entry [#50865](https://github.com/ClickHouse/ClickHouse/pull/50865) ([Julian Maicher](https://github.com/jmaicher)).
* Fallback to parsing big integer from String instead of exception in Parquet format [#50873](https://github.com/ClickHouse/ClickHouse/pull/50873) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix checking the lock file too often while writing a backup [#50889](https://github.com/ClickHouse/ClickHouse/pull/50889) ([Vitaly Baranov](https://github.com/vitlibar)).
* Do not apply projection if read-in-order was enabled. [#50923](https://github.com/ClickHouse/ClickHouse/pull/50923) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Fix race azure blob storage iterator [#50936](https://github.com/ClickHouse/ClickHouse/pull/50936) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
* Fix erroneous `sort_description` propagation in `CreatingSets` [#50955](https://github.com/ClickHouse/ClickHouse/pull/50955) ([Nikita Taranov](https://github.com/nickitat)).
* Fix iceberg V2 optional metadata parsing [#50974](https://github.com/ClickHouse/ClickHouse/pull/50974) ([Kseniia Sumarokova](https://github.com/kssenii)).
* MaterializedMySQL: Keep parentheses for empty table overrides [#50977](https://github.com/ClickHouse/ClickHouse/pull/50977) ([Val Doroshchuk](https://github.com/valbok)).
* Fix crash in BackupCoordinationStageSync::setError() [#51012](https://github.com/ClickHouse/ClickHouse/pull/51012) ([Vitaly Baranov](https://github.com/vitlibar)).
* Fix subtly broken copy-on-write of ColumnLowCardinality dictionary [#51064](https://github.com/ClickHouse/ClickHouse/pull/51064) ([Michael Kolupaev](https://github.com/al13n321)).
* Generate safe IVs [#51086](https://github.com/ClickHouse/ClickHouse/pull/51086) ([Salvatore Mesoraca](https://github.com/aiven-sal)).
* Fix ineffective query cache for SELECTs with subqueries [#51132](https://github.com/ClickHouse/ClickHouse/pull/51132) ([Robert Schulze](https://github.com/rschu1ze)).
* Fix Set index with constant nullable comparison. [#51205](https://github.com/ClickHouse/ClickHouse/pull/51205) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Fix a crash in s3 and s3Cluster functions [#51209](https://github.com/ClickHouse/ClickHouse/pull/51209) ([Nikolay Degterinsky](https://github.com/evillique)).
* Fix core dump when compile expression [#51231](https://github.com/ClickHouse/ClickHouse/pull/51231) ([LiuNeng](https://github.com/liuneng1994)).
* Fix use-after-free in StorageURL when switching URLs [#51260](https://github.com/ClickHouse/ClickHouse/pull/51260) ([Michael Kolupaev](https://github.com/al13n321)).
* Updated check for parameterized view [#51272](https://github.com/ClickHouse/ClickHouse/pull/51272) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
* Fix multiple writing of same file to backup [#51299](https://github.com/ClickHouse/ClickHouse/pull/51299) ([Vitaly Baranov](https://github.com/vitlibar)).
* Fix fuzzer failure in ActionsDAG [#51301](https://github.com/ClickHouse/ClickHouse/pull/51301) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Remove garbage from function `transform` [#51350](https://github.com/ClickHouse/ClickHouse/pull/51350) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix MSan report in lowerUTF8/upperUTF8 [#51371](https://github.com/ClickHouse/ClickHouse/pull/51371) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* fs cache: fix a bit incorrect use_count after [#44985](https://github.com/ClickHouse/ClickHouse/issues/44985) [#51406](https://github.com/ClickHouse/ClickHouse/pull/51406) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix segfault in MathUnary [#51499](https://github.com/ClickHouse/ClickHouse/pull/51499) ([Ilya Yatsishin](https://github.com/qoega)).
* Fix logical assert in `tupleElement()` with default values [#51534](https://github.com/ClickHouse/ClickHouse/pull/51534) ([Robert Schulze](https://github.com/rschu1ze)).
* fs cache: remove file from opened file cache immediately when evicting file [#51596](https://github.com/ClickHouse/ClickHouse/pull/51596) ([Kseniia Sumarokova](https://github.com/kssenii)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Deprecate delete-on-destroy.txt [#49181](https://github.com/ClickHouse/ClickHouse/pull/49181) ([Alexander Gololobov](https://github.com/davenger)).
* Attempt to increase the general runners' survival rate [#49283](https://github.com/ClickHouse/ClickHouse/pull/49283) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Refactor subqueries for IN [#49570](https://github.com/ClickHouse/ClickHouse/pull/49570) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Test plan optimization analyzer [#50095](https://github.com/ClickHouse/ClickHouse/pull/50095) ([Igor Nikonov](https://github.com/devcrafter)).
* Implement endianness-independent serialization for quantileTiming [#50324](https://github.com/ClickHouse/ClickHouse/pull/50324) ([ltrk2](https://github.com/ltrk2)).
* require `finalize()` call before d-tor for all writes buffers [#50395](https://github.com/ClickHouse/ClickHouse/pull/50395) ([Sema Checherinda](https://github.com/CheSema)).
* Implement big-endian support for the deterministic reservoir sampler [#50405](https://github.com/ClickHouse/ClickHouse/pull/50405) ([ltrk2](https://github.com/ltrk2)).
* Fix compilation error on big-endian platforms [#50406](https://github.com/ClickHouse/ClickHouse/pull/50406) ([ltrk2](https://github.com/ltrk2)).
* Attach gdb in stateless tests [#50487](https://github.com/ClickHouse/ClickHouse/pull/50487) ([Kruglov Pavel](https://github.com/Avogar)).
* JIT infrastructure refactoring [#50531](https://github.com/ClickHouse/ClickHouse/pull/50531) ([Maksim Kita](https://github.com/kitaisreal)).
* Analyzer: Do not apply Query Tree optimizations on shards [#50584](https://github.com/ClickHouse/ClickHouse/pull/50584) ([Dmitry Novik](https://github.com/novikd)).
* Increase max array size in group bitmap [#50620](https://github.com/ClickHouse/ClickHouse/pull/50620) ([Kruglov Pavel](https://github.com/Avogar)).
* Misc Annoy index improvements [#50661](https://github.com/ClickHouse/ClickHouse/pull/50661) ([Robert Schulze](https://github.com/rschu1ze)).
* Fix reading negative decimals in avro format [#50668](https://github.com/ClickHouse/ClickHouse/pull/50668) ([Kruglov Pavel](https://github.com/Avogar)).
* Unify priorities for connection pools [#50675](https://github.com/ClickHouse/ClickHouse/pull/50675) ([Sergei Trifonov](https://github.com/serxa)).
* Prostpone check of outdated parts [#50676](https://github.com/ClickHouse/ClickHouse/pull/50676) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Unify priorities: `IExecutableTask`s [#50677](https://github.com/ClickHouse/ClickHouse/pull/50677) ([Sergei Trifonov](https://github.com/serxa)).
* Disable grace_hash join in stress tests [#50693](https://github.com/ClickHouse/ClickHouse/pull/50693) ([vdimir](https://github.com/vdimir)).
* ReverseTransform small improvement [#50698](https://github.com/ClickHouse/ClickHouse/pull/50698) ([Maksim Kita](https://github.com/kitaisreal)).
* Support OPTIMIZE for temporary tables [#50710](https://github.com/ClickHouse/ClickHouse/pull/50710) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Refactor reading from object storages [#50711](https://github.com/ClickHouse/ClickHouse/pull/50711) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix data race in log message of cached buffer [#50723](https://github.com/ClickHouse/ClickHouse/pull/50723) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Add new keywords into projections documentation [#50743](https://github.com/ClickHouse/ClickHouse/pull/50743) ([YalalovSM](https://github.com/YalalovSM)).
* Fix build for aarch64 (temporary disable azure) [#50770](https://github.com/ClickHouse/ClickHouse/pull/50770) ([alesapin](https://github.com/alesapin)).
* Update version after release [#50772](https://github.com/ClickHouse/ClickHouse/pull/50772) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Update version_date.tsv and changelogs after v23.5.1.3174-stable [#50774](https://github.com/ClickHouse/ClickHouse/pull/50774) ([robot-clickhouse](https://github.com/robot-clickhouse)).
* Update CHANGELOG.md [#50788](https://github.com/ClickHouse/ClickHouse/pull/50788) ([Ilya Yatsishin](https://github.com/qoega)).
* Update version_date.tsv and changelogs after v23.2.7.32-stable [#50809](https://github.com/ClickHouse/ClickHouse/pull/50809) ([robot-clickhouse](https://github.com/robot-clickhouse)).
* Desctructing --> Destructing [#50810](https://github.com/ClickHouse/ClickHouse/pull/50810) ([Robert Schulze](https://github.com/rschu1ze)).
* Don't mark a part as broken on `Poco::TimeoutException` [#50811](https://github.com/ClickHouse/ClickHouse/pull/50811) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Rename azure_blob_storage to azureBlobStorage [#50812](https://github.com/ClickHouse/ClickHouse/pull/50812) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
* Fix ParallelReadBuffer seek [#50820](https://github.com/ClickHouse/ClickHouse/pull/50820) ([Michael Kolupaev](https://github.com/al13n321)).
* [RFC] Print git hash when crashing [#50823](https://github.com/ClickHouse/ClickHouse/pull/50823) ([Michael Kolupaev](https://github.com/al13n321)).
* Add tests for function "transform" [#50833](https://github.com/ClickHouse/ClickHouse/pull/50833) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Update version_date.tsv and changelogs after v23.5.2.7-stable [#50844](https://github.com/ClickHouse/ClickHouse/pull/50844) ([robot-clickhouse](https://github.com/robot-clickhouse)).
* Updated changelog with azureBlobStorage table function & engine entry [#50850](https://github.com/ClickHouse/ClickHouse/pull/50850) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
* Update easy_tasks_sorted_ru.md [#50853](https://github.com/ClickHouse/ClickHouse/pull/50853) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Document x86 / ARM prerequisites for Docker image [#50867](https://github.com/ClickHouse/ClickHouse/pull/50867) ([Robert Schulze](https://github.com/rschu1ze)).
* MaterializedMySQL: Add test_named_collections [#50874](https://github.com/ClickHouse/ClickHouse/pull/50874) ([Val Doroshchuk](https://github.com/valbok)).
* Update version_date.tsv and changelogs after v22.8.18.31-lts [#50881](https://github.com/ClickHouse/ClickHouse/pull/50881) ([robot-clickhouse](https://github.com/robot-clickhouse)).
* Update version_date.tsv and changelogs after v23.3.3.52-lts [#50882](https://github.com/ClickHouse/ClickHouse/pull/50882) ([robot-clickhouse](https://github.com/robot-clickhouse)).
* Update version_date.tsv and changelogs after v23.4.3.48-stable [#50883](https://github.com/ClickHouse/ClickHouse/pull/50883) ([robot-clickhouse](https://github.com/robot-clickhouse)).
* MaterializedMySQL: Add additional test case to insert_with_modify_binlog_checksum [#50884](https://github.com/ClickHouse/ClickHouse/pull/50884) ([Val Doroshchuk](https://github.com/valbok)).
* Update broken tests list [#50886](https://github.com/ClickHouse/ClickHouse/pull/50886) ([Dmitry Novik](https://github.com/novikd)).
* Fix LOGICAL_ERROR in snowflakeToDateTime*() [#50893](https://github.com/ClickHouse/ClickHouse/pull/50893) ([Robert Schulze](https://github.com/rschu1ze)).
* Tests with parallel replicas are no more "always green" [#50896](https://github.com/ClickHouse/ClickHouse/pull/50896) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
* Slightly more information in error message about cached disk [#50897](https://github.com/ClickHouse/ClickHouse/pull/50897) ([Michael Kolupaev](https://github.com/al13n321)).
* do not call finalize after exception [#50907](https://github.com/ClickHouse/ClickHouse/pull/50907) ([Sema Checherinda](https://github.com/CheSema)).
* Update Annoy docs [#50912](https://github.com/ClickHouse/ClickHouse/pull/50912) ([Robert Schulze](https://github.com/rschu1ze)).
* A bit safer UserDefinedSQLFunctionVisitor [#50913](https://github.com/ClickHouse/ClickHouse/pull/50913) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Update contribe/orc in .gitmodules [#50920](https://github.com/ClickHouse/ClickHouse/pull/50920) ([San](https://github.com/santrancisco)).
* MaterializedMySQL: Add missing DROP DATABASE for tests [#50924](https://github.com/ClickHouse/ClickHouse/pull/50924) ([Val Doroshchuk](https://github.com/valbok)).
* Fix 'Illegal column timezone' in stress tests [#50929](https://github.com/ClickHouse/ClickHouse/pull/50929) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix tests sanity checks and avoid dropping system.query_log table [#50934](https://github.com/ClickHouse/ClickHouse/pull/50934) ([Azat Khuzhin](https://github.com/azat)).
* Fix tests for throttling by allowing more margin of error for trottling event [#50935](https://github.com/ClickHouse/ClickHouse/pull/50935) ([Azat Khuzhin](https://github.com/azat)).
* 01746_convert_type_with_default: Temporarily disable flaky test [#50937](https://github.com/ClickHouse/ClickHouse/pull/50937) ([Robert Schulze](https://github.com/rschu1ze)).
* Fix the statless tests image for old commits [#50947](https://github.com/ClickHouse/ClickHouse/pull/50947) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Fix logic in `AsynchronousBoundedReadBuffer::seek` [#50952](https://github.com/ClickHouse/ClickHouse/pull/50952) ([Nikita Taranov](https://github.com/nickitat)).
* Uncomment flaky test (01746_convert_type_with_default) [#50954](https://github.com/ClickHouse/ClickHouse/pull/50954) ([Dmitry Kardymon](https://github.com/kardymonds)).
* Fix keeper-client help message [#50965](https://github.com/ClickHouse/ClickHouse/pull/50965) ([pufit](https://github.com/pufit)).
* fix build issue on clang 15 [#50967](https://github.com/ClickHouse/ClickHouse/pull/50967) ([Chang chen](https://github.com/baibaichen)).
* Docs: Fix embedded video link [#50972](https://github.com/ClickHouse/ClickHouse/pull/50972) ([Robert Schulze](https://github.com/rschu1ze)).
* Change submodule capnproto to it's fork in ClickHouse [#50987](https://github.com/ClickHouse/ClickHouse/pull/50987) ([Kruglov Pavel](https://github.com/Avogar)).
* Attempt to make 01281_group_by_limit_memory_tracking not flaky [#50995](https://github.com/ClickHouse/ClickHouse/pull/50995) ([Dmitry Novik](https://github.com/novikd)).
* Fix flaky 02561_null_as_default_more_formats [#51001](https://github.com/ClickHouse/ClickHouse/pull/51001) ([Igor Nikonov](https://github.com/devcrafter)).
* Fix flaky test_seekable_formats [#51002](https://github.com/ClickHouse/ClickHouse/pull/51002) ([Kruglov Pavel](https://github.com/Avogar)).
* Follow-up to [#50448](https://github.com/ClickHouse/ClickHouse/issues/50448) [#51006](https://github.com/ClickHouse/ClickHouse/pull/51006) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix a versions' tweak for tagged commits, improve version_helper [#51035](https://github.com/ClickHouse/ClickHouse/pull/51035) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Sqlancer has changed master to main [#51060](https://github.com/ClickHouse/ClickHouse/pull/51060) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Do not spam sqlancer build log [#51061](https://github.com/ClickHouse/ClickHouse/pull/51061) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Refactor IColumn::forEachSubcolumn to make it slightly harder to implement incorrectly [#51072](https://github.com/ClickHouse/ClickHouse/pull/51072) ([Michael Kolupaev](https://github.com/al13n321)).
* MaterializedMySQL: Rename materialize_with_ddl.py -> materialized_with_ddl [#51074](https://github.com/ClickHouse/ClickHouse/pull/51074) ([Val Doroshchuk](https://github.com/valbok)).
* Improve woboq browser report [#51077](https://github.com/ClickHouse/ClickHouse/pull/51077) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Fix for part_names_mutex used after destruction [#51099](https://github.com/ClickHouse/ClickHouse/pull/51099) ([Alexander Gololobov](https://github.com/davenger)).
* Fix ColumnConst::forEachSubcolumn missing from previous PR [#51102](https://github.com/ClickHouse/ClickHouse/pull/51102) ([Michael Kolupaev](https://github.com/al13n321)).
* Fix the test 02783_parsedatetimebesteffort_syslog flakiness [#51112](https://github.com/ClickHouse/ClickHouse/pull/51112) ([Victor Krasnov](https://github.com/sirvickr)).
* Compatibility with clang-17 [#51114](https://github.com/ClickHouse/ClickHouse/pull/51114) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Make more parallel get requests to ZooKeeper in system.zookeeper [#51118](https://github.com/ClickHouse/ClickHouse/pull/51118) ([Alexander Gololobov](https://github.com/davenger)).
* Fix 02703_max_local_write_bandwidth flakiness [#51120](https://github.com/ClickHouse/ClickHouse/pull/51120) ([Azat Khuzhin](https://github.com/azat)).
* Update version_date.tsv and changelogs after v23.5.3.24-stable [#51121](https://github.com/ClickHouse/ClickHouse/pull/51121) ([robot-clickhouse](https://github.com/robot-clickhouse)).
* Update version_date.tsv and changelogs after v23.4.4.16-stable [#51122](https://github.com/ClickHouse/ClickHouse/pull/51122) ([robot-clickhouse](https://github.com/robot-clickhouse)).
* Update version_date.tsv and changelogs after v23.3.4.17-lts [#51123](https://github.com/ClickHouse/ClickHouse/pull/51123) ([robot-clickhouse](https://github.com/robot-clickhouse)).
* Update version_date.tsv and changelogs after v22.8.19.10-lts [#51124](https://github.com/ClickHouse/ClickHouse/pull/51124) ([robot-clickhouse](https://github.com/robot-clickhouse)).
* Fix typo [#51126](https://github.com/ClickHouse/ClickHouse/pull/51126) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Slightly better diagnostics [#51127](https://github.com/ClickHouse/ClickHouse/pull/51127) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Small fix in `MergeTreePrefetchedReadPool` [#51131](https://github.com/ClickHouse/ClickHouse/pull/51131) ([Nikita Taranov](https://github.com/nickitat)).
* Don't report table function accesses to system.errors [#51147](https://github.com/ClickHouse/ClickHouse/pull/51147) ([Raúl Marín](https://github.com/Algunenano)).
* Fix SQLancer branch name [#51148](https://github.com/ClickHouse/ClickHouse/pull/51148) ([Ilya Yatsishin](https://github.com/qoega)).
* Revert "Added ability to implicitly use file/hdfs/s3 table functions in clickhouse-local" [#51149](https://github.com/ClickHouse/ClickHouse/pull/51149) ([Alexander Tokmakov](https://github.com/tavplubix)).
* More profile events for fs cache [#51161](https://github.com/ClickHouse/ClickHouse/pull/51161) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Unforget to pass callback to readBigAt() in ParallelReadBuffer [#51165](https://github.com/ClickHouse/ClickHouse/pull/51165) ([Michael Kolupaev](https://github.com/al13n321)).
* Update README.md [#51179](https://github.com/ClickHouse/ClickHouse/pull/51179) ([Tyler Hannan](https://github.com/tylerhannan)).
* Update exception message [#51187](https://github.com/ClickHouse/ClickHouse/pull/51187) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Split long test 02149_schema_inference_formats_with_schema into several tests to avoid timeout in debug [#51197](https://github.com/ClickHouse/ClickHouse/pull/51197) ([Kruglov Pavel](https://github.com/Avogar)).
* Avoid initializing DateLUT from emptyArray function registration [#51199](https://github.com/ClickHouse/ClickHouse/pull/51199) ([Alexander Gololobov](https://github.com/davenger)).
* Suppress check for covered parts in ZooKeeper [#51207](https://github.com/ClickHouse/ClickHouse/pull/51207) ([Alexander Tokmakov](https://github.com/tavplubix)).
* One more profile event for fs cache [#51223](https://github.com/ClickHouse/ClickHouse/pull/51223) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Typo: passowrd_sha256_hex --> password_sha256_hex [#51233](https://github.com/ClickHouse/ClickHouse/pull/51233) ([Robert Schulze](https://github.com/rschu1ze)).
* Introduce settings enum field with auto-generated values list [#51237](https://github.com/ClickHouse/ClickHouse/pull/51237) ([Sergei Trifonov](https://github.com/serxa)).
* Drop session if we fail to get Keeper API version [#51238](https://github.com/ClickHouse/ClickHouse/pull/51238) ([Alexander Gololobov](https://github.com/davenger)).
* Revert "Fix a crash in s3 and s3Cluster functions" [#51239](https://github.com/ClickHouse/ClickHouse/pull/51239) ([Alexander Tokmakov](https://github.com/tavplubix)).
* fix flaky `AsyncLoader` destructor [#51245](https://github.com/ClickHouse/ClickHouse/pull/51245) ([Sergei Trifonov](https://github.com/serxa)).
* Docs: little cleanup of configuration-files.md [#51249](https://github.com/ClickHouse/ClickHouse/pull/51249) ([Robert Schulze](https://github.com/rschu1ze)).
* Fix a stupid bug on Replicated database recovery [#51252](https://github.com/ClickHouse/ClickHouse/pull/51252) ([Alexander Tokmakov](https://github.com/tavplubix)).
* FileCache: tryReserve() slight improvement [#51259](https://github.com/ClickHouse/ClickHouse/pull/51259) ([Igor Nikonov](https://github.com/devcrafter)).
* Ugly hotfix for "terminate on uncaught exception" in WriteBufferFromOStream [#51265](https://github.com/ClickHouse/ClickHouse/pull/51265) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Avoid too many calls to Poco::Logger::get [#51266](https://github.com/ClickHouse/ClickHouse/pull/51266) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Update version_date.tsv and changelogs after v23.3.5.9-lts [#51269](https://github.com/ClickHouse/ClickHouse/pull/51269) ([robot-clickhouse](https://github.com/robot-clickhouse)).
* Better reporting of broken parts [#51270](https://github.com/ClickHouse/ClickHouse/pull/51270) ([Anton Popov](https://github.com/CurtizJ)).
* Update ext-dict-functions.md [#51283](https://github.com/ClickHouse/ClickHouse/pull/51283) ([Mike Kot](https://github.com/myrrc)).
* Disable table structure check for secondary queries from Replicated db [#51284](https://github.com/ClickHouse/ClickHouse/pull/51284) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Define Thrift version for parquet and use correct arrow version [#51285](https://github.com/ClickHouse/ClickHouse/pull/51285) ([Kruglov Pavel](https://github.com/Avogar)).
* Restore Azure build on ARM [#51288](https://github.com/ClickHouse/ClickHouse/pull/51288) ([Robert Schulze](https://github.com/rschu1ze)).
* Query Cache: Un-comment settings in server cfg [#51294](https://github.com/ClickHouse/ClickHouse/pull/51294) ([Robert Schulze](https://github.com/rschu1ze)).
* Require more checks [#51295](https://github.com/ClickHouse/ClickHouse/pull/51295) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix metadata loading test [#51297](https://github.com/ClickHouse/ClickHouse/pull/51297) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Scratch the strange Python code [#51302](https://github.com/ClickHouse/ClickHouse/pull/51302) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Add a test for [#47865](https://github.com/ClickHouse/ClickHouse/issues/47865) [#51306](https://github.com/ClickHouse/ClickHouse/pull/51306) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Add a test for [#48894](https://github.com/ClickHouse/ClickHouse/issues/48894) [#51307](https://github.com/ClickHouse/ClickHouse/pull/51307) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Add a test for [#48676](https://github.com/ClickHouse/ClickHouse/issues/48676) [#51308](https://github.com/ClickHouse/ClickHouse/pull/51308) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix long test `functions_bad_arguments` [#51310](https://github.com/ClickHouse/ClickHouse/pull/51310) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Unify merge predicate [#51344](https://github.com/ClickHouse/ClickHouse/pull/51344) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix using locks in ProcessList [#51348](https://github.com/ClickHouse/ClickHouse/pull/51348) ([Vitaly Baranov](https://github.com/vitlibar)).
* Add a test for [#42631](https://github.com/ClickHouse/ClickHouse/issues/42631) [#51353](https://github.com/ClickHouse/ClickHouse/pull/51353) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix performance tests due to warnings from jemalloc about Per-CPU arena disabled [#51362](https://github.com/ClickHouse/ClickHouse/pull/51362) ([Azat Khuzhin](https://github.com/azat)).
* Fix "merge_truncate_long" test [#51369](https://github.com/ClickHouse/ClickHouse/pull/51369) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Increase timeout of Fast Test [#51372](https://github.com/ClickHouse/ClickHouse/pull/51372) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix bad tests for DNS [#51374](https://github.com/ClickHouse/ClickHouse/pull/51374) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Attempt to fix the `relax_too_many_parts` test [#51375](https://github.com/ClickHouse/ClickHouse/pull/51375) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix MySQL test in Debug mode [#51376](https://github.com/ClickHouse/ClickHouse/pull/51376) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix bad test `01018_Distributed__shard_num` [#51377](https://github.com/ClickHouse/ClickHouse/pull/51377) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix "logical error" in addressToLineWithInlines [#51379](https://github.com/ClickHouse/ClickHouse/pull/51379) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix test 01280_ttl_where_group_by [#51380](https://github.com/ClickHouse/ClickHouse/pull/51380) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Attempt to fix `test_ssl_cert_authentication` [#51384](https://github.com/ClickHouse/ClickHouse/pull/51384) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Revert "Merge pull request [#50951](https://github.com/ClickHouse/ClickHouse/issues/50951) from ZhiguoZh/20230607-toyear-fix" [#51390](https://github.com/ClickHouse/ClickHouse/pull/51390) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Two tests are twice longer in average with Analyzer and sometimes failing [#51391](https://github.com/ClickHouse/ClickHouse/pull/51391) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix 00899_long_attach_memory_limit [#51395](https://github.com/ClickHouse/ClickHouse/pull/51395) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix test 01293_optimize_final_force [#51396](https://github.com/ClickHouse/ClickHouse/pull/51396) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix test 02481_parquet_list_monotonically_increasing_offsets [#51397](https://github.com/ClickHouse/ClickHouse/pull/51397) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix test 02497_trace_events_stress_long [#51398](https://github.com/ClickHouse/ClickHouse/pull/51398) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix broken labeling for `manual approve` [#51405](https://github.com/ClickHouse/ClickHouse/pull/51405) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Fix parts lifetime in `MergeTreeTransaction` [#51407](https://github.com/ClickHouse/ClickHouse/pull/51407) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix flaky test test_skip_empty_files [#51409](https://github.com/ClickHouse/ClickHouse/pull/51409) ([Kruglov Pavel](https://github.com/Avogar)).
* fix flacky test test_profile_events_s3 [#51412](https://github.com/ClickHouse/ClickHouse/pull/51412) ([Sema Checherinda](https://github.com/CheSema)).
* Update README.md [#51413](https://github.com/ClickHouse/ClickHouse/pull/51413) ([Tyler Hannan](https://github.com/tylerhannan)).
* Replace try/catch logic in hasTokenOrNull() by something more lightweight [#51425](https://github.com/ClickHouse/ClickHouse/pull/51425) ([Robert Schulze](https://github.com/rschu1ze)).
* Add retries to `tlsv1_3` tests [#51434](https://github.com/ClickHouse/ClickHouse/pull/51434) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)).
* Update exception message [#51440](https://github.com/ClickHouse/ClickHouse/pull/51440) ([Kseniia Sumarokova](https://github.com/kssenii)).
* fs cache: add check for intersecting ranges [#51444](https://github.com/ClickHouse/ClickHouse/pull/51444) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Slightly better code around packets for parallel replicas [#51451](https://github.com/ClickHouse/ClickHouse/pull/51451) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
* Update system_warnings test [#51453](https://github.com/ClickHouse/ClickHouse/pull/51453) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Many fixes [#51455](https://github.com/ClickHouse/ClickHouse/pull/51455) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix test 01605_adaptive_granularity_block_borders [#51457](https://github.com/ClickHouse/ClickHouse/pull/51457) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Try fix flaky 02497_storage_file_reader_selection [#51468](https://github.com/ClickHouse/ClickHouse/pull/51468) ([Kruglov Pavel](https://github.com/Avogar)).
* Try making Keeper in `DatabaseReplicated` tests more stable [#51473](https://github.com/ClickHouse/ClickHouse/pull/51473) ([Antonio Andelic](https://github.com/antonio2368)).
* Convert 02003_memory_limit_in_client from expect to sh test (to fix flakiness) [#51475](https://github.com/ClickHouse/ClickHouse/pull/51475) ([Azat Khuzhin](https://github.com/azat)).
* Fix test_disk_over_web_server [#51476](https://github.com/ClickHouse/ClickHouse/pull/51476) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Delay shutdown of system and temporary databases [#51479](https://github.com/ClickHouse/ClickHouse/pull/51479) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix memory leakage in CompressionCodecDeflateQpl [#51480](https://github.com/ClickHouse/ClickHouse/pull/51480) ([Vitaly Baranov](https://github.com/vitlibar)).
* Increase retries in test_multiple_disks/test.py::test_start_stop_moves [#51482](https://github.com/ClickHouse/ClickHouse/pull/51482) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix race in BoundedReadBuffer [#51484](https://github.com/ClickHouse/ClickHouse/pull/51484) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix flaky unit test [#51485](https://github.com/ClickHouse/ClickHouse/pull/51485) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix flaky test `test_host_regexp_multiple_ptr_records` [#51506](https://github.com/ClickHouse/ClickHouse/pull/51506) ([Nikolay Degterinsky](https://github.com/evillique)).
* Add a comment [#51517](https://github.com/ClickHouse/ClickHouse/pull/51517) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Make `test_ssl_cert_authentication` similar to `test_tlvs1_3` [#51520](https://github.com/ClickHouse/ClickHouse/pull/51520) ([János Benjamin Antal](https://github.com/antaljanosbenjamin)).
* Fix duplicate storage set logical error. [#51521](https://github.com/ClickHouse/ClickHouse/pull/51521) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Update test_storage_postgresql/test.py::test_concurrent_queries [#51523](https://github.com/ClickHouse/ClickHouse/pull/51523) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix FATAL: query context is not detached from thread group [#51540](https://github.com/ClickHouse/ClickHouse/pull/51540) ([Igor Nikonov](https://github.com/devcrafter)).
* Update version_date.tsv and changelogs after v23.3.6.7-lts [#51548](https://github.com/ClickHouse/ClickHouse/pull/51548) ([robot-clickhouse](https://github.com/robot-clickhouse)).
* Decoupled commits from [#51180](https://github.com/ClickHouse/ClickHouse/issues/51180) for backports [#51561](https://github.com/ClickHouse/ClickHouse/pull/51561) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Try to fix deadlock in ZooKeeper client [#51563](https://github.com/ClickHouse/ClickHouse/pull/51563) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Retry chroot creation in ZK before stateless tests [#51585](https://github.com/ClickHouse/ClickHouse/pull/51585) ([Antonio Andelic](https://github.com/antonio2368)).
* use timeout instead trap in 01443_merge_truncate_long.sh [#51593](https://github.com/ClickHouse/ClickHouse/pull/51593) ([Sema Checherinda](https://github.com/CheSema)).
* Update version_date.tsv and changelogs after v23.5.4.25-stable [#51604](https://github.com/ClickHouse/ClickHouse/pull/51604) ([robot-clickhouse](https://github.com/robot-clickhouse)).
* Fix MergeTreeMarksLoader segfaulting if marks file is longer than expected [#51636](https://github.com/ClickHouse/ClickHouse/pull/51636) ([Michael Kolupaev](https://github.com/al13n321)).
* Update version_date.tsv and changelogs after v23.4.5.22-stable [#51638](https://github.com/ClickHouse/ClickHouse/pull/51638) ([robot-clickhouse](https://github.com/robot-clickhouse)).
* Update version_date.tsv and changelogs after v23.3.7.5-lts [#51639](https://github.com/ClickHouse/ClickHouse/pull/51639) ([robot-clickhouse](https://github.com/robot-clickhouse)).
* Update parts.md [#51643](https://github.com/ClickHouse/ClickHouse/pull/51643) ([Ramazan Polat](https://github.com/ramazanpolat)).

View File

@ -949,7 +949,14 @@ The example uses `type=web`, but any disk type can be configured as dynamic, eve
#### Example dynamic web storage
:::tip
A [demo dataset](https://github.com/ClickHouse/web-tables-demo) is hosted in GitHub. To prepare your own tables for web storage see the tool [clickhouse-static-files-uploader](/docs/en/operations/storing-data.md/#storing-data-on-webserver)
:::
In this `ATTACH TABLE` query the `UUID` provided matches the directory name of the data, and the endpoint is the URL for the raw GitHub content.
```sql
# highlight-next-line
ATTACH TABLE uk_price_paid UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7'
(
price UInt32,

View File

@ -2120,7 +2120,13 @@ This section contains the following parameters:
- `operation_timeout_ms` — Maximum timeout for one operation in milliseconds.
- `root` — The [znode](http://zookeeper.apache.org/doc/r3.5.5/zookeeperOver.html#Nodes+and+ephemeral+nodes) that is used as the root for znodes used by the ClickHouse server. Optional.
- `identity` — User and password, that can be required by ZooKeeper to give access to requested znodes. Optional.
- zookeeper_load_balancing - Specifies the algorithm of ZooKeeper node selection.
* random - randomly selects one of ZooKeeper nodes.
* in_order - selects the first ZooKeeper node, if it's not available then the second, and so on.
* nearest_hostname - selects a ZooKeeper node with a hostname that is most similar to the servers hostname.
* first_or_random - selects the first ZooKeeper node, if it's not available then randomly selects one of remaining ZooKeeper nodes.
* round_robin - selects the first ZooKeeper node, if reconnection happens selects the next.
**Example configuration**
``` xml
@ -2139,6 +2145,8 @@ This section contains the following parameters:
<root>/path/to/zookeeper/node</root>
<!-- Optional. Zookeeper digest ACL string. -->
<identity>user:password</identity>
<!--<zookeeper_load_balancing>random / in_order / nearest_hostname / first_or_random / round_robin</zookeeper_load_balancing>-->
<zookeeper_load_balancing>random</zookeeper_load_balancing>
</zookeeper>
```

View File

@ -184,13 +184,15 @@ These settings should be defined in the disk configuration section.
- `enable_filesystem_query_cache_limit` - allow to limit the size of cache which is downloaded within each query (depends on user setting `max_query_cache_size`). Default: `false`.
- `enable_cache_hits_threshold` - number which defines how many times some data needs to be read before it will be cached. Default: `0`, e.g. the data is cached at the first attempt to read it.
- `enable_cache_hits_threshold` - number which defines how many times some data needs to be read before it will be cached. Default: `false`. This threshold can be defined by `cache_hits_threshold`. Default: `0`, e.g. the data is cached at the first attempt to read it.
- `enable_bypass_cache_with_threshold` - allows to skip cache completely in case the requested read range exceeds the threshold. Default: `false`. This threshold can be defined by `bypass_cache_threashold`. Default: `268435456` (`256Mi`).
- `do_not_evict_index_and_mark_files` - do not evict small frequently used files according to cache policy. Default: `false`. This setting was added in version 22.8. If you used filesystem cache before this version, then it will not work on versions starting from 22.8 if this setting is set to `true`. If you want to use this setting, clear old cache created before version 22.8 before upgrading.
- `max_file_segment_size` - a maximum size of a single cache file in bytes or in readable format (`ki, Mi, Gi, etc`, example `10Gi`). Default: `104857600` (`100Mi`).
- `max_file_segment_size` - a maximum size of a single cache file in bytes or in readable format (`ki, Mi, Gi, etc`, example `10Gi`). Default: `8388608` (`8Mi`).
- `max_elements` - a limit for a number of cache files. Default: `1048576`.
- `max_elements` - a limit for a number of cache files. Default: `10000000`.
File Cache **query/profile settings**:

View File

@ -790,7 +790,7 @@ bool Client::processWithFuzzing(const String & full_query)
WriteBufferFromOStream cerr_buf(std::cerr, 4096);
fuzz_base->dumpTree(cerr_buf);
cerr_buf.next();
cerr_buf.finalize();
fmt::print(
stderr,
@ -928,7 +928,7 @@ bool Client::processWithFuzzing(const String & full_query)
std::cout << std::endl;
WriteBufferFromOStream ast_buf(std::cout, 4096);
formatAST(*query, ast_buf, false /*highlight*/);
ast_buf.next();
ast_buf.finalize();
if (const auto * insert = query->as<ASTInsertQuery>())
{
/// For inserts with data it's really useful to have the data itself available in the logs, as formatAST doesn't print it

View File

@ -151,6 +151,7 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
WriteBufferFromFileDescriptor out(STDOUT_FILENO);
obfuscateQueries(query, out, obfuscated_words_map, used_nouns, hash_func, is_known_identifier);
out.finalize();
}
else
{
@ -175,7 +176,7 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
{
WriteBufferFromOStream res_buf(std::cout, 4096);
formatAST(*res, res_buf, hilite, oneline);
res_buf.next();
res_buf.finalize();
if (multiple)
std::cout << "\n;\n";
std::cout << std::endl;
@ -199,7 +200,7 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
res_cout.write(*s_pos++);
}
res_cout.next();
res_cout.finalize();
if (multiple)
std::cout << " \\\n;\n";
std::cout << std::endl;

View File

@ -9,6 +9,7 @@
#include <Poco/AutoPtr.h>
#include <Poco/Logger.h>
#include <Common/logger_useful.h>
#include <Disks/DiskLocal.h>
int mainEntryClickHouseKeeperConverter(int argc, char ** argv)
@ -39,8 +40,9 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv)
try
{
auto keeper_context = std::make_shared<KeeperContext>();
keeper_context->digest_enabled = true;
auto keeper_context = std::make_shared<KeeperContext>(true);
keeper_context->setDigestEnabled(true);
keeper_context->setSnapshotDisk(std::make_shared<DiskLocal>("Keeper-snapshots", options["output-dir"].as<std::string>(), 0));
DB::KeeperStorage storage(/* tick_time_ms */ 500, /* superdigest */ "", keeper_context, /* initialize_system_nodes */ false);
@ -51,10 +53,10 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv)
DB::SnapshotMetadataPtr snapshot_meta = std::make_shared<DB::SnapshotMetadata>(storage.getZXID(), 1, std::make_shared<nuraft::cluster_config>());
DB::KeeperStorageSnapshot snapshot(&storage, snapshot_meta);
DB::KeeperSnapshotManager manager(options["output-dir"].as<std::string>(), 1, keeper_context);
DB::KeeperSnapshotManager manager(1, keeper_context);
auto snp = manager.serializeSnapshotToBuffer(snapshot);
auto path = manager.serializeSnapshotBufferToDisk(*snp, storage.getZXID());
std::cout << "Snapshot serialized to path:" << path << std::endl;
auto file_info = manager.serializeSnapshotBufferToDisk(*snp, storage.getZXID());
std::cout << "Snapshot serialized to path:" << fs::path(file_info.disk->getPath()) / file_info.path << std::endl;
}
catch (...)
{

View File

@ -48,10 +48,10 @@ if (BUILD_STANDALONE_KEEPER)
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperSnapshotManager.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperSnapshotManagerS3.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperStateMachine.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperContext.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperStateManager.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperStorage.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperAsynchronousMetrics.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/TinyContext.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/pathUtils.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/SessionExpiryQueue.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/SummingStateMachine.cpp
@ -60,10 +60,14 @@ if (BUILD_STANDALONE_KEEPER)
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Core/SettingsFields.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Core/BaseSettings.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Core/ServerSettings.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Core/Field.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Core/SettingsEnums.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Core/ServerUUID.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Core/UUID.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Core/BackgroundSchedulePool.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/IO/ReadBuffer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/KeeperTCPHandler.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/TCPServer.cpp
@ -95,6 +99,10 @@ if (BUILD_STANDALONE_KEEPER)
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/ICompressionCodec.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Compression/LZ4_decompress_faster.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Common/CurrentThread.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Common/NamedCollections/NamedCollections.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Common/NamedCollections/NamedCollectionConfiguration.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Common/ZooKeeper/IKeeper.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Common/ZooKeeper/TestKeeper.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Common/ZooKeeper/ZooKeeperCommon.cpp
@ -105,11 +113,58 @@ if (BUILD_STANDALONE_KEEPER)
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Common/ZooKeeper/ZooKeeperLock.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Common/ZooKeeper/ZooKeeperNodeCache.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/registerDisks.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IDisk.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/DiskFactory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/DiskSelector.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/DiskLocal.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/DiskLocalCheckThread.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/LocalDirectorySyncGuard.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/TemporaryFileOnDisk.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/loadLocalDiskConfig.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/IObjectStorage.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/MetadataFromDiskTransactionState.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/MetadataStorageFromDiskTransactionOperations.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/DiskObjectStorage.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/DiskObjectStorageCommon.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/ObjectStorageIterator.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/StoredObject.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/S3/registerDiskS3.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/S3/S3Capabilities.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/S3/diskSettings.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/S3/ProxyListConfiguration.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/ObjectStorages/S3/ProxyResolverConfiguration.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/createReadBufferFromFileBase.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/ReadBufferFromRemoteFSGather.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/IOUringReader.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/WriteBufferFromTemporaryFile.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/WriteBufferWithFinalizeCallback.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/AsynchronousBoundedReadBuffer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/getThreadPoolReader.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/ThreadPoolRemoteFSReader.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/ThreadPoolReader.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Storages/StorageS3Settings.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Daemon/BaseDaemon.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Daemon/SentryWriter.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Daemon/GraphiteWriter.cpp
${CMAKE_CURRENT_BINARY_DIR}/../../src/Daemon/GitHash.generated.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/Standalone/Context.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/Standalone/Settings.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/Standalone/ThreadStatusExt.cpp
Keeper.cpp
clickhouse-keeper.cpp
)
@ -132,10 +187,6 @@ if (BUILD_STANDALONE_KEEPER)
target_compile_definitions (clickhouse-keeper PRIVATE -DCLICKHOUSE_PROGRAM_STANDALONE_BUILD)
target_compile_definitions (clickhouse-keeper PUBLIC -DWITHOUT_TEXT_LOG)
target_include_directories(clickhouse-keeper PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/../../src") # uses includes from src directory
target_include_directories(clickhouse-keeper PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/../../src/Core/include") # uses some includes from core
target_include_directories(clickhouse-keeper PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/../../src") # uses some includes from common
if (ENABLE_CLICKHOUSE_KEEPER_CLIENT AND TARGET ch_rust::skim)
target_link_libraries(clickhouse-keeper PRIVATE ch_rust::skim)
endif()

View File

@ -24,6 +24,8 @@
#include <sys/stat.h>
#include <pwd.h>
#include <Interpreters/Context.h>
#include <Coordination/FourLetterCommand.h>
#include <Coordination/KeeperAsynchronousMetrics.h>
@ -45,6 +47,8 @@
#include <Server/ProtocolServerAdapter.h>
#include <Server/KeeperTCPHandlerFactory.h>
#include <Disks/registerDisks.h>
int mainEntryClickHouseKeeper(int argc, char ** argv)
{
@ -201,9 +205,12 @@ void Keeper::defineOptions(Poco::Util::OptionSet & options)
BaseDaemon::defineOptions(options);
}
struct Keeper::KeeperHTTPContext : public IHTTPContext
namespace
{
explicit KeeperHTTPContext(TinyContextPtr context_)
struct KeeperHTTPContext : public IHTTPContext
{
explicit KeeperHTTPContext(ContextPtr context_)
: context(std::move(context_))
{}
@ -247,12 +254,14 @@ struct Keeper::KeeperHTTPContext : public IHTTPContext
return {context->getConfigRef().getInt64("keeper_server.http_send_timeout", DBMS_DEFAULT_SEND_TIMEOUT_SEC), 0};
}
TinyContextPtr context;
ContextPtr context;
};
HTTPContextPtr Keeper::httpContext()
HTTPContextPtr httpContext()
{
return std::make_shared<KeeperHTTPContext>(tiny_context);
return std::make_shared<KeeperHTTPContext>(Context::getGlobalContextInstance());
}
}
int Keeper::main(const std::vector<std::string> & /*args*/)
@ -316,10 +325,21 @@ try
std::mutex servers_lock;
auto servers = std::make_shared<std::vector<ProtocolServerAdapter>>();
tiny_context = std::make_shared<TinyContext>();
auto shared_context = Context::createShared();
auto global_context = Context::createGlobal(shared_context.get());
global_context->makeGlobalContext();
global_context->setPath(path);
global_context->setRemoteHostFilter(config());
if (config().has("macros"))
global_context->setMacros(std::make_unique<Macros>(config(), "macros", log));
registerDisks(/*global_skip_access_check=*/false);
/// This object will periodically calculate some metrics.
KeeperAsynchronousMetrics async_metrics(
tiny_context,
global_context,
config().getUInt("asynchronous_metrics_update_period_s", 1),
[&]() -> std::vector<ProtocolServerMetrics>
{
@ -344,12 +364,12 @@ try
}
/// Initialize keeper RAFT. Do nothing if no keeper_server in config.
tiny_context->initializeKeeperDispatcher(/* start_async = */ true);
FourLetterCommandFactory::registerCommands(*tiny_context->getKeeperDispatcher());
global_context->initializeKeeperDispatcher(/* start_async = */ true);
FourLetterCommandFactory::registerCommands(*global_context->getKeeperDispatcher());
auto config_getter = [this] () -> const Poco::Util::AbstractConfiguration &
auto config_getter = [&] () -> const Poco::Util::AbstractConfiguration &
{
return tiny_context->getConfigRef();
return global_context->getConfigRef();
};
auto tcp_receive_timeout = config().getInt64("keeper_server.socket_receive_timeout_sec", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC);
@ -371,7 +391,7 @@ try
"Keeper (tcp): " + address.toString(),
std::make_unique<TCPServer>(
new KeeperTCPHandlerFactory(
config_getter, tiny_context->getKeeperDispatcher(),
config_getter, global_context->getKeeperDispatcher(),
tcp_receive_timeout, tcp_send_timeout, false), server_pool, socket));
});
@ -389,7 +409,7 @@ try
"Keeper with secure protocol (tcp_secure): " + address.toString(),
std::make_unique<TCPServer>(
new KeeperTCPHandlerFactory(
config_getter, tiny_context->getKeeperDispatcher(),
config_getter, global_context->getKeeperDispatcher(),
tcp_receive_timeout, tcp_send_timeout, true), server_pool, socket));
#else
UNUSED(port);
@ -441,7 +461,7 @@ try
[&](ConfigurationPtr config, bool /* initial_loading */)
{
if (config->has("keeper_server"))
tiny_context->updateKeeperConfiguration(*config);
global_context->updateKeeperConfiguration(*config);
},
/* already_loaded = */ false); /// Reload it right now (initial loading)
@ -472,7 +492,7 @@ try
else
LOG_INFO(log, "Closed connections to Keeper.");
tiny_context->shutdownKeeperDispatcher();
global_context->shutdownKeeperDispatcher();
/// Wait server pool to avoid use-after-free of destroyed context in the handlers
server_pool.joinAll();

View File

@ -1,9 +1,7 @@
#pragma once
#include <Server/IServer.h>
#include <Server/HTTP/HTTPContext.h>
#include <Daemon/BaseDaemon.h>
#include <Coordination/TinyContext.h>
namespace Poco
{
@ -68,11 +66,6 @@ protected:
std::string getDefaultConfigFileName() const override;
private:
TinyContextPtr tiny_context;
struct KeeperHTTPContext;
HTTPContextPtr httpContext();
Poco::Net::SocketAddress socketBindListen(Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port, [[maybe_unused]] bool secure = false) const;
using CreateServerFunc = std::function<void(UInt16)>;

View File

@ -239,6 +239,7 @@ QueryPipeline ExternalDictionaryLibraryBridgeHelper::loadKeys(const Block & requ
WriteBufferFromOStream out_buffer(os);
auto output_format = getContext()->getOutputFormat(ExternalDictionaryLibraryBridgeHelper::DEFAULT_FORMAT, out_buffer, requested_block.cloneEmpty());
formatBlock(output_format, requested_block);
out_buffer.finalize();
};
return QueryPipeline(loadBase(uri, out_stream_callback));
}

View File

@ -362,7 +362,7 @@ ASTPtr ClientBase::parseQuery(const char *& pos, const char * end, bool allow_mu
std::cout << std::endl;
WriteBufferFromOStream res_buf(std::cout, 4096);
formatAST(*res, res_buf);
res_buf.next();
res_buf.finalize();
std::cout << std::endl << std::endl;
}

View File

@ -1244,7 +1244,7 @@ void QueryFuzzer::fuzzMain(ASTPtr & ast)
std::cout << std::endl;
WriteBufferFromOStream ast_buf(std::cout, 4096);
formatAST(*ast, ast_buf, false /*highlight*/);
ast_buf.next();
ast_buf.finalize();
std::cout << std::endl << std::endl;
}

View File

@ -1,5 +1,6 @@
#include "Exception.h"
#include <algorithm>
#include <cstring>
#include <cxxabi.h>
#include <cstdlib>
@ -83,6 +84,7 @@ Exception::Exception(const MessageMasked & msg_masked, int code, bool remote_)
: Poco::Exception(msg_masked.msg, code)
, remote(remote_)
{
capture_thread_frame_pointers = thread_frame_pointers;
handle_error_code(msg_masked.msg, code, remote, getStackFramePointers());
}
@ -90,12 +92,14 @@ Exception::Exception(MessageMasked && msg_masked, int code, bool remote_)
: Poco::Exception(msg_masked.msg, code)
, remote(remote_)
{
capture_thread_frame_pointers = thread_frame_pointers;
handle_error_code(message(), code, remote, getStackFramePointers());
}
Exception::Exception(CreateFromPocoTag, const Poco::Exception & exc)
: Poco::Exception(exc.displayText(), ErrorCodes::POCO_EXCEPTION)
{
capture_thread_frame_pointers = thread_frame_pointers;
#ifdef STD_EXCEPTION_HAS_STACK_TRACE
auto * stack_trace_frames = exc.get_stack_trace_frames();
auto stack_trace_size = exc.get_stack_trace_size();
@ -107,6 +111,7 @@ Exception::Exception(CreateFromPocoTag, const Poco::Exception & exc)
Exception::Exception(CreateFromSTDTag, const std::exception & exc)
: Poco::Exception(demangle(typeid(exc).name()) + ": " + String(exc.what()), ErrorCodes::STD_EXCEPTION)
{
capture_thread_frame_pointers = thread_frame_pointers;
#ifdef STD_EXCEPTION_HAS_STACK_TRACE
auto * stack_trace_frames = exc.get_stack_trace_frames();
auto stack_trace_size = exc.get_stack_trace_size();
@ -153,7 +158,17 @@ std::string Exception::getStackTraceString() const
auto * stack_trace_frames = get_stack_trace_frames();
auto stack_trace_size = get_stack_trace_size();
__msan_unpoison(stack_trace_frames, stack_trace_size * sizeof(stack_trace_frames[0]));
return StackTrace::toString(stack_trace_frames, 0, stack_trace_size);
String thread_stack_trace;
std::for_each(capture_thread_frame_pointers.rbegin(), capture_thread_frame_pointers.rend(),
[&thread_stack_trace](StackTrace::FramePointers & frame_pointers)
{
thread_stack_trace +=
"\nJob's origin stack trace:\n" +
StackTrace::toString(frame_pointers.data(), 0, std::ranges::find(frame_pointers, nullptr) - frame_pointers.begin());
}
);
return StackTrace::toString(stack_trace_frames, 0, stack_trace_size) + thread_stack_trace;
#else
return trace.toString();
#endif
@ -185,6 +200,9 @@ Exception::FramePointers Exception::getStackFramePointers() const
return frame_pointers;
}
thread_local bool Exception::enable_job_stack_trace = false;
thread_local std::vector<StackTrace::FramePointers> Exception::thread_frame_pointers = {};
void throwFromErrno(const std::string & s, int code, int the_errno)
{

View File

@ -25,18 +25,27 @@ class Exception : public Poco::Exception
public:
using FramePointers = std::vector<void *>;
Exception() = default;
Exception()
{
capture_thread_frame_pointers = thread_frame_pointers;
}
Exception(const PreformattedMessage & msg, int code): Exception(msg.text, code)
{
capture_thread_frame_pointers = thread_frame_pointers;
message_format_string = msg.format_string;
}
Exception(PreformattedMessage && msg, int code): Exception(std::move(msg.text), code)
{
capture_thread_frame_pointers = thread_frame_pointers;
message_format_string = msg.format_string;
}
/// Collect call stacks of all previous jobs' schedulings leading to this thread job's execution
static thread_local bool enable_job_stack_trace;
static thread_local std::vector<StackTrace::FramePointers> thread_frame_pointers;
protected:
// used to remove the sensitive information from exceptions if query_masking_rules is configured
struct MessageMasked
@ -66,6 +75,7 @@ public:
Exception(int code, T && message)
: Exception(message, code)
{
capture_thread_frame_pointers = thread_frame_pointers;
message_format_string = tryGetStaticFormatString(message);
}
@ -80,6 +90,7 @@ public:
Exception(int code, FormatStringHelper<Args...> fmt, Args &&... args)
: Exception(fmt::format(fmt.fmt_str, std::forward<Args>(args)...), code)
{
capture_thread_frame_pointers = thread_frame_pointers;
message_format_string = fmt.message_format_string;
}
@ -131,6 +142,8 @@ private:
protected:
std::string_view message_format_string;
/// Local copy of static per-thread thread_frame_pointers, should be mutable to be unpoisoned on printout
mutable std::vector<StackTrace::FramePointers> capture_thread_frame_pointers;
};

View File

@ -412,6 +412,21 @@ void StackTrace::toStringEveryLine(std::function<void(std::string_view)> callbac
toStringEveryLineImpl(true, {frame_pointers, offset, size}, std::move(callback));
}
void StackTrace::toStringEveryLine(const FramePointers & frame_pointers, std::function<void(std::string_view)> callback)
{
toStringEveryLineImpl(true, {frame_pointers, 0, static_cast<size_t>(std::ranges::find(frame_pointers, nullptr) - frame_pointers.begin())}, std::move(callback));
}
void StackTrace::toStringEveryLine(void ** frame_pointers_raw, size_t offset, size_t size, std::function<void(std::string_view)> callback)
{
__msan_unpoison(frame_pointers_raw, size * sizeof(*frame_pointers_raw));
StackTrace::FramePointers frame_pointers{};
std::copy_n(frame_pointers_raw, size, frame_pointers.begin());
toStringEveryLineImpl(true, {frame_pointers, offset, size}, std::move(callback));
}
using StackTraceCache = std::map<StackTraceTriple, String, std::less<>>;
static StackTraceCache & cacheInstance()

View File

@ -65,6 +65,8 @@ public:
static void symbolize(const FramePointers & frame_pointers, size_t offset, size_t size, StackTrace::Frames & frames);
void toStringEveryLine(std::function<void(std::string_view)> callback) const;
static void toStringEveryLine(const FramePointers & frame_pointers, std::function<void(std::string_view)> callback);
static void toStringEveryLine(void ** frame_pointers_raw, size_t offset, size_t size, std::function<void(std::string_view)> callback);
/// Displaying the addresses can be disabled for security reasons.
/// If you turn off addresses, it will be more secure, but we will be unable to help you with debugging.

View File

@ -189,7 +189,9 @@ ReturnType ThreadPoolImpl<Thread>::scheduleImpl(Job job, Priority priority, std:
jobs.emplace(std::move(job),
priority,
/// Tracing context on this thread is used as parent context for the sub-thread that runs the job
propagate_opentelemetry_tracing_context ? DB::OpenTelemetry::CurrentContext() : DB::OpenTelemetry::TracingContextOnThread());
propagate_opentelemetry_tracing_context ? DB::OpenTelemetry::CurrentContext() : DB::OpenTelemetry::TracingContextOnThread(),
/// capture_frame_pointers
DB::Exception::enable_job_stack_trace);
++scheduled_jobs;
}
@ -348,6 +350,8 @@ void ThreadPoolImpl<Thread>::worker(typename std::list<Thread>::iterator thread_
/// A copy of parent trace context
DB::OpenTelemetry::TracingContextOnThread parent_thread_trace_context;
std::vector<StackTrace::FramePointers> thread_frame_pointers;
/// Get a job from the queue.
Job job;
@ -393,6 +397,9 @@ void ThreadPoolImpl<Thread>::worker(typename std::list<Thread>::iterator thread_
/// to prevent us from modifying its priority. We have to use const_cast to force move semantics on JobWithPriority::job.
job = std::move(const_cast<Job &>(jobs.top().job));
parent_thread_trace_context = std::move(const_cast<DB::OpenTelemetry::TracingContextOnThread &>(jobs.top().thread_trace_context));
DB::Exception::enable_job_stack_trace = jobs.top().enable_job_stack_trace;
if (DB::Exception::enable_job_stack_trace)
thread_frame_pointers = std::move(const_cast<std::vector<StackTrace::FramePointers> &>(jobs.top().frame_pointers));
jobs.pop();
/// We don't run jobs after `shutdown` is set, but we have to properly dequeue all jobs and finish them.
@ -411,6 +418,10 @@ void ThreadPoolImpl<Thread>::worker(typename std::list<Thread>::iterator thread_
/// Run the job.
try
{
if (DB::Exception::enable_job_stack_trace)
DB::Exception::thread_frame_pointers = std::move(thread_frame_pointers);
CurrentMetrics::Increment metric_active_pool_threads(metric_active_threads);
job();

View File

@ -19,6 +19,8 @@
#include <Common/CurrentMetrics.h>
#include <Common/ThreadPool_fwd.h>
#include <Common/Priority.h>
#include <Common/StackTrace.h>
#include <Common/Exception.h>
#include <base/scope_guard.h>
/** Very simple thread pool similar to boost::threadpool.
@ -127,8 +129,19 @@ private:
Priority priority;
DB::OpenTelemetry::TracingContextOnThread thread_trace_context;
JobWithPriority(Job job_, Priority priority_, const DB::OpenTelemetry::TracingContextOnThread & thread_trace_context_)
: job(job_), priority(priority_), thread_trace_context(thread_trace_context_) {}
/// Call stacks of all jobs' schedulings leading to this one
std::vector<StackTrace::FramePointers> frame_pointers;
bool enable_job_stack_trace = false;
JobWithPriority(Job job_, Priority priority_, const DB::OpenTelemetry::TracingContextOnThread & thread_trace_context_, bool capture_frame_pointers = false)
: job(job_), priority(priority_), thread_trace_context(thread_trace_context_), enable_job_stack_trace(capture_frame_pointers)
{
if (!capture_frame_pointers)
return;
/// Save all previous jobs call stacks and append with current
frame_pointers = DB::Exception::thread_frame_pointers;
frame_pointers.push_back(StackTrace().getFramePointers());
}
bool operator<(const JobWithPriority & rhs) const
{

View File

@ -290,6 +290,7 @@ public:
void flushUntrackedMemory();
private:
void applyGlobalSettings();
void applyQuerySettings();
void initPerformanceCounters();

View File

@ -1,18 +1,19 @@
#include <filesystem>
#include <Coordination/Changelog.h>
#include <Disks/DiskLocal.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromFile.h>
#include <IO/WriteHelpers.h>
#include <IO/ZstdDeflatingAppendableWriteBuffer.h>
#include <base/errnoToString.h>
#include <boost/algorithm/string/join.hpp>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <Common/filesystemHelpers.h>
#include <Common/Exception.h>
#include <Common/SipHash.h>
#include <Common/filesystemHelpers.h>
#include <Common/logger_useful.h>
#include <IO/WriteBufferFromFile.h>
#include <base/errnoToString.h>
#include <libnuraft/log_val_type.hxx>
@ -24,20 +25,41 @@ namespace ErrorCodes
extern const int CHECKSUM_DOESNT_MATCH;
extern const int CORRUPTED_DATA;
extern const int UNKNOWN_FORMAT_VERSION;
extern const int NOT_IMPLEMENTED;
extern const int BAD_ARGUMENTS;
extern const int LOGICAL_ERROR;
}
namespace
{
constexpr std::string_view tmp_prefix = "tmp_";
void moveFileBetweenDisks(DiskPtr disk_from, ChangelogFileDescriptionPtr description, DiskPtr disk_to, const std::string & path_to)
{
/// we use empty file with prefix tmp_ to detect incomplete copies
/// if a copy is complete we don't care from which disk we use the same file
/// so it's okay if a failure happens after removing of tmp file but before we remove
/// the changelog from the source disk
auto from_path = fs::path(description->path);
auto tmp_changelog_name = from_path.parent_path() / (std::string{tmp_prefix} + from_path.filename().string());
{
auto buf = disk_to->writeFile(tmp_changelog_name);
buf->finalize();
}
disk_from->copyFile(from_path, *disk_to, path_to, {});
disk_to->removeFile(tmp_changelog_name);
disk_from->removeFile(description->path);
description->path = path_to;
description->disk = disk_to;
}
constexpr auto DEFAULT_PREFIX = "changelog";
std::string formatChangelogPath(
const std::string & prefix, const std::string & name_prefix, uint64_t from_index, uint64_t to_index, const std::string & extension)
inline std::string
formatChangelogPath(const std::string & name_prefix, uint64_t from_index, uint64_t to_index, const std::string & extension)
{
std::filesystem::path path(prefix);
path /= std::filesystem::path(fmt::format("{}_{}_{}.{}", name_prefix, from_index, to_index, extension));
return path;
return fmt::format("{}_{}_{}.{}", name_prefix, from_index, to_index, extension);
}
ChangelogFileDescriptionPtr getChangelogFileDescription(const std::filesystem::path & path)
@ -89,17 +111,19 @@ class ChangelogWriter
public:
ChangelogWriter(
std::map<uint64_t, ChangelogFileDescriptionPtr> & existing_changelogs_,
const std::filesystem::path & changelogs_dir_,
KeeperContextPtr keeper_context_,
LogFileSettings log_file_settings_)
: existing_changelogs(existing_changelogs_)
, log_file_settings(log_file_settings_)
, changelogs_dir(changelogs_dir_)
, keeper_context(std::move(keeper_context_))
, log(&Poco::Logger::get("Changelog"))
{
}
void setFile(ChangelogFileDescriptionPtr file_description, WriteMode mode)
{
auto disk = getDisk();
try
{
if (mode == WriteMode::Append && file_description->expectedEntriesCountInLog() != log_file_settings.rotate_interval)
@ -110,7 +134,7 @@ public:
file_description->expectedEntriesCountInLog());
// we have a file we need to finalize first
if (tryGetFileBuffer() && prealloc_done)
if (tryGetFileBaseBuffer() && prealloc_done)
{
finalizeCurrentFile();
@ -118,27 +142,55 @@ public:
// if we wrote at least 1 log in the log file we can rename the file to reflect correctly the
// contained logs
// file can be deleted from disk earlier by compaction
if (!current_file_description->deleted && last_index_written
&& *last_index_written != current_file_description->to_log_index)
if (!current_file_description->deleted)
{
auto new_path = formatChangelogPath(
changelogs_dir,
current_file_description->prefix,
current_file_description->from_log_index,
*last_index_written,
current_file_description->extension);
std::filesystem::rename(current_file_description->path, new_path);
current_file_description->path = std::move(new_path);
auto log_disk = current_file_description->disk;
const auto & path = current_file_description->path;
std::string new_path = path;
if (last_index_written && *last_index_written != current_file_description->to_log_index)
{
new_path = formatChangelogPath(
current_file_description->prefix,
current_file_description->from_log_index,
*last_index_written,
current_file_description->extension);
}
if (disk == log_disk)
{
if (path != new_path)
{
try
{
disk->moveFile(path, new_path);
}
catch (...)
{
tryLogCurrentException(log, fmt::format("File rename failed on disk {}", disk->getName()));
}
current_file_description->path = std::move(new_path);
}
}
else
{
moveFileBetweenDisks(log_disk, current_file_description, disk, new_path);
}
}
}
file_buf = std::make_unique<WriteBufferFromFile>(
file_description->path, DBMS_DEFAULT_BUFFER_SIZE, mode == WriteMode::Rewrite ? -1 : (O_APPEND | O_CREAT | O_WRONLY));
auto latest_log_disk = getLatestLogDisk();
assert(file_description->disk == latest_log_disk);
file_buf = latest_log_disk->writeFile(file_description->path, DBMS_DEFAULT_BUFFER_SIZE, mode);
assert(file_buf);
last_index_written.reset();
current_file_description = std::move(file_description);
if (log_file_settings.compress_logs)
compressed_buffer = std::make_unique<ZstdDeflatingAppendableWriteBuffer>(std::move(file_buf), /* compression level = */ 3, /* append_to_existing_file_ = */ mode == WriteMode::Append);
compressed_buffer = std::make_unique<ZstdDeflatingAppendableWriteBuffer>(
std::move(file_buf),
/* compressi)on level = */ 3,
/* append_to_existing_file_ = */ mode == WriteMode::Append,
[latest_log_disk, path = current_file_description->path] { return latest_log_disk->readFile(path); });
prealloc_done = false;
}
@ -149,12 +201,12 @@ public:
}
}
bool isFileSet() const { return tryGetFileBuffer() != nullptr; }
/// There is bug when compressed_buffer has value, file_buf's ownership transfer to compressed_buffer
bool isFileSet() const { return compressed_buffer != nullptr || file_buf != nullptr; }
bool appendRecord(ChangelogRecord && record)
{
const auto * file_buffer = tryGetFileBuffer();
const auto * file_buffer = tryGetFileBaseBuffer();
assert(file_buffer && current_file_description);
assert(record.header.index - getStartIndex() <= current_file_description->expectedEntriesCountInLog());
@ -211,7 +263,7 @@ public:
void flush()
{
auto * file_buffer = tryGetFileBuffer();
auto * file_buffer = tryGetFileBaseBuffer();
if (file_buffer)
{
/// Fsync file system if needed
@ -236,12 +288,12 @@ public:
new_description->from_log_index = new_start_log_index;
new_description->to_log_index = new_start_log_index + log_file_settings.rotate_interval - 1;
new_description->extension = "bin";
new_description->disk = getLatestLogDisk();
if (log_file_settings.compress_logs)
new_description->extension += "." + toContentEncodingName(CompressionMethod::Zstd);
new_description->path = formatChangelogPath(
changelogs_dir,
new_description->prefix,
new_start_log_index,
new_start_log_index + log_file_settings.rotate_interval - 1,
@ -260,17 +312,15 @@ public:
}
private:
void finalizeCurrentFile()
{
const auto * file_buffer = tryGetFileBuffer();
assert(file_buffer && prealloc_done);
assert(prealloc_done);
assert(current_file_description);
// compact can delete the file and we don't need to do anything
if (current_file_description->deleted)
{
LOG_WARNING(log, "Log {} is already deleted", file_buffer->getFileName());
LOG_WARNING(log, "Log {} is already deleted", current_file_description->path);
return;
}
@ -279,27 +329,36 @@ private:
flush();
if (log_file_settings.max_size != 0)
const auto * file_buffer = tryGetFileBuffer();
if (log_file_settings.max_size != 0 && file_buffer)
{
int res = -1;
do
{
res = ftruncate(file_buffer->getFD(), initial_file_size + file_buffer->count());
}
while (res < 0 && errno == EINTR);
} while (res < 0 && errno == EINTR);
if (res != 0)
LOG_WARNING(log, "Could not ftruncate file. Error: {}, errno: {}", errnoToString(), errno);
}
if (log_file_settings.compress_logs)
{
compressed_buffer.reset();
}
else
{
chassert(file_buf);
file_buf->finalize();
file_buf.reset();
}
}
WriteBuffer & getBuffer()
{
/// TODO: unify compressed_buffer and file_buf,
/// compressed_buffer can use its NestedBuffer directly if compress_logs=false
if (compressed_buffer)
return *compressed_buffer;
@ -319,38 +378,42 @@ private:
return *file_buffer;
}
const WriteBufferFromFile * tryGetFileBuffer() const
{
return const_cast<ChangelogWriter *>(this)->tryGetFileBuffer();
}
const WriteBufferFromFile * tryGetFileBuffer() const { return const_cast<ChangelogWriter *>(this)->tryGetFileBuffer(); }
WriteBufferFromFile * tryGetFileBuffer()
{
if (compressed_buffer)
return dynamic_cast<WriteBufferFromFile *>(compressed_buffer->getNestedBuffer());
if (file_buf)
return file_buf.get();
return dynamic_cast<WriteBufferFromFile *>(file_buf.get());
}
return nullptr;
WriteBufferFromFileBase * tryGetFileBaseBuffer()
{
if (compressed_buffer)
return dynamic_cast<WriteBufferFromFileBase *>(compressed_buffer->getNestedBuffer());
return file_buf.get();
}
void tryPreallocateForFile()
{
if (log_file_settings.max_size == 0)
const auto * file_buffer = tryGetFileBuffer();
if (log_file_settings.max_size == 0 || !file_buffer)
{
initial_file_size = 0;
prealloc_done = true;
return;
}
const auto & file_buffer = getFileBuffer();
#ifdef OS_LINUX
{
int res = -1;
do
{
res = fallocate(file_buffer.getFD(), FALLOC_FL_KEEP_SIZE, 0, log_file_settings.max_size + log_file_settings.overallocate_size);
res = fallocate(
file_buffer->getFD(), FALLOC_FL_KEEP_SIZE, 0, log_file_settings.max_size + log_file_settings.overallocate_size);
} while (res < 0 && errno == EINTR);
if (res != 0)
@ -365,15 +428,21 @@ private:
}
}
#endif
initial_file_size = getSizeFromFileDescriptor(file_buffer.getFD());
initial_file_size = getSizeFromFileDescriptor(file_buffer->getFD());
prealloc_done = true;
}
DiskPtr getLatestLogDisk() const { return keeper_context->getLatestLogDisk(); }
DiskPtr getDisk() const { return keeper_context->getLogDisk(); }
bool isLocalDisk() const { return dynamic_cast<DiskLocal *>(getDisk().get()) != nullptr; }
std::map<uint64_t, ChangelogFileDescriptionPtr> & existing_changelogs;
ChangelogFileDescriptionPtr current_file_description{nullptr};
std::unique_ptr<WriteBufferFromFile> file_buf;
std::unique_ptr<WriteBufferFromFileBase> file_buf;
std::optional<uint64_t> last_index_written;
size_t initial_file_size{0};
@ -383,7 +452,7 @@ private:
LogFileSettings log_file_settings;
const std::filesystem::path changelogs_dir;
KeeperContextPtr keeper_context;
Poco::Logger * const log;
};
@ -413,10 +482,10 @@ struct ChangelogReadResult
class ChangelogReader
{
public:
explicit ChangelogReader(const std::string & filepath_) : filepath(filepath_)
explicit ChangelogReader(DiskPtr disk_, const std::string & filepath_) : disk(disk_), filepath(filepath_)
{
auto compression_method = chooseCompressionMethod(filepath, "");
auto read_buffer_from_file = std::make_unique<ReadBufferFromFile>(filepath);
auto read_buffer_from_file = disk->readFile(filepath);
read_buf = wrapReadBufferWithCompressionMethod(std::move(read_buffer_from_file), compression_method);
}
@ -512,37 +581,103 @@ public:
}
private:
DiskPtr disk;
std::string filepath;
std::unique_ptr<ReadBuffer> read_buf;
};
Changelog::Changelog(
const std::string & changelogs_dir_,
Poco::Logger * log_,
LogFileSettings log_file_settings)
: changelogs_dir(changelogs_dir_)
, changelogs_detached_dir(changelogs_dir / "detached")
Changelog::Changelog(Poco::Logger * log_, LogFileSettings log_file_settings, KeeperContextPtr keeper_context_)
: changelogs_detached_dir("detached")
, rotate_interval(log_file_settings.rotate_interval)
, log(log_)
, write_operations(std::numeric_limits<size_t>::max())
, append_completion_queue(std::numeric_limits<size_t>::max())
, keeper_context(std::move(keeper_context_))
{
/// Load all files in changelog directory
namespace fs = std::filesystem;
if (!fs::exists(changelogs_dir))
fs::create_directories(changelogs_dir);
for (const auto & p : fs::directory_iterator(changelogs_dir))
if (auto latest_log_disk = getLatestLogDisk();
log_file_settings.force_sync && dynamic_cast<const DiskLocal *>(latest_log_disk.get()) == nullptr)
{
if (p == changelogs_detached_dir)
continue;
auto file_description = getChangelogFileDescription(p.path());
existing_changelogs[file_description->from_log_index] = std::move(file_description);
throw DB::Exception(
DB::ErrorCodes::BAD_ARGUMENTS,
"force_sync is set to true for logs but disk '{}' cannot satisfy such guarantee because it's not of type DiskLocal.\n"
"If you want to use force_sync and same disk for all logs, please set keeper_server.log_storage_disk to a local disk.\n"
"If you want to use force_sync and different disk only for old logs, please set 'keeper_server.log_storage_disk' to any "
"supported disk and 'keeper_server.latest_log_storage_disk' to a local disk.\n"
"Otherwise, disable force_sync",
latest_log_disk->getName());
}
/// Load all files on changelog disks
const auto load_from_disk = [&](const auto & disk)
{
LOG_TRACE(log, "Reading from disk {}", disk->getName());
std::unordered_map<std::string, std::string> incomplete_files;
const auto clean_incomplete_file = [&](const auto & file_path)
{
if (auto incomplete_it = incomplete_files.find(fs::path(file_path).filename()); incomplete_it != incomplete_files.end())
{
LOG_TRACE(log, "Removing {} from {}", file_path, disk->getName());
disk->removeFile(file_path);
disk->removeFile(incomplete_it->second);
incomplete_files.erase(incomplete_it);
return true;
}
return false;
};
std::vector<std::string> changelog_files;
for (auto it = disk->iterateDirectory(""); it->isValid(); it->next())
{
if (it->name() == changelogs_detached_dir)
continue;
if (it->name().starts_with(tmp_prefix))
{
incomplete_files.emplace(it->name().substr(tmp_prefix.size()), it->path());
continue;
}
if (clean_incomplete_file(it->path()))
continue;
changelog_files.push_back(it->path());
}
for (const auto & changelog_file : changelog_files)
{
if (clean_incomplete_file(fs::path(changelog_file).filename()))
continue;
auto file_description = getChangelogFileDescription(changelog_file);
file_description->disk = disk;
LOG_TRACE(log, "Found {} on {}", changelog_file, disk->getName());
auto [changelog_it, inserted] = existing_changelogs.insert_or_assign(file_description->from_log_index, std::move(file_description));
if (!inserted)
LOG_WARNING(log, "Found duplicate entries for {}, will use the entry from {}", changelog_it->second->path, disk->getName());
}
for (const auto & [name, path] : incomplete_files)
disk->removeFile(path);
};
/// Load all files from old disks
for (const auto & disk : keeper_context->getOldLogDisks())
load_from_disk(disk);
auto disk = getDisk();
load_from_disk(disk);
auto latest_log_disk = getLatestLogDisk();
if (disk != latest_log_disk)
load_from_disk(latest_log_disk);
if (existing_changelogs.empty())
LOG_WARNING(log, "No logs exists in {}. It's Ok if it's the first run of clickhouse-keeper.", changelogs_dir.generic_string());
LOG_WARNING(log, "No logs exists in {}. It's Ok if it's the first run of clickhouse-keeper.", disk->getPath());
clean_log_thread = ThreadFromGlobalPool([this] { cleanLogThread(); });
@ -550,8 +685,7 @@ Changelog::Changelog(
append_completion_thread = ThreadFromGlobalPool([this] { appendCompletionThread(); });
current_writer = std::make_unique<ChangelogWriter>(
existing_changelogs, changelogs_dir, log_file_settings);
current_writer = std::make_unique<ChangelogWriter>(existing_changelogs, keeper_context, log_file_settings);
}
void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uint64_t logs_to_keep)
@ -623,7 +757,7 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
break;
}
ChangelogReader reader(changelog_description.path);
ChangelogReader reader(changelog_description.disk, changelog_description.path);
last_log_read_result = reader.readChangelog(logs, start_to_read_from, log);
last_log_read_result->log_start_index = changelog_description.from_log_index;
@ -684,13 +818,13 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
assert(existing_changelogs.find(last_log_read_result->log_start_index) != existing_changelogs.end());
assert(existing_changelogs.find(last_log_read_result->log_start_index)->first == existing_changelogs.rbegin()->first);
/// Continue to write into incomplete existing log if it doesn't finished with error
/// Continue to write into incomplete existing log if it didn't finish with error
const auto & description = existing_changelogs[last_log_read_result->log_start_index];
if (last_log_read_result->last_read_index == 0 || last_log_read_result->error) /// If it's broken log then remove it
{
LOG_INFO(log, "Removing chagelog {} because it's empty or read finished with error", description->path);
std::filesystem::remove(description->path);
description->disk->removeFile(description->path);
existing_changelogs.erase(last_log_read_result->log_start_index);
std::erase_if(logs, [last_log_read_result](const auto & item) { return item.first >= last_log_read_result->log_start_index; });
}
@ -699,55 +833,124 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
initWriter(description);
}
}
else if (last_log_read_result.has_value())
{
/// check if we need to move completed log to another disk
auto latest_log_disk = getLatestLogDisk();
auto disk = getDisk();
auto & description = existing_changelogs.at(last_log_read_result->log_start_index);
if (latest_log_disk != disk && latest_log_disk == description->disk)
moveFileBetweenDisks(latest_log_disk, description, disk, description->path);
}
/// Start new log if we don't initialize writer from previous log. All logs can be "complete".
if (!current_writer->isFileSet())
current_writer->rotate(max_log_id + 1);
/// Move files to correct disks
auto latest_start_index = current_writer->getStartIndex();
auto latest_log_disk = getLatestLogDisk();
auto disk = getDisk();
for (const auto & [start_index, description] : existing_changelogs)
{
/// latest log should already be on latest_log_disk
if (start_index == latest_start_index)
{
chassert(description->disk == latest_log_disk);
continue;
}
if (description->disk != disk)
moveFileBetweenDisks(description->disk, description, disk, description->path);
}
initialized = true;
}
void Changelog::initWriter(ChangelogFileDescriptionPtr description)
{
if (description->expectedEntriesCountInLog() != rotate_interval)
LOG_TRACE(
log,
"Looks like rotate_logs_interval was changed, current {}, expected entries in last log {}",
rotate_interval,
description->expectedEntriesCountInLog());
LOG_TRACE(log, "Continue to write into {}", description->path);
auto log_disk = description->disk;
auto latest_log_disk = getLatestLogDisk();
if (log_disk != latest_log_disk)
moveFileBetweenDisks(log_disk, description, latest_log_disk, description->path);
current_writer->setFile(std::move(description), WriteMode::Append);
}
namespace
{
std::string getCurrentTimestampFolder()
{
const auto timestamp = LocalDateTime{std::time(nullptr)};
return fmt::format(
"{:02}{:02}{:02}T{:02}{:02}{:02}",
timestamp.year(),
timestamp.month(),
timestamp.day(),
timestamp.hour(),
timestamp.minute(),
timestamp.second());
std::string getCurrentTimestampFolder()
{
const auto timestamp = LocalDateTime{std::time(nullptr)};
return fmt::format(
"{:02}{:02}{:02}T{:02}{:02}{:02}",
timestamp.year(),
timestamp.month(),
timestamp.day(),
timestamp.hour(),
timestamp.minute(),
timestamp.second());
}
}
DiskPtr Changelog::getDisk() const
{
return keeper_context->getLogDisk();
}
DiskPtr Changelog::getLatestLogDisk() const
{
return keeper_context->getLatestLogDisk();
}
void Changelog::removeExistingLogs(ChangelogIter begin, ChangelogIter end)
{
const auto timestamp_folder = changelogs_detached_dir / getCurrentTimestampFolder();
auto disk = getDisk();
const auto timestamp_folder = (fs::path(changelogs_detached_dir) / getCurrentTimestampFolder()).generic_string();
for (auto itr = begin; itr != end;)
{
if (!std::filesystem::exists(timestamp_folder))
if (!disk->exists(timestamp_folder))
{
LOG_WARNING(log, "Moving broken logs to {}", timestamp_folder.generic_string());
std::filesystem::create_directories(timestamp_folder);
LOG_WARNING(log, "Moving broken logs to {}", timestamp_folder);
disk->createDirectories(timestamp_folder);
}
LOG_WARNING(log, "Removing changelog {}", itr->second->path);
const std::filesystem::path & path = itr->second->path;
const auto new_path = timestamp_folder / path.filename();
std::filesystem::rename(path, new_path);
auto changelog_disk = itr->second->disk;
if (changelog_disk == disk)
{
try
{
disk->moveFile(path.generic_string(), new_path.generic_string());
}
catch (const DB::Exception & e)
{
if (e.code() == DB::ErrorCodes::NOT_IMPLEMENTED)
moveFileBetweenDisks(changelog_disk, itr->second, disk, new_path);
}
}
else
moveFileBetweenDisks(changelog_disk, itr->second, disk, new_path);
itr = existing_changelogs.erase(itr);
}
}
@ -882,7 +1085,6 @@ void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Changelog must be initialized before writing records");
{
std::lock_guard lock(writer_mutex);
/// This write_at require to overwrite everything in this file and also in previous file(s)
const bool go_to_previous_file = index < current_writer->getStartIndex();
@ -898,13 +1100,18 @@ void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry)
else
description = std::prev(index_changelog)->second;
auto log_disk = description->disk;
auto latest_log_disk = getLatestLogDisk();
if (log_disk != latest_log_disk)
moveFileBetweenDisks(log_disk, description, latest_log_disk, description->path);
current_writer->setFile(std::move(description), WriteMode::Append);
/// Remove all subsequent files if overwritten something in previous one
auto to_remove_itr = existing_changelogs.upper_bound(index);
for (auto itr = to_remove_itr; itr != existing_changelogs.end();)
{
std::filesystem::remove(itr->second->path);
itr->second->disk->removeFile(itr->second->path);
itr = existing_changelogs.erase(itr);
}
}
@ -954,14 +1161,22 @@ void Changelog::compact(uint64_t up_to_log_index)
LOG_INFO(log, "Removing changelog {} because of compaction", changelog_description.path);
/// If failed to push to queue for background removing, then we will remove it now
if (!log_files_to_delete_queue.tryPush(changelog_description.path, 1))
if (!log_files_to_delete_queue.tryPush({changelog_description.path, changelog_description.disk}, 1))
{
std::error_code ec;
std::filesystem::remove(changelog_description.path, ec);
if (ec)
LOG_WARNING(log, "Failed to remove changelog {} in compaction, error message: {}", changelog_description.path, ec.message());
else
LOG_INFO(log, "Removed changelog {} because of compaction", changelog_description.path);
try
{
changelog_description.disk->removeFile(changelog_description.path);
LOG_INFO(log, "Removed changelog {} because of compaction.", changelog_description.path);
}
catch (Exception & e)
{
LOG_WARNING(
log, "Failed to remove changelog {} in compaction, error message: {}", changelog_description.path, e.message());
}
catch (...)
{
tryLogCurrentException(log);
}
}
changelog_description.deleted = true;
@ -1151,14 +1366,23 @@ Changelog::~Changelog()
void Changelog::cleanLogThread()
{
std::string path;
while (log_files_to_delete_queue.pop(path))
std::pair<std::string, DiskPtr> path_with_disk;
while (log_files_to_delete_queue.pop(path_with_disk))
{
std::error_code ec;
if (std::filesystem::remove(path, ec))
const auto & [path, disk] = path_with_disk;
try
{
disk->removeFile(path);
LOG_INFO(log, "Removed changelog {} because of compaction.", path);
else
LOG_WARNING(log, "Failed to remove changelog {} in compaction, error message: {}", path, ec.message());
}
catch (Exception & e)
{
LOG_WARNING(log, "Failed to remove changelog {} in compaction, error message: {}", path, e.message());
}
catch (...)
{
tryLogCurrentException(log);
}
}
}

View File

@ -11,6 +11,7 @@
#include <libnuraft/raft_server.hxx>
#include <Common/ConcurrentBoundedQueue.h>
#include <Common/ThreadPool.h>
#include <Coordination/KeeperContext.h>
namespace DB
{
@ -59,6 +60,7 @@ struct ChangelogFileDescription
uint64_t to_log_index;
std::string extension;
DiskPtr disk;
std::string path;
bool deleted = false;
@ -87,9 +89,9 @@ class Changelog
{
public:
Changelog(
const std::string & changelogs_dir_,
Poco::Logger * log_,
LogFileSettings log_file_settings);
LogFileSettings log_file_settings,
KeeperContextPtr keeper_context_);
Changelog(Changelog &&) = delete;
@ -152,6 +154,9 @@ private:
/// Pack log_entry into changelog record
static ChangelogRecord buildRecord(uint64_t index, const LogEntryPtr & log_entry);
DiskPtr getDisk() const;
DiskPtr getLatestLogDisk() const;
/// Currently existing changelogs
std::map<uint64_t, ChangelogFileDescriptionPtr> existing_changelogs;
@ -169,8 +174,7 @@ private:
/// Clean useless log files in a background thread
void cleanLogThread();
const std::filesystem::path changelogs_dir;
const std::filesystem::path changelogs_detached_dir;
const String changelogs_detached_dir;
const uint64_t rotate_interval;
Poco::Logger * log;
@ -185,7 +189,7 @@ private:
uint64_t max_log_id = 0;
/// For compaction, queue of delete not used logs
/// 128 is enough, even if log is not removed, it's not a problem
ConcurrentBoundedQueue<std::string> log_files_to_delete_queue{128};
ConcurrentBoundedQueue<std::pair<std::string, DiskPtr>> log_files_to_delete_queue{128};
ThreadFromGlobalPool clean_log_thread;
struct AppendLog
@ -223,6 +227,8 @@ private:
nuraft::wptr<nuraft::raft_server> raft_server;
KeeperContextPtr keeper_context;
bool initialized = false;
};

View File

@ -85,14 +85,6 @@ void KeeperConfigurationAndSettings::dump(WriteBufferFromOwnString & buf) const
writeText(four_letter_word_allow_list, buf);
buf.write('\n');
writeText("log_storage_path=", buf);
writeText(log_storage_path, buf);
buf.write('\n');
writeText("snapshot_storage_path=", buf);
writeText(snapshot_storage_path, buf);
buf.write('\n');
/// coordination_settings
writeText("max_requests_batch_size=", buf);
@ -188,61 +180,9 @@ KeeperConfigurationAndSettings::loadFromConfig(const Poco::Util::AbstractConfigu
DEFAULT_FOUR_LETTER_WORD_CMD));
ret->log_storage_path = getLogsPathFromConfig(config, standalone_keeper_);
ret->snapshot_storage_path = getSnapshotsPathFromConfig(config, standalone_keeper_);
ret->state_file_path = getStateFilePathFromConfig(config, standalone_keeper_);
ret->coordination_settings->loadFromConfig("keeper_server.coordination_settings", config);
return ret;
}
String KeeperConfigurationAndSettings::getLogsPathFromConfig(const Poco::Util::AbstractConfiguration & config, bool standalone_keeper_)
{
/// the most specialized path
if (config.has("keeper_server.log_storage_path"))
return config.getString("keeper_server.log_storage_path");
if (config.has("keeper_server.storage_path"))
return std::filesystem::path{config.getString("keeper_server.storage_path")} / "logs";
if (standalone_keeper_)
return std::filesystem::path{config.getString("path", KEEPER_DEFAULT_PATH)} / "logs";
else
return std::filesystem::path{config.getString("path", DBMS_DEFAULT_PATH)} / "coordination/logs";
}
String KeeperConfigurationAndSettings::getSnapshotsPathFromConfig(const Poco::Util::AbstractConfiguration & config, bool standalone_keeper_)
{
/// the most specialized path
if (config.has("keeper_server.snapshot_storage_path"))
return config.getString("keeper_server.snapshot_storage_path");
if (config.has("keeper_server.storage_path"))
return std::filesystem::path{config.getString("keeper_server.storage_path")} / "snapshots";
if (standalone_keeper_)
return std::filesystem::path{config.getString("path", KEEPER_DEFAULT_PATH)} / "snapshots";
else
return std::filesystem::path{config.getString("path", DBMS_DEFAULT_PATH)} / "coordination/snapshots";
}
String KeeperConfigurationAndSettings::getStateFilePathFromConfig(const Poco::Util::AbstractConfiguration & config, bool standalone_keeper_)
{
if (config.has("keeper_server.storage_path"))
return std::filesystem::path{config.getString("keeper_server.storage_path")} / "state";
if (config.has("keeper_server.snapshot_storage_path"))
return std::filesystem::path(config.getString("keeper_server.snapshot_storage_path")).parent_path() / "state";
if (config.has("keeper_server.log_storage_path"))
return std::filesystem::path(config.getString("keeper_server.log_storage_path")).parent_path() / "state";
if (standalone_keeper_)
return std::filesystem::path{config.getString("path", KEEPER_DEFAULT_PATH)} / "state";
else
return std::filesystem::path{config.getString("path", DBMS_DEFAULT_PATH)} / "coordination/state";
}
}

View File

@ -82,17 +82,8 @@ struct KeeperConfigurationAndSettings
bool standalone_keeper;
CoordinationSettingsPtr coordination_settings;
String log_storage_path;
String snapshot_storage_path;
String state_file_path;
void dump(WriteBufferFromOwnString & buf) const;
static std::shared_ptr<KeeperConfigurationAndSettings> loadFromConfig(const Poco::Util::AbstractConfiguration & config, bool standalone_keeper_);
private:
static String getLogsPathFromConfig(const Poco::Util::AbstractConfiguration & config, bool standalone_keeper_);
static String getSnapshotsPathFromConfig(const Poco::Util::AbstractConfiguration & config, bool standalone_keeper_);
static String getStateFilePathFromConfig(const Poco::Util::AbstractConfiguration & config, bool standalone_keeper_);
};
using KeeperConfigurationAndSettingsPtr = std::shared_ptr<KeeperConfigurationAndSettings>;

View File

@ -297,6 +297,7 @@ String ConfCommand::run()
StringBuffer buf;
keeper_dispatcher.getKeeperConfigurationAndSettings()->dump(buf);
keeper_dispatcher.getKeeperContext()->dumpConfiguration(buf);
return buf.str();
}
@ -542,7 +543,7 @@ String CleanResourcesCommand::run()
String FeatureFlagsCommand::run()
{
const auto & feature_flags = keeper_dispatcher.getKeeperContext()->feature_flags;
const auto & feature_flags = keeper_dispatcher.getKeeperContext()->getFeatureFlags();
StringBuffer ret;

View File

@ -28,8 +28,8 @@ void updateKeeperInformation(KeeperDispatcher & keeper_dispatcher, AsynchronousM
size_t zxid = 0;
size_t session_with_watches = 0;
size_t paths_watched = 0;
size_t snapshot_dir_size = 0;
size_t log_dir_size = 0;
//size_t snapshot_dir_size = 0;
//size_t log_dir_size = 0;
if (keeper_dispatcher.isServerActive())
{
@ -49,8 +49,8 @@ void updateKeeperInformation(KeeperDispatcher & keeper_dispatcher, AsynchronousM
latest_snapshot_size = state_machine.getLatestSnapshotBufSize();
session_with_watches = state_machine.getSessionsWithWatchesCount();
paths_watched = state_machine.getWatchedPathsCount();
snapshot_dir_size = keeper_dispatcher.getSnapDirSize();
log_dir_size = keeper_dispatcher.getLogDirSize();
//snapshot_dir_size = keeper_dispatcher.getSnapDirSize();
//log_dir_size = keeper_dispatcher.getLogDirSize();
# if defined(__linux__) || defined(__APPLE__)
open_file_descriptor_count = getCurrentProcessFDCount();
@ -85,8 +85,8 @@ void updateKeeperInformation(KeeperDispatcher & keeper_dispatcher, AsynchronousM
new_values["KeeperZxid"] = { zxid, "The current transaction id number (zxid) in ClickHouse Keeper." };
new_values["KeeperSessionWithWatches"] = { session_with_watches, "The number of client sessions of ClickHouse Keeper having watches." };
new_values["KeeperPathsWatched"] = { paths_watched, "The number of different paths watched by the clients of ClickHouse Keeper." };
new_values["KeeperSnapshotDirSize"] = { snapshot_dir_size, "The size of the snapshots directory of ClickHouse Keeper, in bytes." };
new_values["KeeperLogDirSize"] = { log_dir_size, "The size of the logs directory of ClickHouse Keeper, in bytes." };
//new_values["KeeperSnapshotDirSize"] = { snapshot_dir_size, "The size of the snapshots directory of ClickHouse Keeper, in bytes." };
//new_values["KeeperLogDirSize"] = { log_dir_size, "The size of the logs directory of ClickHouse Keeper, in bytes." };
auto keeper_log_info = keeper_dispatcher.getKeeperLogInfo();
@ -108,8 +108,8 @@ void updateKeeperInformation(KeeperDispatcher & keeper_dispatcher, AsynchronousM
}
KeeperAsynchronousMetrics::KeeperAsynchronousMetrics(
TinyContextPtr tiny_context_, int update_period_seconds, const ProtocolServerMetricsFunc & protocol_server_metrics_func_)
: AsynchronousMetrics(update_period_seconds, protocol_server_metrics_func_), tiny_context(std::move(tiny_context_))
ContextPtr context_, int update_period_seconds, const ProtocolServerMetricsFunc & protocol_server_metrics_func_)
: AsynchronousMetrics(update_period_seconds, protocol_server_metrics_func_), context(std::move(context_))
{
}
@ -117,7 +117,7 @@ void KeeperAsynchronousMetrics::updateImpl(AsynchronousMetricValues & new_values
{
#if USE_NURAFT
{
auto keeper_dispatcher = tiny_context->tryGetKeeperDispatcher();
auto keeper_dispatcher = context->tryGetKeeperDispatcher();
if (keeper_dispatcher)
updateKeeperInformation(*keeper_dispatcher, new_values);
}

View File

@ -1,6 +1,6 @@
#pragma once
#include <Coordination/TinyContext.h>
#include <Interpreters/Context.h>
#include <Common/AsynchronousMetrics.h>
namespace DB
@ -13,10 +13,10 @@ class KeeperAsynchronousMetrics : public AsynchronousMetrics
{
public:
KeeperAsynchronousMetrics(
TinyContextPtr tiny_context_, int update_period_seconds, const ProtocolServerMetricsFunc & protocol_server_metrics_func_);
ContextPtr context_, int update_period_seconds, const ProtocolServerMetricsFunc & protocol_server_metrics_func_);
private:
TinyContextPtr tiny_context;
ContextPtr context;
void updateImpl(AsynchronousMetricValues & new_values, TimePoint update_time, TimePoint current_time) override;
};

View File

@ -1,4 +1,9 @@
#include <Coordination/KeeperContext.h>
#include <Coordination/Defines.h>
#include <Disks/DiskLocal.h>
#include <Interpreters/Context.h>
#include <Poco/Util/AbstractConfiguration.h>
#include <Coordination/KeeperConstants.h>
#include <Common/logger_useful.h>
#include <Coordination/KeeperFeatureFlags.h>
@ -14,14 +19,15 @@ extern const int BAD_ARGUMENTS;
}
KeeperContext::KeeperContext()
KeeperContext::KeeperContext(bool standalone_keeper_)
: disk_selector(std::make_shared<DiskSelector>())
, standalone_keeper(standalone_keeper_)
{
/// enable by default some feature flags
feature_flags.enableFeatureFlag(KeeperFeatureFlag::FILTERED_LIST);
feature_flags.enableFeatureFlag(KeeperFeatureFlag::MULTI_READ);
system_nodes_with_data[keeper_api_feature_flags_path] = feature_flags.getFeatureFlags();
/// for older clients, the default is equivalent to WITH_MULTI_READ version
system_nodes_with_data[keeper_api_version_path] = toString(static_cast<uint8_t>(KeeperApiVersion::WITH_MULTI_READ));
}
@ -31,6 +37,264 @@ void KeeperContext::initialize(const Poco::Util::AbstractConfiguration & config)
digest_enabled = config.getBool("keeper_server.digest_enabled", false);
ignore_system_path_on_startup = config.getBool("keeper_server.ignore_system_path_on_startup", false);
initializeFeatureFlags(config);
initializeDisks(config);
}
void KeeperContext::initializeDisks(const Poco::Util::AbstractConfiguration & config)
{
disk_selector->initialize(config, "storage_configuration.disks", Context::getGlobalContextInstance());
log_storage = getLogsPathFromConfig(config);
if (config.has("keeper_server.latest_log_storage_disk"))
latest_log_storage = config.getString("keeper_server.latest_log_storage_disk");
else
latest_log_storage = log_storage;
const auto collect_old_disk_names = [&](const std::string_view key_prefix, std::vector<std::string> & disk_names)
{
Poco::Util::AbstractConfiguration::Keys disk_name_keys;
config.keys("keeper_server", disk_name_keys);
for (const auto & key : disk_name_keys)
{
if (key.starts_with(key_prefix))
disk_names.push_back(config.getString(fmt::format("keeper_server.{}", key)));
}
};
collect_old_disk_names("old_log_storage_disk", old_log_disk_names);
collect_old_disk_names("old_snapshot_storage_disk", old_snapshot_disk_names);
snapshot_storage = getSnapshotsPathFromConfig(config);
if (config.has("keeper_server.latest_snapshot_storage_disk"))
latest_snapshot_storage = config.getString("keeper_server.latest_snapshot_storage_disk");
else
latest_snapshot_storage = snapshot_storage;
state_file_storage = getStatePathFromConfig(config);
}
KeeperContext::Phase KeeperContext::getServerState() const
{
return server_state;
}
void KeeperContext::setServerState(KeeperContext::Phase server_state_)
{
server_state = server_state_;
}
bool KeeperContext::ignoreSystemPathOnStartup() const
{
return ignore_system_path_on_startup;
}
bool KeeperContext::digestEnabled() const
{
return digest_enabled;
}
void KeeperContext::setDigestEnabled(bool digest_enabled_)
{
digest_enabled = digest_enabled_;
}
DiskPtr KeeperContext::getDisk(const Storage & storage) const
{
if (const auto * storage_disk = std::get_if<DiskPtr>(&storage))
return *storage_disk;
const auto & disk_name = std::get<std::string>(storage);
return disk_selector->get(disk_name);
}
DiskPtr KeeperContext::getLogDisk() const
{
return getDisk(log_storage);
}
std::vector<DiskPtr> KeeperContext::getOldLogDisks() const
{
std::vector<DiskPtr> old_log_disks;
old_log_disks.reserve(old_log_disk_names.size());
for (const auto & disk_name : old_log_disk_names)
old_log_disks.push_back(disk_selector->get(disk_name));
return old_log_disks;
}
DiskPtr KeeperContext::getLatestLogDisk() const
{
return getDisk(latest_log_storage);
}
void KeeperContext::setLogDisk(DiskPtr disk)
{
log_storage = disk;
latest_log_storage = std::move(disk);
}
DiskPtr KeeperContext::getLatestSnapshotDisk() const
{
return getDisk(latest_snapshot_storage);
}
DiskPtr KeeperContext::getSnapshotDisk() const
{
return getDisk(snapshot_storage);
}
std::vector<DiskPtr> KeeperContext::getOldSnapshotDisks() const
{
std::vector<DiskPtr> old_snapshot_disks;
old_snapshot_disks.reserve(old_snapshot_disk_names.size());
for (const auto & disk_name : old_snapshot_disk_names)
old_snapshot_disks.push_back(disk_selector->get(disk_name));
return old_snapshot_disks;
}
void KeeperContext::setSnapshotDisk(DiskPtr disk)
{
snapshot_storage = std::move(disk);
latest_snapshot_storage = snapshot_storage;
}
DiskPtr KeeperContext::getStateFileDisk() const
{
return getDisk(state_file_storage);
}
void KeeperContext::setStateFileDisk(DiskPtr disk)
{
state_file_storage = std::move(disk);
}
const std::unordered_map<std::string, std::string> & KeeperContext::getSystemNodesWithData() const
{
return system_nodes_with_data;
}
const KeeperFeatureFlags & KeeperContext::getFeatureFlags() const
{
return feature_flags;
}
void KeeperContext::dumpConfiguration(WriteBufferFromOwnString & buf) const
{
auto dump_disk_info = [&](const std::string_view prefix, const IDisk & disk)
{
writeText(fmt::format("{}_path=", prefix), buf);
writeText(disk.getPath(), buf);
buf.write('\n');
writeText(fmt::format("{}_disk=", prefix), buf);
writeText(disk.getName(), buf);
buf.write('\n');
};
{
auto log_disk = getDisk(log_storage);
dump_disk_info("log_storage", *log_disk);
auto latest_log_disk = getDisk(latest_log_storage);
if (log_disk != latest_log_disk)
dump_disk_info("latest_log_storage", *latest_log_disk);
}
{
auto snapshot_disk = getDisk(snapshot_storage);
dump_disk_info("snapshot_storage", *snapshot_disk);
}
}
KeeperContext::Storage KeeperContext::getLogsPathFromConfig(const Poco::Util::AbstractConfiguration & config) const
{
const auto create_local_disk = [](const auto & path)
{
if (!fs::exists(path))
fs::create_directories(path);
return std::make_shared<DiskLocal>("LocalLogDisk", path, 0);
};
/// the most specialized path
if (config.has("keeper_server.log_storage_path"))
return create_local_disk(config.getString("keeper_server.log_storage_path"));
if (config.has("keeper_server.log_storage_disk"))
return config.getString("keeper_server.log_storage_disk");
if (config.has("keeper_server.storage_path"))
return create_local_disk(std::filesystem::path{config.getString("keeper_server.storage_path")} / "logs");
if (standalone_keeper)
return create_local_disk(std::filesystem::path{config.getString("path", KEEPER_DEFAULT_PATH)} / "logs");
else
return create_local_disk(std::filesystem::path{config.getString("path", DBMS_DEFAULT_PATH)} / "coordination/logs");
}
KeeperContext::Storage KeeperContext::getSnapshotsPathFromConfig(const Poco::Util::AbstractConfiguration & config) const
{
const auto create_local_disk = [](const auto & path)
{
if (!fs::exists(path))
fs::create_directories(path);
return std::make_shared<DiskLocal>("LocalSnapshotDisk", path, 0);
};
/// the most specialized path
if (config.has("keeper_server.snapshot_storage_path"))
return create_local_disk(config.getString("keeper_server.snapshot_storage_path"));
if (config.has("keeper_server.snapshot_storage_disk"))
return config.getString("keeper_server.snapshot_storage_disk");
if (config.has("keeper_server.storage_path"))
return create_local_disk(std::filesystem::path{config.getString("keeper_server.storage_path")} / "snapshots");
if (standalone_keeper)
return create_local_disk(std::filesystem::path{config.getString("path", KEEPER_DEFAULT_PATH)} / "snapshots");
else
return create_local_disk(std::filesystem::path{config.getString("path", DBMS_DEFAULT_PATH)} / "coordination/snapshots");
}
KeeperContext::Storage KeeperContext::getStatePathFromConfig(const Poco::Util::AbstractConfiguration & config) const
{
const auto create_local_disk = [](const auto & path)
{
if (!fs::exists(path))
fs::create_directories(path);
return std::make_shared<DiskLocal>("LocalStateFileDisk", path, 0);
};
if (config.has("keeper_server.state_storage_disk"))
return config.getString("keeper_server.state_storage_disk");
if (config.has("keeper_server.storage_path"))
return create_local_disk(std::filesystem::path{config.getString("keeper_server.storage_path")});
if (config.has("keeper_server.snapshot_storage_path"))
return create_local_disk(std::filesystem::path(config.getString("keeper_server.snapshot_storage_path")).parent_path());
if (config.has("keeper_server.log_storage_path"))
return create_local_disk(std::filesystem::path(config.getString("keeper_server.log_storage_path")).parent_path());
if (standalone_keeper)
return create_local_disk(std::filesystem::path{config.getString("path", KEEPER_DEFAULT_PATH)});
else
return create_local_disk(std::filesystem::path{config.getString("path", DBMS_DEFAULT_PATH)} / "coordination");
}
void KeeperContext::initializeFeatureFlags(const Poco::Util::AbstractConfiguration & config)
{
static const std::string feature_flags_key = "keeper_server.feature_flags";
if (config.has(feature_flags_key))
{

View File

@ -1,16 +1,21 @@
#pragma once
#include <Poco/Util/AbstractConfiguration.h>
#include <Coordination/KeeperFeatureFlags.h>
#include <IO/WriteBufferFromString.h>
#include <Disks/DiskSelector.h>
#include <cstdint>
#include <memory>
namespace DB
{
struct KeeperContext
class KeeperContext
{
KeeperContext();
void initialize(const Poco::Util::AbstractConfiguration & config);
public:
explicit KeeperContext(bool standalone_keeper_);
enum class Phase : uint8_t
{
@ -19,11 +24,64 @@ struct KeeperContext
SHUTDOWN
};
void initialize(const Poco::Util::AbstractConfiguration & config);
Phase getServerState() const;
void setServerState(Phase server_state_);
bool ignoreSystemPathOnStartup() const;
bool digestEnabled() const;
void setDigestEnabled(bool digest_enabled_);
DiskPtr getLatestLogDisk() const;
DiskPtr getLogDisk() const;
std::vector<DiskPtr> getOldLogDisks() const;
void setLogDisk(DiskPtr disk);
DiskPtr getLatestSnapshotDisk() const;
DiskPtr getSnapshotDisk() const;
std::vector<DiskPtr> getOldSnapshotDisks() const;
void setSnapshotDisk(DiskPtr disk);
DiskPtr getStateFileDisk() const;
void setStateFileDisk(DiskPtr disk);
const std::unordered_map<std::string, std::string> & getSystemNodesWithData() const;
const KeeperFeatureFlags & getFeatureFlags() const;
void dumpConfiguration(WriteBufferFromOwnString & buf) const;
private:
/// local disk defined using path or disk name
using Storage = std::variant<DiskPtr, std::string>;
void initializeFeatureFlags(const Poco::Util::AbstractConfiguration & config);
void initializeDisks(const Poco::Util::AbstractConfiguration & config);
Storage getLogsPathFromConfig(const Poco::Util::AbstractConfiguration & config) const;
Storage getSnapshotsPathFromConfig(const Poco::Util::AbstractConfiguration & config) const;
Storage getStatePathFromConfig(const Poco::Util::AbstractConfiguration & config) const;
DiskPtr getDisk(const Storage & storage) const;
Phase server_state{Phase::INIT};
bool ignore_system_path_on_startup{false};
bool digest_enabled{true};
std::shared_ptr<DiskSelector> disk_selector;
Storage log_storage;
Storage latest_log_storage;
Storage snapshot_storage;
Storage latest_snapshot_storage;
Storage state_file_storage;
std::vector<std::string> old_log_disk_names;
std::vector<std::string> old_snapshot_disk_names;
bool standalone_keeper;
std::unordered_map<std::string, std::string> system_nodes_with_data;
KeeperFeatureFlags feature_flags;

View File

@ -38,8 +38,6 @@ namespace ProfileEvents
extern const Event MemoryAllocatorPurgeTimeMicroseconds;
}
namespace fs = std::filesystem;
namespace DB
{
@ -238,13 +236,13 @@ void KeeperDispatcher::snapshotThread()
try
{
auto snapshot_path = task.create_snapshot(std::move(task.snapshot));
auto snapshot_file_info = task.create_snapshot(std::move(task.snapshot));
if (snapshot_path.empty())
if (snapshot_file_info.path.empty())
continue;
if (isLeader())
snapshot_s3.uploadSnapshot(snapshot_path);
snapshot_s3.uploadSnapshot(snapshot_file_info);
}
catch (...)
{
@ -336,7 +334,7 @@ void KeeperDispatcher::initialize(const Poco::Util::AbstractConfiguration & conf
snapshot_s3.startup(config, macros);
keeper_context = std::make_shared<KeeperContext>();
keeper_context = std::make_shared<KeeperContext>(standalone_keeper);
keeper_context->initialize(config);
server = std::make_unique<KeeperServer>(
@ -777,35 +775,37 @@ void KeeperDispatcher::updateKeeperStatLatency(uint64_t process_time_ms)
keeper_stats.updateLatency(process_time_ms);
}
static uint64_t getDirSize(const fs::path & dir)
static uint64_t getTotalSize(const DiskPtr & disk, const std::string & path = "")
{
checkStackSize();
if (!fs::exists(dir))
return 0;
fs::directory_iterator it(dir);
fs::directory_iterator end;
uint64_t size{0};
while (it != end)
uint64_t size = 0;
for (auto it = disk->iterateDirectory(path); it->isValid(); it->next())
{
if (it->is_regular_file())
size += fs::file_size(*it);
if (disk->isFile(it->path()))
size += disk->getFileSize(it->path());
else
size += getDirSize(it->path());
++it;
size += getTotalSize(disk, it->path());
}
return size;
}
uint64_t KeeperDispatcher::getLogDirSize() const
{
return getDirSize(configuration_and_settings->log_storage_path);
auto log_disk = keeper_context->getLogDisk();
auto size = getTotalSize(log_disk);
auto latest_log_disk = keeper_context->getLatestLogDisk();
if (log_disk != latest_log_disk)
size += getTotalSize(latest_log_disk);
return size;
}
uint64_t KeeperDispatcher::getSnapDirSize() const
{
return getDirSize(configuration_and_settings->snapshot_storage_path);
return getTotalSize(keeper_context->getSnapshotDisk());
}
Keeper4LWInfo KeeperDispatcher::getKeeper4LWInfo() const

View File

@ -205,7 +205,6 @@ public:
return keeper_context;
}
void incrementPacketsSent()
{
keeper_stats.incrementPacketsSent();

View File

@ -1,14 +1,14 @@
#include <Coordination/KeeperLogStore.h>
#include <IO/CompressionMethod.h>
#include <Disks/DiskLocal.h>
#include <Common/logger_useful.h>
namespace DB
{
KeeperLogStore::KeeperLogStore(
const std::string & changelogs_path, LogFileSettings log_file_settings)
KeeperLogStore::KeeperLogStore(LogFileSettings log_file_settings, KeeperContextPtr keeper_context)
: log(&Poco::Logger::get("KeeperLogStore"))
, changelog(changelogs_path, log, log_file_settings)
, changelog(log, log_file_settings, keeper_context)
{
if (log_file_settings.force_sync)
LOG_INFO(log, "force_sync enabled");

View File

@ -4,6 +4,7 @@
#include <mutex>
#include <Core/Types.h>
#include <Coordination/Changelog.h>
#include <Coordination/KeeperContext.h>
#include <base/defines.h>
namespace DB
@ -13,7 +14,7 @@ namespace DB
class KeeperLogStore : public nuraft::log_store
{
public:
KeeperLogStore(const std::string & changelogs_path, LogFileSettings log_file_settings);
KeeperLogStore(LogFileSettings log_file_settings, KeeperContextPtr keeper_context);
/// Read log storage from filesystem starting from last_commited_log_index
void init(uint64_t last_commited_log_index, uint64_t logs_to_keep);

View File

@ -26,6 +26,7 @@
#include <Common/ZooKeeper/ZooKeeperIO.h>
#include <Common/Stopwatch.h>
#include <Common/getMultipleKeysFromConfig.h>
#include <Disks/DiskLocal.h>
namespace DB
{
@ -124,7 +125,6 @@ KeeperServer::KeeperServer(
state_machine = nuraft::cs_new<KeeperStateMachine>(
responses_queue_,
snapshots_queue_,
configuration_and_settings_->snapshot_storage_path,
coordination_settings,
keeper_context,
config.getBool("keeper_server.upload_snapshot_on_exit", true) ? &snapshot_manager_s3 : nullptr,
@ -134,10 +134,10 @@ KeeperServer::KeeperServer(
state_manager = nuraft::cs_new<KeeperStateManager>(
server_id,
"keeper_server",
configuration_and_settings_->log_storage_path,
configuration_and_settings_->state_file_path,
"state",
config,
coordination_settings);
coordination_settings,
keeper_context);
}
/**
@ -413,7 +413,7 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo
launchRaftServer(config, enable_ipv6);
keeper_context->server_state = KeeperContext::Phase::RUNNING;
keeper_context->setServerState(KeeperContext::Phase::RUNNING);
}
void KeeperServer::shutdownRaftServer()
@ -428,7 +428,7 @@ void KeeperServer::shutdownRaftServer()
raft_instance->shutdown();
keeper_context->server_state = KeeperContext::Phase::SHUTDOWN;
keeper_context->setServerState(KeeperContext::Phase::SHUTDOWN);
if (create_snapshot_on_exit)
raft_instance->create_snapshot();

View File

@ -9,13 +9,15 @@
#include <IO/WriteHelpers.h>
#include <IO/copyData.h>
#include <Common/ZooKeeper/ZooKeeperIO.h>
#include <Coordination/pathUtils.h>
#include <filesystem>
#include <memory>
#include <Common/logger_useful.h>
#include <Coordination/KeeperContext.h>
#include <Coordination/pathUtils.h>
#include <Coordination/KeeperConstants.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h>
#include "Core/Field.h"
#include <Disks/DiskLocal.h>
namespace DB
@ -30,6 +32,25 @@ namespace ErrorCodes
namespace
{
constexpr std::string_view tmp_prefix = "tmp_";
void moveFileBetweenDisks(DiskPtr disk_from, const std::string & path_from, DiskPtr disk_to, const std::string & path_to)
{
/// we use empty file with prefix tmp_ to detect incomplete copies
/// if a copy is complete we don't care from which disk we use the same file
/// so it's okay if a failure happens after removing of tmp file but before we remove
/// the snapshot from the source disk
auto from_path = fs::path(path_from);
auto tmp_snapshot_name = from_path.parent_path() / (std::string{tmp_prefix} + from_path.filename().string());
{
auto buf = disk_to->writeFile(tmp_snapshot_name);
buf->finalize();
}
disk_from->copyFile(from_path, *disk_to, path_to, {});
disk_to->removeFile(tmp_snapshot_name);
disk_from->removeFile(path_from);
}
uint64_t getSnapshotPathUpToLogIdx(const String & snapshot_path)
{
std::filesystem::path path(snapshot_path);
@ -41,7 +62,7 @@ namespace
std::string getSnapshotFileName(uint64_t up_to_log_idx, bool compress_zstd)
{
auto base = std::string{"snapshot_"} + std::to_string(up_to_log_idx) + ".bin";
auto base = fmt::format("snapshot_{}.bin", up_to_log_idx);
if (compress_zstd)
base += ".zstd";
return base;
@ -156,7 +177,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
if (snapshot.version >= SnapshotVersion::V5)
{
writeBinary(snapshot.zxid, out);
if (keeper_context->digest_enabled)
if (keeper_context->digestEnabled())
{
writeBinary(static_cast<uint8_t>(KeeperStorage::CURRENT_DIGEST_VERSION), out);
writeBinary(snapshot.nodes_digest, out);
@ -185,7 +206,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
}
/// Serialize data tree
writeBinary(snapshot.snapshot_container_size - keeper_context->system_nodes_with_data.size(), out);
writeBinary(snapshot.snapshot_container_size - keeper_context->getSystemNodesWithData().size(), out);
size_t counter = 0;
for (auto it = snapshot.begin; counter < snapshot.snapshot_container_size; ++counter)
{
@ -267,7 +288,7 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
deserialization_result.snapshot_meta = deserializeSnapshotMetadata(in);
KeeperStorage & storage = *deserialization_result.storage;
bool recalculate_digest = keeper_context->digest_enabled;
bool recalculate_digest = keeper_context->digestEnabled();
if (version >= SnapshotVersion::V5)
{
readBinary(storage.zxid, in);
@ -349,7 +370,7 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
const std::string error_msg = fmt::format("Cannot read node on path {} from a snapshot because it is used as a system node", path);
if (match_result == IS_CHILD)
{
if (keeper_context->ignore_system_path_on_startup || keeper_context->server_state != KeeperContext::Phase::INIT)
if (keeper_context->ignoreSystemPathOnStartup() || keeper_context->getServerState() != KeeperContext::Phase::INIT)
{
LOG_ERROR(&Poco::Logger::get("KeeperSnapshotManager"), "{}. Ignoring it", error_msg);
continue;
@ -365,7 +386,7 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
{
if (!is_node_empty(node))
{
if (keeper_context->ignore_system_path_on_startup || keeper_context->server_state != KeeperContext::Phase::INIT)
if (keeper_context->ignoreSystemPathOnStartup() || keeper_context->getServerState() != KeeperContext::Phase::INIT)
{
LOG_ERROR(&Poco::Logger::get("KeeperSnapshotManager"), "{}. Ignoring it", error_msg);
node = KeeperStorage::Node{};
@ -394,9 +415,9 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
{
if (itr.key != "/")
{
auto parent_path = parentPath(itr.key);
auto parent_path = parentNodePath(itr.key);
storage.container.updateValue(
parent_path, [version, path = itr.key](KeeperStorage::Node & value) { value.addChild(getBaseName(path), /*update_size*/ version < SnapshotVersion::V4); });
parent_path, [version, path = itr.key](KeeperStorage::Node & value) { value.addChild(getBaseNodeName(path), /*update_size*/ version < SnapshotVersion::V4); });
}
}
@ -507,70 +528,110 @@ KeeperStorageSnapshot::~KeeperStorageSnapshot()
}
KeeperSnapshotManager::KeeperSnapshotManager(
const std::string & snapshots_path_,
size_t snapshots_to_keep_,
const KeeperContextPtr & keeper_context_,
bool compress_snapshots_zstd_,
const std::string & superdigest_,
size_t storage_tick_time_)
: snapshots_path(snapshots_path_)
, snapshots_to_keep(snapshots_to_keep_)
: snapshots_to_keep(snapshots_to_keep_)
, compress_snapshots_zstd(compress_snapshots_zstd_)
, superdigest(superdigest_)
, storage_tick_time(storage_tick_time_)
, keeper_context(keeper_context_)
{
namespace fs = std::filesystem;
if (!fs::exists(snapshots_path))
fs::create_directories(snapshots_path);
for (const auto & p : fs::directory_iterator(snapshots_path))
const auto load_snapshot_from_disk = [&](const auto & disk)
{
const auto & path = p.path();
LOG_TRACE(log, "Reading from disk {}", disk->getName());
std::unordered_map<std::string, std::string> incomplete_files;
if (!path.has_filename())
continue;
if (startsWith(path.filename(), "tmp_")) /// Unfinished tmp files
const auto clean_incomplete_file = [&](const auto & file_path)
{
std::filesystem::remove(p);
continue;
if (auto incomplete_it = incomplete_files.find(fs::path(file_path).filename()); incomplete_it != incomplete_files.end())
{
LOG_TRACE(log, "Removing {} from {}", file_path, disk->getName());
disk->removeFile(file_path);
disk->removeFile(incomplete_it->second);
incomplete_files.erase(incomplete_it);
return true;
}
return false;
};
std::vector<std::string> snapshot_files;
for (auto it = disk->iterateDirectory(""); it->isValid(); it->next())
{
if (it->name().starts_with(tmp_prefix))
{
incomplete_files.emplace(it->name().substr(tmp_prefix.size()), it->path());
continue;
}
if (it->name().starts_with("snapshot_") && !clean_incomplete_file(it->path()))
snapshot_files.push_back(it->path());
}
/// Not snapshot file
if (!startsWith(path.filename(), "snapshot_"))
for (const auto & snapshot_file : snapshot_files)
{
continue;
if (clean_incomplete_file(fs::path(snapshot_file).filename()))
continue;
LOG_TRACE(log, "Found {} on {}", snapshot_file, disk->getName());
size_t snapshot_up_to = getSnapshotPathUpToLogIdx(snapshot_file);
auto [_, inserted] = existing_snapshots.insert_or_assign(snapshot_up_to, SnapshotFileInfo{snapshot_file, disk});
if (!inserted)
LOG_WARNING(
&Poco::Logger::get("KeeperSnapshotManager"),
"Found another snapshots with last log idx {}, will use snapshot from disk {}",
snapshot_up_to,
disk->getName());
}
size_t snapshot_up_to = getSnapshotPathUpToLogIdx(p.path());
existing_snapshots[snapshot_up_to] = p.path();
}
for (const auto & [name, path] : incomplete_files)
disk->removeFile(path);
};
for (const auto & disk : keeper_context->getOldSnapshotDisks())
load_snapshot_from_disk(disk);
auto disk = getDisk();
load_snapshot_from_disk(disk);
auto latest_snapshot_disk = getLatestSnapshotDisk();
if (latest_snapshot_disk != disk)
load_snapshot_from_disk(latest_snapshot_disk);
removeOutdatedSnapshotsIfNeeded();
moveSnapshotsIfNeeded();
}
std::string KeeperSnapshotManager::serializeSnapshotBufferToDisk(nuraft::buffer & buffer, uint64_t up_to_log_idx)
SnapshotFileInfo KeeperSnapshotManager::serializeSnapshotBufferToDisk(nuraft::buffer & buffer, uint64_t up_to_log_idx)
{
ReadBufferFromNuraftBuffer reader(buffer);
auto snapshot_file_name = getSnapshotFileName(up_to_log_idx, compress_snapshots_zstd);
auto tmp_snapshot_file_name = "tmp_" + snapshot_file_name;
std::string tmp_snapshot_path = std::filesystem::path{snapshots_path} / tmp_snapshot_file_name;
std::string new_snapshot_path = std::filesystem::path{snapshots_path} / snapshot_file_name;
WriteBufferFromFile plain_buf(tmp_snapshot_path);
copyData(reader, plain_buf);
plain_buf.sync();
auto disk = getLatestSnapshotDisk();
std::filesystem::rename(tmp_snapshot_path, new_snapshot_path);
{
auto buf = disk->writeFile(tmp_snapshot_file_name);
buf->finalize();
}
existing_snapshots.emplace(up_to_log_idx, new_snapshot_path);
auto plain_buf = disk->writeFile(snapshot_file_name);
copyData(reader, *plain_buf);
plain_buf->sync();
plain_buf->finalize();
disk->removeFile(tmp_snapshot_file_name);
existing_snapshots.emplace(up_to_log_idx, SnapshotFileInfo{snapshot_file_name, disk});
removeOutdatedSnapshotsIfNeeded();
moveSnapshotsIfNeeded();
return new_snapshot_path;
return {snapshot_file_name, disk};
}
nuraft::ptr<nuraft::buffer> KeeperSnapshotManager::deserializeLatestSnapshotBufferFromDisk()
@ -584,7 +645,8 @@ nuraft::ptr<nuraft::buffer> KeeperSnapshotManager::deserializeLatestSnapshotBuff
}
catch (const DB::Exception &)
{
std::filesystem::remove(latest_itr->second);
const auto & [path, disk] = latest_itr->second;
disk->removeFile(path);
existing_snapshots.erase(latest_itr->first);
tryLogCurrentException(__PRETTY_FUNCTION__);
}
@ -595,10 +657,10 @@ nuraft::ptr<nuraft::buffer> KeeperSnapshotManager::deserializeLatestSnapshotBuff
nuraft::ptr<nuraft::buffer> KeeperSnapshotManager::deserializeSnapshotBufferFromDisk(uint64_t up_to_log_idx) const
{
const std::string & snapshot_path = existing_snapshots.at(up_to_log_idx);
const auto & [snapshot_path, snapshot_disk] = existing_snapshots.at(up_to_log_idx);
WriteBufferFromNuraftBuffer writer;
ReadBufferFromFile reader(snapshot_path);
copyData(reader, writer);
auto reader = snapshot_disk->readFile(snapshot_path);
copyData(*reader, writer);
return writer.getBuffer();
}
@ -659,30 +721,75 @@ SnapshotDeserializationResult KeeperSnapshotManager::restoreFromLatestSnapshot()
return deserializeSnapshotFromBuffer(buffer);
}
DiskPtr KeeperSnapshotManager::getDisk() const
{
return keeper_context->getSnapshotDisk();
}
DiskPtr KeeperSnapshotManager::getLatestSnapshotDisk() const
{
return keeper_context->getLatestSnapshotDisk();
}
void KeeperSnapshotManager::removeOutdatedSnapshotsIfNeeded()
{
while (existing_snapshots.size() > snapshots_to_keep)
removeSnapshot(existing_snapshots.begin()->first);
}
void KeeperSnapshotManager::moveSnapshotsIfNeeded()
{
/// move snapshots to correct disks
auto disk = getDisk();
auto latest_snapshot_disk = getLatestSnapshotDisk();
auto latest_snapshot_idx = getLatestSnapshotIndex();
for (auto & [idx, file_info] : existing_snapshots)
{
if (idx == latest_snapshot_idx)
{
if (file_info.disk != latest_snapshot_disk)
{
moveFileBetweenDisks(file_info.disk, file_info.path, latest_snapshot_disk, file_info.path);
file_info.disk = latest_snapshot_disk;
}
}
else
{
if (file_info.disk != disk)
{
moveFileBetweenDisks(file_info.disk, file_info.path, disk, file_info.path);
file_info.disk = disk;
}
}
}
}
void KeeperSnapshotManager::removeSnapshot(uint64_t log_idx)
{
auto itr = existing_snapshots.find(log_idx);
if (itr == existing_snapshots.end())
throw Exception(ErrorCodes::UNKNOWN_SNAPSHOT, "Unknown snapshot with log index {}", log_idx);
std::filesystem::remove(itr->second);
const auto & [path, disk] = itr->second;
disk->removeFile(path);
existing_snapshots.erase(itr);
}
std::pair<std::string, std::error_code> KeeperSnapshotManager::serializeSnapshotToDisk(const KeeperStorageSnapshot & snapshot)
SnapshotFileInfo KeeperSnapshotManager::serializeSnapshotToDisk(const KeeperStorageSnapshot & snapshot)
{
auto up_to_log_idx = snapshot.snapshot_meta->get_last_log_idx();
auto snapshot_file_name = getSnapshotFileName(up_to_log_idx, compress_snapshots_zstd);
auto tmp_snapshot_file_name = "tmp_" + snapshot_file_name;
std::string tmp_snapshot_path = std::filesystem::path{snapshots_path} / tmp_snapshot_file_name;
std::string new_snapshot_path = std::filesystem::path{snapshots_path} / snapshot_file_name;
auto writer = std::make_unique<WriteBufferFromFile>(tmp_snapshot_path, O_WRONLY | O_TRUNC | O_CREAT | O_CLOEXEC | O_APPEND);
auto disk = getLatestSnapshotDisk();
{
auto buf = disk->writeFile(tmp_snapshot_file_name);
buf->finalize();
}
auto writer = disk->writeFile(snapshot_file_name);
std::unique_ptr<WriteBuffer> compressed_writer;
if (compress_snapshots_zstd)
compressed_writer = wrapWriteBufferWithCompressionMethod(std::move(writer), CompressionMethod::Zstd, 3);
@ -693,14 +800,13 @@ std::pair<std::string, std::error_code> KeeperSnapshotManager::serializeSnapshot
compressed_writer->finalize();
compressed_writer->sync();
std::error_code ec;
std::filesystem::rename(tmp_snapshot_path, new_snapshot_path, ec);
if (!ec)
{
existing_snapshots.emplace(up_to_log_idx, new_snapshot_path);
removeOutdatedSnapshotsIfNeeded();
}
return {new_snapshot_path, ec};
disk->removeFile(tmp_snapshot_file_name);
existing_snapshots.emplace(up_to_log_idx, SnapshotFileInfo{snapshot_file_name, disk});
removeOutdatedSnapshotsIfNeeded();
moveSnapshotsIfNeeded();
return {snapshot_file_name, disk};
}
}

View File

@ -6,6 +6,7 @@
#include <IO/WriteBuffer.h>
#include <libnuraft/nuraft.hxx>
#include <Coordination/KeeperContext.h>
#include <Disks/IDisk.h>
namespace DB
{
@ -86,8 +87,14 @@ public:
uint64_t nodes_digest;
};
struct SnapshotFileInfo
{
std::string path;
DiskPtr disk;
};
using KeeperStorageSnapshotPtr = std::shared_ptr<KeeperStorageSnapshot>;
using CreateSnapshotCallback = std::function<std::string(KeeperStorageSnapshotPtr &&)>;
using CreateSnapshotCallback = std::function<SnapshotFileInfo(KeeperStorageSnapshotPtr &&)>;
using SnapshotMetaAndStorage = std::pair<SnapshotMetadataPtr, KeeperStoragePtr>;
@ -98,7 +105,6 @@ class KeeperSnapshotManager
{
public:
KeeperSnapshotManager(
const std::string & snapshots_path_,
size_t snapshots_to_keep_,
const KeeperContextPtr & keeper_context_,
bool compress_snapshots_zstd_ = true,
@ -112,10 +118,10 @@ public:
nuraft::ptr<nuraft::buffer> serializeSnapshotToBuffer(const KeeperStorageSnapshot & snapshot) const;
/// Serialize already compressed snapshot to disk (return path)
std::string serializeSnapshotBufferToDisk(nuraft::buffer & buffer, uint64_t up_to_log_idx);
SnapshotFileInfo serializeSnapshotBufferToDisk(nuraft::buffer & buffer, uint64_t up_to_log_idx);
/// Serialize snapshot directly to disk
std::pair<std::string, std::error_code> serializeSnapshotToDisk(const KeeperStorageSnapshot & snapshot);
SnapshotFileInfo serializeSnapshotToDisk(const KeeperStorageSnapshot & snapshot);
SnapshotDeserializationResult deserializeSnapshotFromBuffer(nuraft::ptr<nuraft::buffer> buffer) const;
@ -139,30 +145,39 @@ public:
return 0;
}
std::string getLatestSnapshotPath() const
SnapshotFileInfo getLatestSnapshotInfo() const
{
if (!existing_snapshots.empty())
{
const auto & path = existing_snapshots.at(getLatestSnapshotIndex());
std::error_code ec;
if (std::filesystem::exists(path, ec))
return path;
const auto & [path, disk] = existing_snapshots.at(getLatestSnapshotIndex());
try
{
if (disk->exists(path))
return {path, disk};
}
catch (...)
{
}
}
return "";
return {"", nullptr};
}
private:
void removeOutdatedSnapshotsIfNeeded();
void moveSnapshotsIfNeeded();
DiskPtr getDisk() const;
DiskPtr getLatestSnapshotDisk() const;
/// Checks first 4 buffer bytes to became sure that snapshot compressed with
/// ZSTD codec.
static bool isZstdCompressed(nuraft::ptr<nuraft::buffer> buffer);
const std::string snapshots_path;
/// How many snapshots to keep before remove
const size_t snapshots_to_keep;
/// All existing snapshots in our path (log_index -> path)
std::map<uint64_t, std::string> existing_snapshots;
std::map<uint64_t, SnapshotFileInfo> existing_snapshots;
/// Compress snapshots in common ZSTD format instead of custom ClickHouse block LZ4 format
const bool compress_snapshots_zstd;
/// Superdigest for deserialization of storage
@ -171,6 +186,8 @@ private:
size_t storage_tick_time;
KeeperContextPtr keeper_context;
Poco::Logger * log = &Poco::Logger::get("KeeperSnapshotManager");
};
/// Keeper create snapshots in background thread. KeeperStateMachine just create

View File

@ -132,8 +132,9 @@ std::shared_ptr<KeeperSnapshotManagerS3::S3Configuration> KeeperSnapshotManagerS
return snapshot_s3_client;
}
void KeeperSnapshotManagerS3::uploadSnapshotImpl(const std::string & snapshot_path)
void KeeperSnapshotManagerS3::uploadSnapshotImpl(const SnapshotFileInfo & snapshot_file_info)
{
const auto & [snapshot_path, snapshot_disk] = snapshot_file_info;
try
{
auto s3_client = getSnapshotS3Client();
@ -154,8 +155,9 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const std::string & snapshot_pa
};
};
LOG_INFO(log, "Will try to upload snapshot on {} to S3", snapshot_path);
ReadBufferFromFile snapshot_file(snapshot_path);
LOG_INFO(log, "Will try to upload snapshot on {} to S3", snapshot_file_info.path);
auto snapshot_file = snapshot_disk->readFile(snapshot_file_info.path);
auto snapshot_name = fs::path(snapshot_path).filename().string();
auto lock_file = fmt::format(".{}_LOCK", snapshot_name);
@ -222,7 +224,7 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const std::string & snapshot_pa
});
WriteBufferFromS3 snapshot_writer = create_writer(snapshot_name);
copyData(snapshot_file, snapshot_writer);
copyData(*snapshot_file, snapshot_writer);
snapshot_writer.finalize();
LOG_INFO(log, "Successfully uploaded {} to S3", snapshot_path);
@ -240,31 +242,31 @@ void KeeperSnapshotManagerS3::snapshotS3Thread()
while (!shutdown_called)
{
std::string snapshot_path;
if (!snapshots_s3_queue.pop(snapshot_path))
SnapshotFileInfo snapshot_file_info;
if (!snapshots_s3_queue.pop(snapshot_file_info))
break;
if (shutdown_called)
break;
uploadSnapshotImpl(snapshot_path);
uploadSnapshotImpl(snapshot_file_info);
}
}
void KeeperSnapshotManagerS3::uploadSnapshot(const std::string & path, bool async_upload)
void KeeperSnapshotManagerS3::uploadSnapshot(const SnapshotFileInfo & file_info, bool async_upload)
{
if (getSnapshotS3Client() == nullptr)
return;
if (async_upload)
{
if (!snapshots_s3_queue.push(path))
LOG_WARNING(log, "Failed to add snapshot {} to S3 queue", path);
if (!snapshots_s3_queue.push(file_info))
LOG_WARNING(log, "Failed to add snapshot {} to S3 queue", file_info.path);
return;
}
uploadSnapshotImpl(path);
uploadSnapshotImpl(file_info);
}
void KeeperSnapshotManagerS3::startup(const Poco::Util::AbstractConfiguration & config, const MultiVersion<Macros>::Version & macros)

View File

@ -6,10 +6,13 @@
#include <Common/MultiVersion.h>
#include <Common/Macros.h>
#include <Coordination/KeeperSnapshotManager.h>
#if USE_AWS_S3
#include <Common/ConcurrentBoundedQueue.h>
#include <Common/ThreadPool.h>
#include <string>
#endif
@ -24,13 +27,13 @@ public:
/// 'macros' are used to substitute macros in endpoint of disks
void updateS3Configuration(const Poco::Util::AbstractConfiguration & config, const MultiVersion<Macros>::Version & macros);
void uploadSnapshot(const std::string & path, bool async_upload = true);
void uploadSnapshot(const SnapshotFileInfo & file_info, bool async_upload = true);
/// 'macros' are used to substitute macros in endpoint of disks
void startup(const Poco::Util::AbstractConfiguration & config, const MultiVersion<Macros>::Version & macros);
void shutdown();
private:
using SnapshotS3Queue = ConcurrentBoundedQueue<std::string>;
using SnapshotS3Queue = ConcurrentBoundedQueue<SnapshotFileInfo>;
SnapshotS3Queue snapshots_s3_queue;
/// Upload new snapshots to S3
@ -48,7 +51,7 @@ private:
std::shared_ptr<S3Configuration> getSnapshotS3Client() const;
void uploadSnapshotImpl(const std::string & snapshot_path);
void uploadSnapshotImpl(const SnapshotFileInfo & snapshot_file_info);
/// Thread upload snapshots to S3 in the background
void snapshotS3Thread();
@ -60,7 +63,7 @@ public:
KeeperSnapshotManagerS3() = default;
void updateS3Configuration(const Poco::Util::AbstractConfiguration &, const MultiVersion<Macros>::Version &) {}
void uploadSnapshot(const std::string &, [[maybe_unused]] bool async_upload = true) {}
void uploadSnapshot(const SnapshotFileInfo &, [[maybe_unused]] bool async_upload = true) {}
void startup(const Poco::Util::AbstractConfiguration &, const MultiVersion<Macros>::Version &) {}

View File

@ -14,6 +14,8 @@
#include <Common/logger_useful.h>
#include "Coordination/KeeperStorage.h"
#include <Disks/DiskLocal.h>
namespace ProfileEvents
{
@ -33,17 +35,11 @@ namespace DB
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int SYSTEM_ERROR;
}
namespace
{
}
KeeperStateMachine::KeeperStateMachine(
ResponsesQueue & responses_queue_,
SnapshotsQueue & snapshots_queue_,
const std::string & snapshots_path_,
const CoordinationSettingsPtr & coordination_settings_,
const KeeperContextPtr & keeper_context_,
KeeperSnapshotManagerS3 * snapshot_manager_s3_,
@ -52,7 +48,6 @@ KeeperStateMachine::KeeperStateMachine(
: commit_callback(commit_callback_)
, coordination_settings(coordination_settings_)
, snapshot_manager(
snapshots_path_,
coordination_settings->snapshots_to_keep,
keeper_context_,
coordination_settings->compress_snapshots_with_zstd_format,
@ -69,6 +64,16 @@ KeeperStateMachine::KeeperStateMachine(
{
}
namespace
{
bool isLocalDisk(const IDisk & disk)
{
return dynamic_cast<const DiskLocal *>(&disk) != nullptr;
}
}
void KeeperStateMachine::init()
{
/// Do everything without mutexes, no other threads exist.
@ -83,9 +88,13 @@ void KeeperStateMachine::init()
try
{
auto snapshot_deserialization_result
= snapshot_manager.deserializeSnapshotFromBuffer(snapshot_manager.deserializeSnapshotBufferFromDisk(latest_log_index));
latest_snapshot_path = snapshot_manager.getLatestSnapshotPath();
latest_snapshot_buf = snapshot_manager.deserializeSnapshotBufferFromDisk(latest_log_index);
auto snapshot_deserialization_result = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_buf);
latest_snapshot_info = snapshot_manager.getLatestSnapshotInfo();
if (isLocalDisk(*latest_snapshot_info.disk))
latest_snapshot_buf = nullptr;
storage = std::move(snapshot_deserialization_result.storage);
latest_snapshot_meta = snapshot_deserialization_result.snapshot_meta;
cluster_config = snapshot_deserialization_result.cluster_config;
@ -276,7 +285,7 @@ bool KeeperStateMachine::preprocess(const KeeperStorage::RequestForSession & req
std::abort();
}
if (keeper_context->digest_enabled && request_for_session.digest)
if (keeper_context->digestEnabled() && request_for_session.digest)
assertDigest(*request_for_session.digest, storage->getNodesDigest(false), *request_for_session.request, false);
return true;
@ -333,7 +342,7 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
response_for_session.session_id);
}
if (keeper_context->digest_enabled && request_for_session->digest)
if (keeper_context->digestEnabled() && request_for_session->digest)
assertDigest(*request_for_session->digest, storage->getNodesDigest(true), *request_for_session->request, true);
}
@ -371,8 +380,13 @@ bool KeeperStateMachine::apply_snapshot(nuraft::snapshot & s)
{ /// deserialize and apply snapshot to storage
std::lock_guard lock(storage_and_responses_lock);
auto snapshot_deserialization_result
= snapshot_manager.deserializeSnapshotFromBuffer(snapshot_manager.deserializeSnapshotBufferFromDisk(s.get_last_log_idx()));
SnapshotDeserializationResult snapshot_deserialization_result;
if (latest_snapshot_ptr)
snapshot_deserialization_result = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_ptr);
else
snapshot_deserialization_result
= snapshot_manager.deserializeSnapshotFromBuffer(snapshot_manager.deserializeSnapshotBufferFromDisk(s.get_last_log_idx()));
/// maybe some logs were preprocessed with log idx larger than the snapshot idx
/// we have to apply them to the new storage
@ -464,19 +478,24 @@ void KeeperStateMachine::create_snapshot(nuraft::snapshot & s, nuraft::async_res
}
else
{
auto [path, error_code] = snapshot_manager.serializeSnapshotToDisk(*snapshot);
if (error_code)
{
throw Exception(
ErrorCodes::SYSTEM_ERROR,
"Snapshot {} was created failed, error: {}",
snapshot->snapshot_meta->get_last_log_idx(),
error_code.message());
}
latest_snapshot_path = path;
latest_snapshot_meta = snapshot->snapshot_meta;
/// we rely on the fact that the snapshot disk cannot be changed during runtime
if (isLocalDisk(*keeper_context->getLatestSnapshotDisk()))
{
auto snapshot_info = snapshot_manager.serializeSnapshotToDisk(*snapshot);
latest_snapshot_info = std::move(snapshot_info);
latest_snapshot_buf = nullptr;
}
else
{
auto snapshot_buf = snapshot_manager.serializeSnapshotToBuffer(*snapshot);
auto snapshot_info = snapshot_manager.serializeSnapshotBufferToDisk(*snapshot_buf, snapshot->snapshot_meta->get_last_log_idx());
latest_snapshot_info = std::move(snapshot_info);
latest_snapshot_buf = std::move(snapshot_buf);
}
ProfileEvents::increment(ProfileEvents::KeeperSnapshotCreations);
LOG_DEBUG(log, "Created persistent snapshot {} with path {}", latest_snapshot_meta->get_last_log_idx(), path);
LOG_DEBUG(log, "Created persistent snapshot {} with path {}", latest_snapshot_meta->get_last_log_idx(), latest_snapshot_info.path);
}
}
@ -500,19 +519,19 @@ void KeeperStateMachine::create_snapshot(nuraft::snapshot & s, nuraft::async_res
when_done(ret, exception);
return ret ? latest_snapshot_path : "";
return ret ? latest_snapshot_info : SnapshotFileInfo{};
};
if (keeper_context->server_state == KeeperContext::Phase::SHUTDOWN)
if (keeper_context->getServerState() == KeeperContext::Phase::SHUTDOWN)
{
LOG_INFO(log, "Creating a snapshot during shutdown because 'create_snapshot_on_exit' is enabled.");
auto snapshot_path = snapshot_task.create_snapshot(std::move(snapshot_task.snapshot));
auto snapshot_file_info = snapshot_task.create_snapshot(std::move(snapshot_task.snapshot));
if (!snapshot_path.empty() && snapshot_manager_s3)
if (!snapshot_file_info.path.empty() && snapshot_manager_s3)
{
LOG_INFO(log, "Uploading snapshot {} during shutdown because 'upload_snapshot_on_exit' is enabled.", snapshot_path);
snapshot_manager_s3->uploadSnapshot(snapshot_path, /* asnyc_upload */ false);
LOG_INFO(log, "Uploading snapshot {} during shutdown because 'upload_snapshot_on_exit' is enabled.", snapshot_file_info.path);
snapshot_manager_s3->uploadSnapshot(snapshot_file_info, /* asnyc_upload */ false);
}
return;
@ -533,14 +552,20 @@ void KeeperStateMachine::save_logical_snp_obj(
nuraft::ptr<nuraft::buffer> snp_buf = s.serialize();
nuraft::ptr<nuraft::snapshot> cloned_meta = nuraft::snapshot::deserialize(*snp_buf);
nuraft::ptr<nuraft::buffer> cloned_buffer;
/// we rely on the fact that the snapshot disk cannot be changed during runtime
if (!isLocalDisk(*keeper_context->getSnapshotDisk()))
cloned_buffer = nuraft::buffer::clone(data);
try
{
std::lock_guard lock(snapshots_lock);
/// Serialize snapshot to disk
auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(data, s.get_last_log_idx());
latest_snapshot_path = result_path;
latest_snapshot_info = snapshot_manager.serializeSnapshotBufferToDisk(data, s.get_last_log_idx());
latest_snapshot_meta = cloned_meta;
LOG_DEBUG(log, "Saved snapshot {} to path {}", s.get_last_log_idx(), result_path);
latest_snapshot_buf = std::move(cloned_buffer);
LOG_DEBUG(log, "Saved snapshot {} to path {}", s.get_last_log_idx(), latest_snapshot_info.path);
obj_id++;
ProfileEvents::increment(ProfileEvents::KeeperSaveSnapshot);
}
@ -600,11 +625,23 @@ int KeeperStateMachine::read_logical_snp_obj(
latest_snapshot_meta->get_last_log_idx());
return -1;
}
if (bufferFromFile(log, latest_snapshot_path, data_out))
const auto & [path, disk] = latest_snapshot_info;
if (isLocalDisk(*disk))
{
LOG_WARNING(log, "Error reading snapshot {} from {}", s.get_last_log_idx(), latest_snapshot_path);
return -1;
auto full_path = fs::path(disk->getPath()) / path;
if (bufferFromFile(log, full_path, data_out))
{
LOG_WARNING(log, "Error reading snapshot {} from {}", s.get_last_log_idx(), full_path);
return -1;
}
}
else
{
chassert(latest_snapshot_buf);
data_out = nuraft::buffer::clone(*latest_snapshot_buf);
}
is_last_obj = true;
ProfileEvents::increment(ProfileEvents::KeeperReadSnapshot);

View File

@ -26,7 +26,6 @@ public:
KeeperStateMachine(
ResponsesQueue & responses_queue_,
SnapshotsQueue & snapshots_queue_,
const std::string & snapshots_path_,
const CoordinationSettingsPtr & coordination_settings_,
const KeeperContextPtr & keeper_context_,
KeeperSnapshotManagerS3 * snapshot_manager_s3_,
@ -128,7 +127,7 @@ private:
/// In our state machine we always have a single snapshot which is stored
/// in memory in compressed (serialized) format.
SnapshotMetadataPtr latest_snapshot_meta = nullptr;
std::string latest_snapshot_path;
SnapshotFileInfo latest_snapshot_info;
nuraft::ptr<nuraft::buffer> latest_snapshot_buf = nullptr;
CoordinationSettingsPtr coordination_settings;

View File

@ -8,6 +8,7 @@
#include <IO/ReadHelpers.h>
#include <IO/ReadBufferFromFile.h>
#include <Common/getMultipleKeysFromConfig.h>
#include <Disks/DiskLocal.h>
#include <Common/logger_useful.h>
namespace DB
@ -22,6 +23,8 @@ namespace ErrorCodes
namespace
{
const std::string copy_lock_file = "STATE_COPY_LOCK";
bool isLocalhost(const std::string & hostname)
{
try
@ -212,12 +215,14 @@ KeeperStateManager::parseServersConfiguration(const Poco::Util::AbstractConfigur
return result;
}
KeeperStateManager::KeeperStateManager(
int server_id_, const std::string & host, int port, const std::string & logs_path, const std::string & state_file_path)
KeeperStateManager::KeeperStateManager(int server_id_, const std::string & host, int port, KeeperContextPtr keeper_context_)
: my_server_id(server_id_)
, secure(false)
, log_store(nuraft::cs_new<KeeperLogStore>(logs_path, LogFileSettings{.force_sync =false, .compress_logs = false, .rotate_interval = 5000}))
, server_state_path(state_file_path)
, log_store(nuraft::cs_new<KeeperLogStore>(
LogFileSettings{.force_sync = false, .compress_logs = false, .rotate_interval = 5000},
keeper_context_))
, server_state_file_name("state")
, keeper_context(keeper_context_)
, logger(&Poco::Logger::get("KeeperStateManager"))
{
auto peer_config = nuraft::cs_new<nuraft::srv_config>(my_server_id, host + ":" + std::to_string(port));
@ -230,16 +235,15 @@ KeeperStateManager::KeeperStateManager(
KeeperStateManager::KeeperStateManager(
int my_server_id_,
const std::string & config_prefix_,
const std::string & log_storage_path,
const std::string & state_file_path,
const std::string & server_state_file_name_,
const Poco::Util::AbstractConfiguration & config,
const CoordinationSettingsPtr & coordination_settings)
const CoordinationSettingsPtr & coordination_settings,
KeeperContextPtr keeper_context_)
: my_server_id(my_server_id_)
, secure(config.getBool(config_prefix_ + ".raft_configuration.secure", false))
, config_prefix(config_prefix_)
, configuration_wrapper(parseServersConfiguration(config, false))
, log_store(nuraft::cs_new<KeeperLogStore>(
log_storage_path,
LogFileSettings
{
.force_sync = coordination_settings->force_sync,
@ -247,8 +251,10 @@ KeeperStateManager::KeeperStateManager(
.rotate_interval = coordination_settings->rotate_log_storage_interval,
.max_size = coordination_settings->max_log_file_size,
.overallocate_size = coordination_settings->log_file_overallocate_size
}))
, server_state_path(state_file_path)
},
keeper_context_))
, server_state_file_name(server_state_file_name_)
, keeper_context(keeper_context_)
, logger(&Poco::Logger::get("KeeperStateManager"))
{
}
@ -287,16 +293,21 @@ void KeeperStateManager::save_config(const nuraft::cluster_config & config)
configuration_wrapper.cluster_config = nuraft::cluster_config::deserialize(*buf);
}
const std::filesystem::path & KeeperStateManager::getOldServerStatePath()
const String & KeeperStateManager::getOldServerStatePath()
{
static auto old_path = [this]
{
return server_state_path.parent_path() / (server_state_path.filename().generic_string() + "-OLD");
return server_state_file_name + "-OLD";
}();
return old_path;
}
DiskPtr KeeperStateManager::getStateFileDisk() const
{
return keeper_context->getStateFileDisk();
}
namespace
{
enum ServerStateVersion : uint8_t
@ -312,51 +323,61 @@ void KeeperStateManager::save_state(const nuraft::srv_state & state)
{
const auto & old_path = getOldServerStatePath();
if (std::filesystem::exists(server_state_path))
std::filesystem::rename(server_state_path, old_path);
auto disk = getStateFileDisk();
WriteBufferFromFile server_state_file(server_state_path, DBMS_DEFAULT_BUFFER_SIZE, O_TRUNC | O_CREAT | O_WRONLY);
if (disk->exists(server_state_file_name))
{
auto buf = disk->writeFile(copy_lock_file);
buf->finalize();
disk->copyFile(server_state_file_name, *disk, old_path);
disk->removeFile(copy_lock_file);
disk->removeFile(old_path);
}
auto server_state_file = disk->writeFile(server_state_file_name);
auto buf = state.serialize();
// calculate checksum
SipHash hash;
hash.update(current_server_state_version);
hash.update(reinterpret_cast<const char *>(buf->data_begin()), buf->size());
writeIntBinary(hash.get64(), server_state_file);
writeIntBinary(hash.get64(), *server_state_file);
writeIntBinary(static_cast<uint8_t>(current_server_state_version), server_state_file);
writeIntBinary(static_cast<uint8_t>(current_server_state_version), *server_state_file);
server_state_file.write(reinterpret_cast<const char *>(buf->data_begin()), buf->size());
server_state_file.sync();
server_state_file.close();
server_state_file->write(reinterpret_cast<const char *>(buf->data_begin()), buf->size());
server_state_file->sync();
server_state_file->finalize();
std::filesystem::remove(old_path);
disk->removeFileIfExists(old_path);
}
nuraft::ptr<nuraft::srv_state> KeeperStateManager::read_state()
{
const auto & old_path = getOldServerStatePath();
const auto try_read_file = [this](const auto & path) -> nuraft::ptr<nuraft::srv_state>
auto disk = getStateFileDisk();
const auto try_read_file = [&](const auto & path) -> nuraft::ptr<nuraft::srv_state>
{
try
{
ReadBufferFromFile read_buf(path);
auto content_size = read_buf.getFileSize();
auto read_buf = disk->readFile(path);
auto content_size = read_buf->getFileSize();
if (content_size == 0)
return nullptr;
uint64_t read_checksum{0};
readIntBinary(read_checksum, read_buf);
readIntBinary(read_checksum, *read_buf);
uint8_t version;
readIntBinary(version, read_buf);
readIntBinary(version, *read_buf);
auto buffer_size = content_size - sizeof read_checksum - sizeof version;
auto state_buf = nuraft::buffer::alloc(buffer_size);
read_buf.readStrict(reinterpret_cast<char *>(state_buf->data_begin()), buffer_size);
read_buf->readStrict(reinterpret_cast<char *>(state_buf->data_begin()), buffer_size);
SipHash hash;
hash.update(version);
@ -366,15 +387,15 @@ nuraft::ptr<nuraft::srv_state> KeeperStateManager::read_state()
{
constexpr auto error_format = "Invalid checksum while reading state from {}. Got {}, expected {}";
#ifdef NDEBUG
LOG_ERROR(logger, error_format, path.generic_string(), hash.get64(), read_checksum);
LOG_ERROR(logger, error_format, path, hash.get64(), read_checksum);
return nullptr;
#else
throw Exception(ErrorCodes::CORRUPTED_DATA, error_format, path.generic_string(), hash.get64(), read_checksum);
throw Exception(ErrorCodes::CORRUPTED_DATA, error_format, disk->getPath() + path, hash.get64(), read_checksum);
#endif
}
auto state = nuraft::srv_state::deserialize(*state_buf);
LOG_INFO(logger, "Read state from {}", path.generic_string());
LOG_INFO(logger, "Read state from {}", fs::path(disk->getPath()) / path);
return state;
}
catch (const std::exception & e)
@ -385,37 +406,45 @@ nuraft::ptr<nuraft::srv_state> KeeperStateManager::read_state()
throw;
}
LOG_ERROR(logger, "Failed to deserialize state from {}", path.generic_string());
LOG_ERROR(logger, "Failed to deserialize state from {}", disk->getPath() + path);
return nullptr;
}
};
if (std::filesystem::exists(server_state_path))
if (disk->exists(server_state_file_name))
{
auto state = try_read_file(server_state_path);
auto state = try_read_file(server_state_file_name);
if (state)
{
if (std::filesystem::exists(old_path))
std::filesystem::remove(old_path);
disk->removeFileIfExists(old_path);
return state;
}
std::filesystem::remove(server_state_path);
disk->removeFile(server_state_file_name);
}
if (std::filesystem::exists(old_path))
if (disk->exists(old_path))
{
auto state = try_read_file(old_path);
if (state)
if (disk->exists(copy_lock_file))
{
std::filesystem::rename(old_path, server_state_path);
return state;
disk->removeFile(old_path);
disk->removeFile(copy_lock_file);
}
std::filesystem::remove(old_path);
else
{
auto state = try_read_file(old_path);
if (state)
{
disk->moveFile(old_path, server_state_file_name);
return state;
}
disk->removeFile(old_path);
}
}
else if (disk->exists(copy_lock_file))
{
disk->removeFile(copy_lock_file);
}
LOG_WARNING(logger, "No state was read");

View File

@ -39,18 +39,17 @@ public:
KeeperStateManager(
int server_id_,
const std::string & config_prefix_,
const std::string & log_storage_path,
const std::string & state_file_path,
const std::string & server_state_file_name_,
const Poco::Util::AbstractConfiguration & config,
const CoordinationSettingsPtr & coordination_settings);
const CoordinationSettingsPtr & coordination_settings,
KeeperContextPtr keeper_context_);
/// Constructor for tests
KeeperStateManager(
int server_id_,
const std::string & host,
int port,
const std::string & logs_path,
const std::string & state_file_path);
KeeperContextPtr keeper_context_);
void loadLogStore(uint64_t last_commited_index, uint64_t logs_to_keep);
@ -111,7 +110,9 @@ public:
ConfigUpdateActions getConfigurationDiff(const Poco::Util::AbstractConfiguration & config) const;
private:
const std::filesystem::path & getOldServerStatePath();
const String & getOldServerStatePath();
DiskPtr getStateFileDisk() const;
/// Wrapper struct for Keeper cluster config. We parse this
/// info from XML files.
@ -136,7 +137,9 @@ private:
nuraft::ptr<KeeperLogStore> log_store;
const std::filesystem::path server_state_path;
const String server_state_file_name;
KeeperContextPtr keeper_context;
Poco::Logger * logger;

View File

@ -128,7 +128,7 @@ KeeperStorage::ResponsesForSessions processWatchesImpl(
watches.erase(watch_it);
}
auto parent_path = parentPath(path);
auto parent_path = parentNodePath(path);
Strings paths_to_check_for_list_watches;
if (event_type == Coordination::Event::CREATED)
@ -276,23 +276,23 @@ void KeeperStorage::initializeSystemNodes()
[](auto & node)
{
++node.stat.numChildren;
node.addChild(getBaseName(keeper_system_path));
node.addChild(getBaseNodeName(keeper_system_path));
}
);
addDigest(updated_root_it->value, "/");
}
// insert child system nodes
for (const auto & [path, data] : keeper_context->system_nodes_with_data)
for (const auto & [path, data] : keeper_context->getSystemNodesWithData())
{
assert(path.starts_with(keeper_system_path));
Node child_system_node;
child_system_node.setData(data);
auto [map_key, _] = container.insert(std::string{path}, child_system_node);
/// Take child path from key owned by map.
auto child_path = getBaseName(map_key->getKey());
auto child_path = getBaseNodeName(map_key->getKey());
container.updateValue(
parentPath(StringRef(path)),
parentNodePath(StringRef(path)),
[child_path](auto & parent)
{
// don't update stats so digest is okay
@ -728,7 +728,7 @@ bool KeeperStorage::createNode(
bool is_sequental,
Coordination::ACLs node_acls)
{
auto parent_path = parentPath(path);
auto parent_path = parentNodePath(path);
auto node_it = container.find(parent_path);
if (node_it == container.end())
@ -751,7 +751,7 @@ bool KeeperStorage::createNode(
created_node.is_sequental = is_sequental;
auto [map_key, _] = container.insert(path, created_node);
/// Take child path from key owned by map.
auto child_path = getBaseName(map_key->getKey());
auto child_path = getBaseNodeName(map_key->getKey());
container.updateValue(
parent_path,
[child_path](KeeperStorage::Node & parent)
@ -781,8 +781,8 @@ bool KeeperStorage::removeNode(const std::string & path, int32_t version)
acl_map.removeUsage(prev_node.acl_id);
container.updateValue(
parentPath(path),
[child_basename = getBaseName(node_it->key)](KeeperStorage::Node & parent)
parentNodePath(path),
[child_basename = getBaseNodeName(node_it->key)](KeeperStorage::Node & parent)
{
parent.removeChild(child_basename);
chassert(parent.stat.numChildren == static_cast<int32_t>(parent.getChildren().size()));
@ -866,7 +866,7 @@ Coordination::ACLs getNodeACLs(KeeperStorage & storage, StringRef path, bool is_
void handleSystemNodeModification(const KeeperContext & keeper_context, std::string_view error_msg)
{
if (keeper_context.server_state == KeeperContext::Phase::INIT && !keeper_context.ignore_system_path_on_startup)
if (keeper_context.getServerState() == KeeperContext::Phase::INIT && !keeper_context.ignoreSystemPathOnStartup())
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"{}. Ignoring it can lead to data loss. "
@ -929,7 +929,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
bool checkAuth(KeeperStorage & storage, int64_t session_id, bool is_local) const override
{
auto path = zk_request->getPath();
return storage.checkACL(parentPath(path), Coordination::ACL::Create, session_id, is_local);
return storage.checkACL(parentNodePath(path), Coordination::ACL::Create, session_id, is_local);
}
std::vector<KeeperStorage::Delta>
@ -940,7 +940,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
std::vector<KeeperStorage::Delta> new_deltas;
auto parent_path = parentPath(request.path);
auto parent_path = parentNodePath(request.path);
auto parent_node = storage.uncommitted_state.getNode(parent_path);
if (parent_node == nullptr)
return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}};
@ -971,7 +971,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
if (storage.uncommitted_state.getNode(path_created))
return {KeeperStorage::Delta{zxid, Coordination::Error::ZNODEEXISTS}};
if (getBaseName(path_created).size == 0)
if (getBaseNodeName(path_created).size == 0)
return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}};
Coordination::ACLs node_acls;
@ -1121,7 +1121,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
{
bool checkAuth(KeeperStorage & storage, int64_t session_id, bool is_local) const override
{
return storage.checkACL(parentPath(zk_request->getPath()), Coordination::ACL::Delete, session_id, is_local);
return storage.checkACL(parentNodePath(zk_request->getPath()), Coordination::ACL::Delete, session_id, is_local);
}
using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;
@ -1143,7 +1143,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
const auto update_parent_pzxid = [&]()
{
auto parent_path = parentPath(request.path);
auto parent_path = parentNodePath(request.path);
if (!storage.uncommitted_state.getNode(parent_path))
return;
@ -1178,7 +1178,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
update_parent_pzxid();
new_deltas.emplace_back(
std::string{parentPath(request.path)},
std::string{parentNodePath(request.path)},
zxid,
KeeperStorage::UpdateNodeDelta{[](KeeperStorage::Node & parent)
{
@ -1321,7 +1321,7 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce
request.version});
new_deltas.emplace_back(
parentPath(request.path).toString(),
parentNodePath(request.path).toString(),
zxid,
KeeperStorage::UpdateNodeDelta
{
@ -1481,7 +1481,7 @@ struct KeeperStorageCheckRequestProcessor final : public KeeperStorageRequestPro
bool checkAuth(KeeperStorage & storage, int64_t session_id, bool is_local) const override
{
auto path = zk_request->getPath();
return storage.checkACL(check_not_exists ? parentPath(path) : path, Coordination::ACL::Read, session_id, is_local);
return storage.checkACL(check_not_exists ? parentNodePath(path) : path, Coordination::ACL::Read, session_id, is_local);
}
std::vector<KeeperStorage::Delta>
@ -2024,7 +2024,7 @@ KeeperStorageRequestProcessorsFactory::KeeperStorageRequestProcessorsFactory()
UInt64 KeeperStorage::calculateNodesDigest(UInt64 current_digest, const std::vector<Delta> & new_deltas) const
{
if (!keeper_context->digest_enabled)
if (!keeper_context->digestEnabled())
return current_digest;
std::unordered_map<std::string_view, std::shared_ptr<Node>> updated_nodes;
@ -2122,7 +2122,7 @@ void KeeperStorage::preprocessRequest(
TransactionInfo transaction{.zxid = new_last_zxid};
uint64_t new_digest = getNodesDigest(false).value;
SCOPE_EXIT({
if (keeper_context->digest_enabled)
if (keeper_context->digestEnabled())
// if the version of digest we got from the leader is the same as the one this instances has, we can simply copy the value
// and just check the digest on the commit
// a mistake can happen while applying the changes to the uncommitted_state so for now let's just recalculate the digest here also
@ -2145,7 +2145,7 @@ void KeeperStorage::preprocessRequest(
{
new_deltas.emplace_back
(
parentPath(ephemeral_path).toString(),
parentNodePath(ephemeral_path).toString(),
new_last_zxid,
UpdateNodeDelta
{
@ -2338,7 +2338,7 @@ void KeeperStorage::rollbackRequest(int64_t rollback_zxid, bool allow_missing)
KeeperStorage::Digest KeeperStorage::getNodesDigest(bool committed) const
{
if (!keeper_context->digest_enabled)
if (!keeper_context->digestEnabled())
return {.version = DigestVersion::NO_DIGEST};
if (committed || uncommitted_transactions.empty())
@ -2349,13 +2349,13 @@ KeeperStorage::Digest KeeperStorage::getNodesDigest(bool committed) const
void KeeperStorage::removeDigest(const Node & node, const std::string_view path)
{
if (keeper_context->digest_enabled)
if (keeper_context->digestEnabled())
nodes_digest -= node.getDigest(path);
}
void KeeperStorage::addDigest(const Node & node, const std::string_view path)
{
if (keeper_context->digest_enabled)
if (keeper_context->digestEnabled())
{
node.invalidateDigestCache();
nodes_digest += node.getDigest(path);

View File

@ -0,0 +1,321 @@
#include <Interpreters/Context.h>
#include <Common/Config/ConfigProcessor.h>
#include <Common/Macros.h>
#include <Common/ThreadPool.h>
#include <Core/ServerSettings.h>
#include <boost/noncopyable.hpp>
#include <memory>
#include <cassert>
namespace ProfileEvents
{
extern const Event ContextLock;
}
namespace CurrentMetrics
{
extern const Metric ContextLockWait;
extern const Metric BackgroundSchedulePoolTask;
extern const Metric BackgroundSchedulePoolSize;
extern const Metric IOWriterThreads;
extern const Metric IOWriterThreadsActive;
}
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
struct ContextSharedPart : boost::noncopyable
{
ContextSharedPart()
: macros(std::make_unique<Macros>())
{}
/// For access of most of shared objects. Recursive mutex.
mutable std::recursive_mutex mutex;
mutable std::mutex keeper_dispatcher_mutex;
mutable std::shared_ptr<KeeperDispatcher> keeper_dispatcher TSA_GUARDED_BY(keeper_dispatcher_mutex);
ServerSettings server_settings;
String path; /// Path to the data directory, with a slash at the end.
ConfigurationPtr config; /// Global configuration settings.
MultiVersion<Macros> macros; /// Substitutions extracted from config.
mutable std::unique_ptr<BackgroundSchedulePool> schedule_pool; /// A thread pool that can run different jobs in background
RemoteHostFilter remote_host_filter; /// Allowed URL from config.xml
///
mutable std::unique_ptr<IAsynchronousReader> asynchronous_remote_fs_reader;
mutable std::unique_ptr<IAsynchronousReader> asynchronous_local_fs_reader;
mutable std::unique_ptr<IAsynchronousReader> synchronous_local_fs_reader;
mutable std::unique_ptr<ThreadPool> threadpool_writer;
mutable ThrottlerPtr remote_read_throttler; /// A server-wide throttler for remote IO reads
mutable ThrottlerPtr remote_write_throttler; /// A server-wide throttler for remote IO writes
mutable ThrottlerPtr local_read_throttler; /// A server-wide throttler for local IO reads
mutable ThrottlerPtr local_write_throttler; /// A server-wide throttler for local IO writes
};
Context::Context() = default;
Context::~Context() = default;
Context::Context(const Context &) = default;
Context & Context::operator=(const Context &) = default;
SharedContextHolder::SharedContextHolder(SharedContextHolder &&) noexcept = default;
SharedContextHolder & SharedContextHolder::operator=(SharedContextHolder &&) noexcept = default;
SharedContextHolder::SharedContextHolder() = default;
SharedContextHolder::~SharedContextHolder() = default;
SharedContextHolder::SharedContextHolder(std::unique_ptr<ContextSharedPart> shared_context)
: shared(std::move(shared_context)) {}
void SharedContextHolder::reset() { shared.reset(); }
void Context::makeGlobalContext()
{
initGlobal();
global_context = shared_from_this();
}
ContextMutablePtr Context::createGlobal(ContextSharedPart * shared)
{
auto res = std::shared_ptr<Context>(new Context);
res->shared = shared;
return res;
}
void Context::initGlobal()
{
assert(!global_context_instance);
global_context_instance = shared_from_this();
}
SharedContextHolder Context::createShared()
{
return SharedContextHolder(std::make_unique<ContextSharedPart>());
}
ContextMutablePtr Context::getGlobalContext() const
{
auto ptr = global_context.lock();
if (!ptr) throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no global context or global context has expired");
return ptr;
}
std::unique_lock<std::recursive_mutex> Context::getLock() const
{
ProfileEvents::increment(ProfileEvents::ContextLock);
CurrentMetrics::Increment increment{CurrentMetrics::ContextLockWait};
return std::unique_lock(shared->mutex);
}
String Context::getPath() const
{
auto lock = getLock();
return shared->path;
}
void Context::setPath(const String & path)
{
auto lock = getLock();
shared->path = path;
}
MultiVersion<Macros>::Version Context::getMacros() const
{
return shared->macros.get();
}
void Context::setMacros(std::unique_ptr<Macros> && macros)
{
shared->macros.set(std::move(macros));
}
BackgroundSchedulePool & Context::getSchedulePool() const
{
auto lock = getLock();
if (!shared->schedule_pool)
{
shared->schedule_pool = std::make_unique<BackgroundSchedulePool>(
shared->server_settings.background_schedule_pool_size,
CurrentMetrics::BackgroundSchedulePoolTask,
CurrentMetrics::BackgroundSchedulePoolSize,
"BgSchPool");
}
return *shared->schedule_pool;
}
void Context::setRemoteHostFilter(const Poco::Util::AbstractConfiguration & config)
{
shared->remote_host_filter.setValuesFromConfig(config);
}
const RemoteHostFilter & Context::getRemoteHostFilter() const
{
return shared->remote_host_filter;
}
IAsynchronousReader & Context::getThreadPoolReader(FilesystemReaderType type) const
{
auto lock = getLock();
switch (type)
{
case FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER:
{
if (!shared->asynchronous_remote_fs_reader)
shared->asynchronous_remote_fs_reader = createThreadPoolReader(type, getConfigRef());
return *shared->asynchronous_remote_fs_reader;
}
case FilesystemReaderType::ASYNCHRONOUS_LOCAL_FS_READER:
{
if (!shared->asynchronous_local_fs_reader)
shared->asynchronous_local_fs_reader = createThreadPoolReader(type, getConfigRef());
return *shared->asynchronous_local_fs_reader;
}
case FilesystemReaderType::SYNCHRONOUS_LOCAL_FS_READER:
{
if (!shared->synchronous_local_fs_reader)
shared->synchronous_local_fs_reader = createThreadPoolReader(type, getConfigRef());
return *shared->synchronous_local_fs_reader;
}
}
}
std::shared_ptr<FilesystemCacheLog> Context::getFilesystemCacheLog() const
{
return nullptr;
}
std::shared_ptr<FilesystemReadPrefetchesLog> Context::getFilesystemReadPrefetchesLog() const
{
return nullptr;
}
void Context::setConfig(const ConfigurationPtr & config)
{
auto lock = getLock();
shared->config = config;
}
const Poco::Util::AbstractConfiguration & Context::getConfigRef() const
{
auto lock = getLock();
return shared->config ? *shared->config : Poco::Util::Application::instance().config();
}
std::shared_ptr<AsyncReadCounters> Context::getAsyncReadCounters() const
{
auto lock = getLock();
if (!async_read_counters)
async_read_counters = std::make_shared<AsyncReadCounters>();
return async_read_counters;
}
ThreadPool & Context::getThreadPoolWriter() const
{
const auto & config = getConfigRef();
auto lock = getLock();
if (!shared->threadpool_writer)
{
auto pool_size = config.getUInt(".threadpool_writer_pool_size", 100);
auto queue_size = config.getUInt(".threadpool_writer_queue_size", 1000000);
shared->threadpool_writer = std::make_unique<ThreadPool>(
CurrentMetrics::IOWriterThreads, CurrentMetrics::IOWriterThreadsActive, pool_size, pool_size, queue_size);
}
return *shared->threadpool_writer;
}
ThrottlerPtr Context::getRemoteReadThrottler() const
{
return nullptr;
}
ThrottlerPtr Context::getRemoteWriteThrottler() const
{
return nullptr;
}
ThrottlerPtr Context::getLocalReadThrottler() const
{
return nullptr;
}
ThrottlerPtr Context::getLocalWriteThrottler() const
{
return nullptr;
}
ReadSettings Context::getReadSettings() const
{
return ReadSettings{};
}
void Context::initializeKeeperDispatcher([[maybe_unused]] bool start_async) const
{
const auto & config_ref = getConfigRef();
std::lock_guard lock(shared->keeper_dispatcher_mutex);
if (shared->keeper_dispatcher)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to initialize Keeper multiple times");
if (config_ref.has("keeper_server"))
{
shared->keeper_dispatcher = std::make_shared<KeeperDispatcher>();
shared->keeper_dispatcher->initialize(config_ref, true, start_async, getMacros());
}
}
std::shared_ptr<KeeperDispatcher> Context::getKeeperDispatcher() const
{
std::lock_guard lock(shared->keeper_dispatcher_mutex);
if (!shared->keeper_dispatcher)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Keeper must be initialized before requests");
return shared->keeper_dispatcher;
}
std::shared_ptr<KeeperDispatcher> Context::tryGetKeeperDispatcher() const
{
std::lock_guard lock(shared->keeper_dispatcher_mutex);
return shared->keeper_dispatcher;
}
void Context::shutdownKeeperDispatcher() const
{
std::lock_guard lock(shared->keeper_dispatcher_mutex);
if (shared->keeper_dispatcher)
{
shared->keeper_dispatcher->shutdown();
shared->keeper_dispatcher.reset();
}
}
void Context::updateKeeperConfiguration([[maybe_unused]] const Poco::Util::AbstractConfiguration & config_)
{
std::lock_guard lock(shared->keeper_dispatcher_mutex);
if (!shared->keeper_dispatcher)
return;
shared->keeper_dispatcher->updateConfiguration(getConfigRef(), getMacros());
}
}

View File

@ -0,0 +1,120 @@
#pragma once
#include <Interpreters/Context_fwd.h>
#include <Coordination/KeeperDispatcher.h>
#include <Common/MultiVersion.h>
#include <Common/RemoteHostFilter.h>
#include <Disks/IO/getThreadPoolReader.h>
#include <Core/Settings.h>
#include <Core/BackgroundSchedulePool.h>
#include <IO/AsyncReadCounters.h>
#include <Poco/Util/Application.h>
#include <memory>
namespace DB
{
struct ContextSharedPart;
class Macros;
class FilesystemCacheLog;
class FilesystemReadPrefetchesLog;
/// A small class which owns ContextShared.
/// We don't use something like unique_ptr directly to allow ContextShared type to be incomplete.
struct SharedContextHolder
{
~SharedContextHolder();
SharedContextHolder();
explicit SharedContextHolder(std::unique_ptr<ContextSharedPart> shared_context);
SharedContextHolder(SharedContextHolder &&) noexcept;
SharedContextHolder & operator=(SharedContextHolder &&) noexcept;
ContextSharedPart * get() const { return shared.get(); }
void reset();
private:
std::unique_ptr<ContextSharedPart> shared;
};
class Context : public std::enable_shared_from_this<Context>
{
private:
/// Use copy constructor or createGlobal() instead
Context();
Context(const Context &);
Context & operator=(const Context &);
std::unique_lock<std::recursive_mutex> getLock() const;
ContextWeakMutablePtr global_context;
inline static ContextPtr global_context_instance;
ContextSharedPart * shared;
/// Query metrics for reading data asynchronously with IAsynchronousReader.
mutable std::shared_ptr<AsyncReadCounters> async_read_counters;
Settings settings; /// Setting for query execution.
public:
/// Create initial Context with ContextShared and etc.
static ContextMutablePtr createGlobal(ContextSharedPart * shared);
static SharedContextHolder createShared();
ContextMutablePtr getGlobalContext() const;
static ContextPtr getGlobalContextInstance() { return global_context_instance; }
void makeGlobalContext();
void initGlobal();
~Context();
using ConfigurationPtr = Poco::AutoPtr<Poco::Util::AbstractConfiguration>;
/// Global application configuration settings.
void setConfig(const ConfigurationPtr & config);
const Poco::Util::AbstractConfiguration & getConfigRef() const;
const Settings & getSettingsRef() const { return settings; }
String getPath() const;
void setPath(const String & path);
MultiVersion<Macros>::Version getMacros() const;
void setMacros(std::unique_ptr<Macros> && macros);
BackgroundSchedulePool & getSchedulePool() const;
/// Storage of allowed hosts from config.xml
void setRemoteHostFilter(const Poco::Util::AbstractConfiguration & config);
const RemoteHostFilter & getRemoteHostFilter() const;
std::shared_ptr<FilesystemCacheLog> getFilesystemCacheLog() const;
std::shared_ptr<FilesystemReadPrefetchesLog> getFilesystemReadPrefetchesLog() const;
IAsynchronousReader & getThreadPoolReader(FilesystemReaderType type) const;
std::shared_ptr<AsyncReadCounters> getAsyncReadCounters() const;
ThreadPool & getThreadPoolWriter() const;
ThrottlerPtr getRemoteReadThrottler() const;
ThrottlerPtr getRemoteWriteThrottler() const;
ThrottlerPtr getLocalReadThrottler() const;
ThrottlerPtr getLocalWriteThrottler() const;
ReadSettings getReadSettings() const;
std::shared_ptr<KeeperDispatcher> getKeeperDispatcher() const;
std::shared_ptr<KeeperDispatcher> tryGetKeeperDispatcher() const;
void initializeKeeperDispatcher(bool start_async) const;
void shutdownKeeperDispatcher() const;
void updateKeeperConfiguration(const Poco::Util::AbstractConfiguration & config);
};
}

View File

@ -0,0 +1,24 @@
#include <Core/Settings.h>
namespace DB
{
IMPLEMENT_SETTINGS_TRAITS(SettingsTraits, LIST_OF_SETTINGS)
std::vector<String> Settings::getAllRegisteredNames() const
{
std::vector<String> all_settings;
for (const auto & setting_field : all())
{
all_settings.push_back(setting_field.getName());
}
return all_settings;
}
void Settings::set(std::string_view name, const Field & value)
{
BaseSettings::set(name, value);
}
}

View File

@ -0,0 +1,14 @@
#include <Common/CurrentThread.h>
namespace DB
{
void CurrentThread::detachFromGroupIfNotDetached()
{
}
void CurrentThread::attachToGroup(const ThreadGroupPtr &)
{
}
}

View File

@ -1,87 +0,0 @@
#include <Coordination/TinyContext.h>
#include <Common/Exception.h>
#include <Coordination/KeeperDispatcher.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
void TinyContext::setConfig(const ConfigurationPtr & config_)
{
std::lock_guard lock(keeper_dispatcher_mutex);
config = config_;
}
const Poco::Util::AbstractConfiguration & TinyContext::getConfigRef() const
{
std::lock_guard lock(keeper_dispatcher_mutex);
return config ? *config : Poco::Util::Application::instance().config();
}
void TinyContext::initializeKeeperDispatcher([[maybe_unused]] bool start_async) const
{
const auto & config_ref = getConfigRef();
std::lock_guard lock(keeper_dispatcher_mutex);
if (keeper_dispatcher)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to initialize Keeper multiple times");
if (config_ref.has("keeper_server"))
{
keeper_dispatcher = std::make_shared<KeeperDispatcher>();
MultiVersion<Macros>::Version macros;
if (config_ref.has("macros"))
macros = std::make_unique<Macros>(config_ref, "macros", &Poco::Logger::get("TinyContext"));
keeper_dispatcher->initialize(config_ref, true, start_async, macros);
}
}
std::shared_ptr<KeeperDispatcher> TinyContext::getKeeperDispatcher() const
{
std::lock_guard lock(keeper_dispatcher_mutex);
if (!keeper_dispatcher)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Keeper must be initialized before requests");
return keeper_dispatcher;
}
std::shared_ptr<KeeperDispatcher> TinyContext::tryGetKeeperDispatcher() const
{
std::lock_guard lock(keeper_dispatcher_mutex);
return keeper_dispatcher;
}
void TinyContext::shutdownKeeperDispatcher() const
{
std::lock_guard lock(keeper_dispatcher_mutex);
if (keeper_dispatcher)
{
keeper_dispatcher->shutdown();
keeper_dispatcher.reset();
}
}
void TinyContext::updateKeeperConfiguration([[maybe_unused]] const Poco::Util::AbstractConfiguration & config_)
{
std::lock_guard lock(keeper_dispatcher_mutex);
if (!keeper_dispatcher)
return;
MultiVersion<Macros>::Version macros;
if (config_.has("macros"))
macros = std::make_unique<Macros>(config_, "macros", &Poco::Logger::get("TinyContext"));
keeper_dispatcher->updateConfiguration(config_, macros);
}
}

View File

@ -1,36 +0,0 @@
#pragma once
#include <memory>
#include <mutex>
#include <Poco/Util/Application.h>
#include <base/defines.h>
namespace DB
{
class KeeperDispatcher;
class TinyContext : public std::enable_shared_from_this<TinyContext>
{
public:
std::shared_ptr<KeeperDispatcher> getKeeperDispatcher() const;
std::shared_ptr<KeeperDispatcher> tryGetKeeperDispatcher() const;
void initializeKeeperDispatcher(bool start_async) const;
void shutdownKeeperDispatcher() const;
void updateKeeperConfiguration(const Poco::Util::AbstractConfiguration & config);
using ConfigurationPtr = Poco::AutoPtr<Poco::Util::AbstractConfiguration>;
void setConfig(const ConfigurationPtr & config);
const Poco::Util::AbstractConfiguration & getConfigRef() const;
private:
mutable std::mutex keeper_dispatcher_mutex;
mutable std::shared_ptr<KeeperDispatcher> keeper_dispatcher TSA_GUARDED_BY(keeper_dispatcher_mutex);
ConfigurationPtr config TSA_GUARDED_BY(keeper_dispatcher_mutex);
};
using TinyContextPtr = std::shared_ptr<TinyContext>;
}

View File

@ -139,8 +139,8 @@ int64_t deserializeStorageData(KeeperStorage & storage, ReadBuffer & in, Poco::L
{
if (itr.key != "/")
{
auto parent_path = parentPath(itr.key);
storage.container.updateValue(parent_path, [my_path = itr.key] (KeeperStorage::Node & value) { value.addChild(getBaseName(my_path)); ++value.stat.numChildren; });
auto parent_path = parentNodePath(itr.key);
storage.container.updateValue(parent_path, [my_path = itr.key] (KeeperStorage::Node & value) { value.addChild(getBaseNodeName(my_path)); ++value.stat.numChildren; });
}
}

View File

@ -21,7 +21,7 @@ static size_t findLastSlash(StringRef path)
return std::string::npos;
}
StringRef parentPath(StringRef path)
StringRef parentNodePath(StringRef path)
{
auto rslash_pos = findLastSlash(path);
if (rslash_pos > 0)
@ -29,7 +29,7 @@ StringRef parentPath(StringRef path)
return "/";
}
StringRef getBaseName(StringRef path)
StringRef getBaseNodeName(StringRef path)
{
size_t basename_start = findLastSlash(path);
return StringRef{path.data + basename_start + 1, path.size - basename_start - 1};

View File

@ -6,8 +6,8 @@
namespace DB
{
StringRef parentPath(StringRef path);
StringRef parentNodePath(StringRef path);
StringRef getBaseName(StringRef path);
StringRef getBaseNodeName(StringRef path);
}

File diff suppressed because it is too large Load Diff

View File

@ -467,6 +467,7 @@ class IColumn;
M(UInt64, max_fetch_partition_retries_count, 5, "Amount of retries while fetching partition from another host.", 0) \
M(UInt64, http_max_multipart_form_data_size, 1024 * 1024 * 1024, "Limit on size of multipart/form-data content. This setting cannot be parsed from URL parameters and should be set in user profile. Note that content is parsed and external tables are created in memory before start of query execution. And this is the only limit that has effect on that stage (limits on max memory usage and max execution time have no effect while reading HTTP form data).", 0) \
M(Bool, calculate_text_stack_trace, true, "Calculate text stack trace in case of exceptions during query execution. This is the default. It requires symbol lookups that may slow down fuzzing tests when huge amount of wrong queries are executed. In normal cases you should not disable this option.", 0) \
M(Bool, enable_job_stack_trace, false, "Output stack trace of a job creator when job results in exception", 0) \
M(Bool, allow_ddl, true, "If it is set to true, then a user is allowed to executed DDL queries.", 0) \
M(Bool, parallel_view_processing, false, "Enables pushing to attached views concurrently instead of sequentially.", 0) \
M(Bool, enable_unaligned_array_join, false, "Allow ARRAY JOIN with multiple arrays that have different sizes. When this settings is enabled, arrays will be resized to the longest one.", 0) \

View File

@ -378,6 +378,40 @@ void SettingFieldMap::readBinary(ReadBuffer & in)
*this = map;
}
#else
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
SettingFieldMap::SettingFieldMap(const Field &) : value(Map()) {}
String SettingFieldMap::toString() const
{
throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Setting of type Map not supported");
}
SettingFieldMap & SettingFieldMap::operator =(const Field &)
{
throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Setting of type Map not supported");
}
void SettingFieldMap::parseFromString(const String &)
{
throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Setting of type Map not supported");
}
void SettingFieldMap::writeBinary(WriteBuffer &) const
{
throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Setting of type Map not supported");
}
void SettingFieldMap::readBinary(ReadBuffer &)
{
throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Setting of type Map not supported");
}
#endif
namespace

View File

@ -245,6 +245,12 @@ struct SettingFieldString
void readBinary(ReadBuffer & in);
};
#ifdef CLICKHOUSE_PROGRAM_STANDALONE_BUILD
#define NORETURN [[noreturn]]
#else
#define NORETURN
#endif
struct SettingFieldMap
{
public:
@ -261,13 +267,15 @@ public:
operator const Map &() const { return value; } /// NOLINT
explicit operator Field() const { return value; }
String toString() const;
void parseFromString(const String & str);
NORETURN String toString() const;
NORETURN void parseFromString(const String & str);
void writeBinary(WriteBuffer & out) const;
void readBinary(ReadBuffer & in);
NORETURN void writeBinary(WriteBuffer & out) const;
NORETURN void readBinary(ReadBuffer & in);
};
#undef NORETURN
struct SettingFieldChar
{
public:

View File

@ -19,6 +19,7 @@
#include <csignal>
#include <unistd.h>
#include <algorithm>
#include <typeinfo>
#include <iostream>
#include <fstream>
@ -153,6 +154,7 @@ static void signalHandler(int sig, siginfo_t * info, void * context)
writePODBinary(*info, out);
writePODBinary(signal_context, out);
writePODBinary(stack_trace, out);
writeVectorBinary(Exception::thread_frame_pointers, out);
writeBinary(static_cast<UInt32>(getThreadId()), out);
writePODBinary(current_thread, out);
@ -250,6 +252,7 @@ public:
siginfo_t info{};
ucontext_t * context{};
StackTrace stack_trace(NoCapture{});
std::vector<StackTrace::FramePointers> thread_frame_pointers;
UInt32 thread_num{};
ThreadStatus * thread_ptr{};
@ -260,12 +263,13 @@ public:
}
readPODBinary(stack_trace, in);
readVectorBinary(thread_frame_pointers, in);
readBinary(thread_num, in);
readPODBinary(thread_ptr, in);
/// This allows to receive more signals if failure happens inside onFault function.
/// Example: segfault while symbolizing stack trace.
std::thread([=, this] { onFault(sig, info, context, stack_trace, thread_num, thread_ptr); }).detach();
std::thread([=, this] { onFault(sig, info, context, stack_trace, thread_frame_pointers, thread_num, thread_ptr); }).detach();
}
}
}
@ -300,6 +304,7 @@ private:
const siginfo_t & info,
ucontext_t * context,
const StackTrace & stack_trace,
const std::vector<StackTrace::FramePointers> & thread_frame_pointers,
UInt32 thread_num,
ThreadStatus * thread_ptr) const
{
@ -375,6 +380,31 @@ private:
/// Write symbolized stack trace line by line for better grep-ability.
stack_trace.toStringEveryLine([&](std::string_view s) { LOG_FATAL(log, fmt::runtime(s)); });
/// In case it's a scheduled job write all previous jobs origins call stacks
std::for_each(thread_frame_pointers.rbegin(), thread_frame_pointers.rend(),
[this](const StackTrace::FramePointers & frame_pointers)
{
if (size_t size = std::ranges::find(frame_pointers, nullptr) - frame_pointers.begin())
{
LOG_FATAL(log, "========================================");
WriteBufferFromOwnString bare_stacktrace;
writeString("Job's origin stack trace:", bare_stacktrace);
std::for_each_n(frame_pointers.begin(), size,
[&bare_stacktrace](const void * ptr)
{
writeChar(' ', bare_stacktrace);
writePointerHex(ptr, bare_stacktrace);
}
);
LOG_FATAL(log, fmt::runtime(bare_stacktrace.str()));
StackTrace::toStringEveryLine(const_cast<void **>(frame_pointers.data()), 0, size, [this](std::string_view s) { LOG_FATAL(log, fmt::runtime(s)); });
}
}
);
#if defined(OS_LINUX)
/// Write information about binary checksum. It can be difficult to calculate, so do it only after printing stack trace.
/// Please keep the below log messages in-sync with the ones in programs/server/Server.cpp

View File

@ -135,6 +135,7 @@ QueryPipeline HTTPDictionarySource::loadIds(const std::vector<UInt64> & ids)
WriteBufferFromOStream out_buffer(ostr);
auto output_format = context->getOutputFormatParallelIfPossible(configuration.format, out_buffer, block.cloneEmpty());
formatBlock(output_format, block);
out_buffer.finalize();
};
Poco::URI uri(configuration.url);
@ -164,6 +165,7 @@ QueryPipeline HTTPDictionarySource::loadKeys(const Columns & key_columns, const
WriteBufferFromOStream out_buffer(ostr);
auto output_format = context->getOutputFormatParallelIfPossible(configuration.format, out_buffer, block.cloneEmpty());
formatBlock(output_format, block);
out_buffer.finalize();
};
Poco::URI uri(configuration.url);

View File

@ -157,7 +157,7 @@ CachedOnDiskReadBufferFromFile::getCacheReadBuffer(const FileSegment & file_segm
if (use_external_buffer)
local_read_settings.local_fs_buffer_size = 0;
auto buf = createReadBufferFromFileBase(path, local_read_settings);
auto buf = createReadBufferFromFileBase(path, local_read_settings, std::nullopt, std::nullopt, file_segment.getFlagsForLocalRead());
if (getFileSizeFromReadBuffer(*buf) == 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read from an empty cache file: {}", path);

View File

@ -56,6 +56,7 @@ SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(c
size_t current_read_until_position = read_until_position ? read_until_position : object.bytes_size;
auto current_read_buffer_creator = [=, this]() { return read_buffer_creator(object_path, current_read_until_position); };
#ifndef CLICKHOUSE_PROGRAM_STANDALONE_BUILD
if (with_cache)
{
auto cache_key = settings.remote_fs_cache->createKeyForPath(object_path);
@ -72,6 +73,7 @@ SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(c
read_until_position ? std::optional<size_t>(read_until_position) : std::nullopt,
cache_log);
}
#endif
return current_read_buffer_creator();
}

View File

@ -7,9 +7,7 @@
#include <Disks/IO/ThreadPoolRemoteFSReader.h>
#include <Disks/IO/ThreadPoolReader.h>
#ifndef CLICKHOUSE_PROGRAM_STANDALONE_BUILD
#include <Interpreters/Context.h>
#endif
namespace DB
{
@ -21,32 +19,10 @@ namespace ErrorCodes
IAsynchronousReader & getThreadPoolReader(FilesystemReaderType type)
{
#ifdef CLICKHOUSE_PROGRAM_STANDALONE_BUILD
const auto & config = Poco::Util::Application::instance().config();
switch (type)
{
case FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER:
{
static auto asynchronous_remote_fs_reader = createThreadPoolReader(type, config);
return *asynchronous_remote_fs_reader;
}
case FilesystemReaderType::ASYNCHRONOUS_LOCAL_FS_READER:
{
static auto asynchronous_local_fs_reader = createThreadPoolReader(type, config);
return *asynchronous_local_fs_reader;
}
case FilesystemReaderType::SYNCHRONOUS_LOCAL_FS_READER:
{
static auto synchronous_local_fs_reader = createThreadPoolReader(type, config);
return *synchronous_local_fs_reader;
}
}
#else
auto context = Context::getGlobalContextInstance();
if (!context)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Global context not initialized");
return context->getThreadPoolReader(type);
#endif
}
std::unique_ptr<IAsynchronousReader> createThreadPoolReader(

View File

@ -11,7 +11,6 @@
#include <Common/logger_useful.h>
#include <Common/filesystemHelpers.h>
#include <Common/CurrentMetrics.h>
#include <Disks/ObjectStorages/Cached/CachedObjectStorage.h>
#include <Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.h>
#include <Disks/ObjectStorages/DiskObjectStorageTransaction.h>
#include <Disks/FakeDiskTransaction.h>
@ -530,24 +529,6 @@ DiskObjectStoragePtr DiskObjectStorage::createDiskObjectStorage()
threadpool_size);
}
void DiskObjectStorage::wrapWithCache(FileCachePtr cache, const FileCacheSettings & cache_settings, const String & layer_name)
{
object_storage = std::make_shared<CachedObjectStorage>(object_storage, cache, cache_settings, layer_name);
}
NameSet DiskObjectStorage::getCacheLayersNames() const
{
NameSet cache_layers;
auto current_object_storage = object_storage;
while (current_object_storage->supportsCache())
{
auto * cached_object_storage = assert_cast<CachedObjectStorage *>(current_object_storage.get());
cache_layers.insert(cached_object_storage->getCacheConfigName());
current_object_storage = cached_object_storage->getWrappedObjectStorage();
}
return cache_layers;
}
std::unique_ptr<ReadBufferFromFileBase> DiskObjectStorage::readFile(
const String & path,
const ReadSettings & settings,

View File

@ -181,20 +181,22 @@ public:
/// MergeTree table on this disk.
bool isWriteOnce() const override;
/// Add a cache layer.
/// Example: DiskObjectStorage(S3ObjectStorage) -> DiskObjectStorage(CachedObjectStorage(S3ObjectStorage))
/// There can be any number of cache layers:
/// DiskObjectStorage(CachedObjectStorage(...CacheObjectStorage(S3ObjectStorage)...))
void wrapWithCache(FileCachePtr cache, const FileCacheSettings & cache_settings, const String & layer_name);
/// Get structure of object storage this disk works with. Examples:
/// DiskObjectStorage(S3ObjectStorage)
/// DiskObjectStorage(CachedObjectStorage(S3ObjectStorage))
/// DiskObjectStorage(CachedObjectStorage(CachedObjectStorage(S3ObjectStorage)))
String getStructure() const { return fmt::format("DiskObjectStorage-{}({})", getName(), object_storage->getName()); }
#ifndef CLICKHOUSE_PROGRAM_STANDALONE_BUILD
/// Add a cache layer.
/// Example: DiskObjectStorage(S3ObjectStorage) -> DiskObjectStorage(CachedObjectStorage(S3ObjectStorage))
/// There can be any number of cache layers:
/// DiskObjectStorage(CachedObjectStorage(...CacheObjectStorage(S3ObjectStorage)...))
void wrapWithCache(FileCachePtr cache, const FileCacheSettings & cache_settings, const String & layer_name);
/// Get names of all cache layers. Name is how cache is defined in configuration file.
NameSet getCacheLayersNames() const override;
#endif
static std::shared_ptr<Executor> getAsyncExecutor(const std::string & log_name, size_t size);

View File

@ -0,0 +1,28 @@
#include <Disks/ObjectStorages/Cached/CachedObjectStorage.h>
#include <Disks/ObjectStorages/DiskObjectStorage.h>
#include <Common/assert_cast.h>
namespace DB
{
void DiskObjectStorage::wrapWithCache(FileCachePtr cache, const FileCacheSettings & cache_settings, const String & layer_name)
{
object_storage = std::make_shared<CachedObjectStorage>(object_storage, cache, cache_settings, layer_name);
}
NameSet DiskObjectStorage::getCacheLayersNames() const
{
NameSet cache_layers;
auto current_object_storage = object_storage;
while (current_object_storage->supportsCache())
{
auto * cached_object_storage = assert_cast<CachedObjectStorage *>(current_object_storage.get());
cache_layers.insert(cached_object_storage->getCacheConfigName());
current_object_storage = cached_object_storage->getWrappedObjectStorage();
}
return cache_layers;
}
}

View File

@ -63,7 +63,7 @@ public:
uint32_t getHardlinkCount(const std::string & /* path */) const override
{
return 1;
return 0;
}
bool supportsChmod() const override { return false; }

View File

@ -26,12 +26,25 @@ namespace
{
std::string getOrCreateDiskFromDiskAST(const ASTFunction & function, ContextPtr context)
{
/// We need a unique name for a created custom disk, but it needs to be the same
/// after table is reattached or server is restarted, so take a hash of the disk
/// configuration serialized ast as a disk name suffix.
auto disk_setting_string = serializeAST(function, true);
auto disk_name = DiskSelector::TMP_INTERNAL_DISK_PREFIX
+ toString(sipHash128(disk_setting_string.data(), disk_setting_string.size()));
std::string disk_name;
if (function.name == "disk")
{
/// We need a unique name for a created custom disk, but it needs to be the same
/// after table is reattached or server is restarted, so take a hash of the disk
/// configuration serialized ast as a disk name suffix.
auto disk_setting_string = serializeAST(function, true);
disk_name = DiskSelector::TMP_INTERNAL_DISK_PREFIX
+ toString(sipHash128(disk_setting_string.data(), disk_setting_string.size()));
}
else
{
static constexpr std::string_view custom_disk_prefix = "disk_";
if (function.name.size() <= custom_disk_prefix.size() || !function.name.starts_with(custom_disk_prefix))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid disk name: {}", function.name);
disk_name = function.name.substr(custom_disk_prefix.size());
}
auto result_disk = context->getOrCreateDisk(disk_name, [&](const DisksMap & disks_map) -> DiskPtr {
const auto * function_args_expr = assert_cast<const ASTExpressionList *>(function.arguments.get());
@ -43,6 +56,9 @@ namespace
return disk;
});
if (!result_disk->isCustomDisk())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Disk with name `{}` already exist", disk_name);
if (!result_disk->isRemote())
{
static constexpr auto custom_disks_base_dir_in_config = "custom_local_disks_base_directory";

View File

@ -32,6 +32,8 @@ void registerDiskCache(DiskFactory & factory, bool global_skip_access_check);
void registerDiskLocalObjectStorage(DiskFactory & factory, bool global_skip_access_check);
#ifndef CLICKHOUSE_PROGRAM_STANDALONE_BUILD
void registerDisks(bool global_skip_access_check)
{
auto & factory = DiskFactory::instance();
@ -61,4 +63,19 @@ void registerDisks(bool global_skip_access_check)
registerDiskLocalObjectStorage(factory, global_skip_access_check);
}
#else
void registerDisks(bool global_skip_access_check)
{
auto & factory = DiskFactory::instance();
registerDiskLocal(factory, global_skip_access_check);
#if USE_AWS_S3
registerDiskS3(factory, global_skip_access_check);
#endif
}
#endif
}

View File

@ -72,6 +72,13 @@ public:
return res;
}
void remove(const std::string & path, int flags)
{
Key key(path, flags);
std::lock_guard lock(mutex);
files.erase(key);
}
static OpenedFileCache & instance()
{
static OpenedFileCache res;
@ -82,5 +89,3 @@ public:
using OpenedFileCachePtr = std::shared_ptr<OpenedFileCache>;
}

View File

@ -12,6 +12,7 @@
#include <type_traits>
#include <Common/StackTrace.h>
#include <Common/formatIPv6.h>
#include <Common/DateLUT.h>
#include <Common/LocalDate.h>
@ -1107,6 +1108,8 @@ inline void readBinary(Decimal128 & x, ReadBuffer & buf) { readPODBinary(x, buf)
inline void readBinary(Decimal256 & x, ReadBuffer & buf) { readPODBinary(x.value, buf); }
inline void readBinary(LocalDate & x, ReadBuffer & buf) { readPODBinary(x, buf); }
inline void readBinary(StackTrace::FramePointers & x, ReadBuffer & buf) { readPODBinary(x, buf); }
template <std::endian endian, typename T>
inline void readBinaryEndian(T & x, ReadBuffer & buf)
{

View File

@ -19,14 +19,7 @@ void WriteBufferFromOStream::nextImpl()
ostr->flush();
if (!ostr->good())
{
/// FIXME do not call finalize in dtors (and remove iostreams)
bool avoid_throwing_exceptions = std::uncaught_exceptions();
if (avoid_throwing_exceptions)
LOG_ERROR(&Poco::Logger::get("WriteBufferFromOStream"), "Cannot write to ostream at offset {}. Stack trace: {}", count(), StackTrace().toString());
else
throw Exception(ErrorCodes::CANNOT_WRITE_TO_OSTREAM, "Cannot write to ostream at offset {}", count());
}
throw Exception(ErrorCodes::CANNOT_WRITE_TO_OSTREAM, "Cannot write to ostream at offset {}", count());
}
WriteBufferFromOStream::WriteBufferFromOStream(
@ -46,9 +39,4 @@ WriteBufferFromOStream::WriteBufferFromOStream(
{
}
WriteBufferFromOStream::~WriteBufferFromOStream()
{
finalize();
}
}

View File

@ -18,8 +18,6 @@ public:
char * existing_memory = nullptr,
size_t alignment = 0);
~WriteBufferFromOStream() override;
protected:
explicit WriteBufferFromOStream(size_t size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0);

View File

@ -10,7 +10,8 @@
#include <pcg-random/pcg_random.hpp>
#include "Common/formatIPv6.h"
#include <Common/StackTrace.h>
#include <Common/formatIPv6.h>
#include <Common/DateLUT.h>
#include <Common/LocalDate.h>
#include <Common/LocalDateTime.h>
@ -876,6 +877,8 @@ inline void writeBinary(const UUID & x, WriteBuffer & buf) { writePODBinary(x, b
inline void writeBinary(const IPv4 & x, WriteBuffer & buf) { writePODBinary(x, buf); }
inline void writeBinary(const IPv6 & x, WriteBuffer & buf) { writePODBinary(x, buf); }
inline void writeBinary(const StackTrace::FramePointers & x, WriteBuffer & buf) { writePODBinary(x, buf); }
/// Methods for outputting the value in text form for a tab-separated format.
inline void writeText(is_integer auto x, WriteBuffer & buf)

View File

@ -1,5 +1,6 @@
#include <IO/ZstdDeflatingAppendableWriteBuffer.h>
#include <Common/Exception.h>
#include "IO/ReadBufferFromFileBase.h"
#include <IO/ReadBufferFromFile.h>
namespace DB
@ -11,14 +12,16 @@ namespace ErrorCodes
}
ZstdDeflatingAppendableWriteBuffer::ZstdDeflatingAppendableWriteBuffer(
std::unique_ptr<WriteBufferFromFile> out_,
std::unique_ptr<WriteBufferFromFileBase> out_,
int compression_level,
bool append_to_existing_file_,
std::function<std::unique_ptr<ReadBufferFromFileBase>()> read_buffer_creator_,
size_t buf_size,
char * existing_memory,
size_t alignment)
: BufferWithOwnMemory(buf_size, existing_memory, alignment)
, out(std::move(out_))
, read_buffer_creator(std::move(read_buffer_creator_))
, append_to_existing_file(append_to_existing_file_)
{
cctx = ZSTD_createCCtx();
@ -194,13 +197,13 @@ void ZstdDeflatingAppendableWriteBuffer::addEmptyBlock()
bool ZstdDeflatingAppendableWriteBuffer::isNeedToAddEmptyBlock()
{
ReadBufferFromFile reader(out->getFileName());
auto fsize = reader.getFileSize();
auto reader = read_buffer_creator();
auto fsize = reader->getFileSize();
if (fsize > 3)
{
std::array<char, 3> result;
reader.seek(fsize - 3, SEEK_SET);
reader.readStrict(result.data(), 3);
reader->seek(fsize - 3, SEEK_SET);
reader->readStrict(result.data(), 3);
/// If we don't have correct block in the end, then we need to add it manually.
/// NOTE: maybe we can have the same bytes in case of data corruption/unfinished write.

View File

@ -5,6 +5,7 @@
#include <IO/WriteBuffer.h>
#include <IO/WriteBufferDecorator.h>
#include <IO/WriteBufferFromFile.h>
#include <IO/ReadBufferFromFileBase.h>
#include <zstd.h>
@ -29,9 +30,10 @@ public:
static inline constexpr ZSTDLastBlock ZSTD_CORRECT_TERMINATION_LAST_BLOCK = {0x01, 0x00, 0x00};
ZstdDeflatingAppendableWriteBuffer(
std::unique_ptr<WriteBufferFromFile> out_,
std::unique_ptr<WriteBufferFromFileBase> out_,
int compression_level,
bool append_to_existing_file_,
std::function<std::unique_ptr<ReadBufferFromFileBase>()> read_buffer_creator_,
size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
char * existing_memory = nullptr,
size_t alignment = 0);
@ -68,7 +70,8 @@ private:
/// Adding zstd empty block (ZSTD_CORRECT_TERMINATION_LAST_BLOCK) to out.working_buffer
void addEmptyBlock();
std::unique_ptr<WriteBufferFromFile> out;
std::unique_ptr<WriteBufferFromFileBase> out;
std::function<std::unique_ptr<ReadBufferFromFileBase>()> read_buffer_creator;
bool append_to_existing_file = false;
ZSTD_CCtx * cctx;

View File

@ -757,12 +757,14 @@ bool FileCache::tryReserve(FileSegment & file_segment, const size_t size)
chassert(candidate->releasable());
const auto * segment = candidate->file_segment.get();
auto queue_it = segment->getQueueIterator();
chassert(queue_it);
ProfileEvents::increment(ProfileEvents::FilesystemCacheEvictedFileSegments);
ProfileEvents::increment(ProfileEvents::FilesystemCacheEvictedBytes, segment->range().size());
locked_key->removeFileSegment(segment->offset(), segment->lock());
segment->getQueueIterator()->remove(cache_lock);
queue_it->remove(cache_lock);
if (query_context)
query_context->remove(current_key, segment->offset(), cache_lock);

View File

@ -648,8 +648,6 @@ void FileSegment::complete()
if (segment_kind == FileSegmentKind::Temporary && is_last_holder)
{
LOG_TEST(log, "Removing temporary file segment: {}", getInfoForLogUnlocked(segment_lock));
detach(segment_lock, *locked_key);
setDownloadState(State::DETACHED, segment_lock);
locked_key->removeFileSegment(offset(), segment_lock);
return;
}
@ -798,7 +796,6 @@ bool FileSegment::assertCorrectnessUnlocked(const FileSegmentGuard::Lock &) cons
}
chassert(reserved_size >= downloaded_size);
chassert((reserved_size == 0) || queue_iterator);
check_iterator(queue_iterator);
}
@ -872,6 +869,7 @@ void FileSegment::setDetachedState(const FileSegmentGuard::Lock & lock)
setDownloadState(State::DETACHED, lock);
key_metadata.reset();
cache = nullptr;
queue_iterator = nullptr;
}
void FileSegment::detach(const FileSegmentGuard::Lock & lock, const LockedKey &)
@ -890,7 +888,7 @@ void FileSegment::use()
if (!cache)
{
chassert(isCompleted(true));
chassert(isDetached());
return;
}

View File

@ -156,6 +156,8 @@ public:
String getPathInLocalCache() const;
int getFlagsForLocalRead() const { return O_RDONLY | O_CLOEXEC; }
/**
* ========== Methods for _any_ file segment's owner ========================
*/

View File

@ -44,7 +44,7 @@ public:
virtual size_t use(const CacheGuard::Lock &) = 0;
virtual std::shared_ptr<IIterator> remove(const CacheGuard::Lock &) = 0;
virtual void remove(const CacheGuard::Lock &) = 0;
virtual const Entry & getEntry() const = 0;

View File

@ -166,15 +166,17 @@ void LRUFileCachePriority::iterate(IterateFunc && func, const CacheGuard::Lock &
}
}
LRUFileCachePriority::Iterator
LRUFileCachePriority::LRUFileCacheIterator::remove(const CacheGuard::Lock &)
void LRUFileCachePriority::LRUFileCacheIterator::remove(const CacheGuard::Lock &)
{
return std::make_shared<LRUFileCacheIterator>(
cache_priority, cache_priority->remove(queue_iter));
checkUsable();
cache_priority->remove(queue_iter);
queue_iter = LRUQueueIterator{};
}
void LRUFileCachePriority::LRUFileCacheIterator::invalidate()
{
checkUsable();
LOG_TEST(
cache_priority->log,
"Invalidating entry in LRU queue. Key: {}, offset: {}, previous size: {}",
@ -187,6 +189,8 @@ void LRUFileCachePriority::LRUFileCacheIterator::invalidate()
void LRUFileCachePriority::LRUFileCacheIterator::updateSize(int64_t size)
{
checkUsable();
LOG_TEST(
cache_priority->log,
"Update size with {} in LRU queue for key: {}, offset: {}, previous size: {}",
@ -198,8 +202,15 @@ void LRUFileCachePriority::LRUFileCacheIterator::updateSize(int64_t size)
size_t LRUFileCachePriority::LRUFileCacheIterator::use(const CacheGuard::Lock &)
{
checkUsable();
cache_priority->queue.splice(cache_priority->queue.end(), cache_priority->queue, queue_iter);
return ++queue_iter->hits;
}
void LRUFileCachePriority::LRUFileCacheIterator::checkUsable() const
{
if (queue_iter == LRUQueueIterator{})
throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to use invalid iterator");
}
}

View File

@ -60,13 +60,15 @@ public:
size_t use(const CacheGuard::Lock &) override;
Iterator remove(const CacheGuard::Lock &) override;
void remove(const CacheGuard::Lock &) override;
void invalidate() override;
void updateSize(int64_t size) override;
private:
void checkUsable() const;
LRUFileCachePriority * cache_priority;
mutable LRUFileCachePriority::LRUQueueIterator queue_iter;
};

View File

@ -147,7 +147,6 @@ String CacheMetadata::getFileNameForFileSegment(size_t offset, FileSegmentKind s
file_suffix = "_temporary";
break;
case FileSegmentKind::Regular:
file_suffix = "";
break;
}
return std::to_string(offset) + file_suffix;
@ -398,17 +397,26 @@ KeyMetadata::iterator LockedKey::removeFileSegment(size_t offset, const FileSegm
if (file_segment->queue_iterator)
file_segment->queue_iterator->invalidate();
file_segment->detach(segment_lock, *this);
const auto path = key_metadata->getFileSegmentPath(*file_segment);
bool exists = fs::exists(path);
if (exists)
{
fs::remove(path);
/// Clear OpenedFileCache to avoid reading from incorrect file descriptor.
int flags = file_segment->getFlagsForLocalRead();
/// Files are created with flags from file_segment->getFlagsForLocalRead()
/// plus optionally O_DIRECT is added, depends on query setting, so remove both.
OpenedFileCache::instance().remove(path, flags);
OpenedFileCache::instance().remove(path, flags | O_DIRECT);
LOG_TEST(key_metadata->log, "Removed file segment at path: {}", path);
}
else if (file_segment->downloaded_size)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected path {} to exist", path);
file_segment->detach(segment_lock, *this);
return key_metadata->erase(it);
}

View File

@ -1,5 +1,7 @@
#pragma once
#ifndef CLICKHOUSE_PROGRAM_STANDALONE_BUILD
#include <base/types.h>
#include <Common/isLocalAddress.h>
#include <Common/MultiVersion.h>
@ -1240,3 +1242,9 @@ struct HTTPContext : public IHTTPContext
};
}
#else
#include <Coordination/Standalone/Context.h>
#endif

View File

@ -158,6 +158,17 @@ void CurrentThread::attachQueryForLog(const String & query_)
current_thread->attachQueryForLog(query_);
}
void ThreadStatus::applyGlobalSettings()
{
auto global_context_ptr = global_context.lock();
if (!global_context_ptr)
return;
const Settings & settings = global_context_ptr->getSettingsRef();
DB::Exception::enable_job_stack_trace = settings.enable_job_stack_trace;
}
void ThreadStatus::applyQuerySettings()
{
auto query_context_ptr = query_context.lock();
@ -166,6 +177,8 @@ void ThreadStatus::applyQuerySettings()
const Settings & settings = query_context_ptr->getSettingsRef();
DB::Exception::enable_job_stack_trace = settings.enable_job_stack_trace;
query_id_from_query_context = query_context_ptr->getCurrentQueryId();
initQueryProfiler();
@ -204,6 +217,7 @@ void ThreadStatus::attachToGroupImpl(const ThreadGroupPtr & thread_group_)
local_data = thread_group->getSharedData();
applyGlobalSettings();
applyQuerySettings();
initPerformanceCounters();
}

View File

@ -40,5 +40,5 @@ TEST(CreateTableParser, SS)
ASTPtr ast = parseQuery(p_create_query, input.data(), input.data() + input.size(), "", 0, 0);
WriteBufferFromOStream buf(std::cerr, 4096);
ast->dumpTree(buf);
buf.finalize();
}

View File

@ -215,7 +215,7 @@ bool ParserSetQuery::parseNameValuePair(SettingChange & change, IParser::Pos & p
else if (ParserKeyword("FALSE").ignore(pos, expected))
value = std::make_shared<ASTLiteral>(Field(static_cast<UInt64>(0)));
/// for SETTINGS disk=disk(type='s3', path='', ...)
else if (function_p.parse(pos, function_ast, expected) && function_ast->as<ASTFunction>()->name == "disk")
else if (function_p.parse(pos, function_ast, expected) && function_ast->as<ASTFunction>()->name.starts_with("disk"))
{
tryGetIdentifierNameInto(name, change.name);
change.value = createFieldFromAST(function_ast);
@ -280,7 +280,7 @@ bool ParserSetQuery::parseNameValuePairWithParameterOrDefault(
node = std::make_shared<ASTLiteral>(Field(static_cast<UInt64>(1)));
else if (ParserKeyword("FALSE").ignore(pos, expected))
node = std::make_shared<ASTLiteral>(Field(static_cast<UInt64>(0)));
else if (function_p.parse(pos, function_ast, expected) && function_ast->as<ASTFunction>()->name == "disk")
else if (function_p.parse(pos, function_ast, expected) && function_ast->as<ASTFunction>()->name.starts_with("disk"))
{
change.name = name;
change.value = createFieldFromAST(function_ast);

View File

@ -10,7 +10,7 @@ bool isDiskFunction(ASTPtr ast)
return false;
const auto * function = ast->as<ASTFunction>();
return function && function->name == "disk" && function->arguments->as<ASTExpressionList>();
return function && function->name.starts_with("disk") && function->arguments->as<ASTExpressionList>();
}
}

View File

@ -319,6 +319,8 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B
{
auto column_with_default = col.column->cloneEmpty();
col.type->insertDefaultInto(*column_with_default);
column_with_default->finalize();
auto column = ColumnConst::create(std::move(column_with_default), 0);
const auto * node = &dag->addColumn({ColumnPtr(std::move(column)), col.type, col.name});
node = &dag->materializeNode(*node);

View File

@ -14,6 +14,9 @@ namespace QueryPlanOptimizations
void optimizeTreeFirstPass(const QueryPlanOptimizationSettings & settings, QueryPlan::Node & root, QueryPlan::Nodes & nodes);
/// Second pass is used to apply read-in-order and attach a predicate to PK.
void optimizeTreeSecondPass(const QueryPlanOptimizationSettings & optimization_settings, QueryPlan::Node & root, QueryPlan::Nodes & nodes);
/// Third pass is used to apply filters such as key conditions and skip indexes to the storages that support them.
/// After that it add CreateSetsStep for the subqueries that has not be used in the filters.
void optimizeTreeThirdPass(QueryPlan::Node & root, QueryPlan::Nodes & nodes);
/// Optimization (first pass) is a function applied to QueryPlan::Node.
/// It can read and update subtree of specified node.

Some files were not shown because too many files have changed in this diff Show More