diff --git a/base/glibc-compatibility/musl/aarch64/syscall.s b/base/glibc-compatibility/musl/aarch64/syscall.s
index 845986bf787..aadaea04ef5 100644
--- a/base/glibc-compatibility/musl/aarch64/syscall.s
+++ b/base/glibc-compatibility/musl/aarch64/syscall.s
@@ -2,6 +2,7 @@
.hidden __syscall
.type __syscall,%function
__syscall:
+.cfi_startproc
uxtw x8,w0
mov x0,x1
mov x1,x2
@@ -12,3 +13,4 @@ __syscall:
mov x6,x7
svc 0
ret
+.cfi_endproc
diff --git a/contrib/cctz b/contrib/cctz
index 8529bcef5cd..7918cb7afe8 160000
--- a/contrib/cctz
+++ b/contrib/cctz
@@ -1 +1 @@
-Subproject commit 8529bcef5cd996b7c0f4d7475286b76b5d126c4c
+Subproject commit 7918cb7afe82e53428e39a045a437fdfd4f3df47
diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh
index b4376fe2409..ccf450c94f2 100755
--- a/docker/test/fuzzer/run-fuzzer.sh
+++ b/docker/test/fuzzer/run-fuzzer.sh
@@ -386,7 +386,8 @@ if [ -f core.zst ]; then
CORE_LINK='core.zst'
fi
-rg --text -F '<Fatal>' server.log > fatal.log ||:
+# Keep all the lines in the paragraphs containing <Fatal> that either contain <Fatal> or don't start with 20... (year)
+sed -n '/<Fatal>/,/^$/p' server.log | awk '/<Fatal>/ || !/^20/' > fatal.log ||:
FATAL_LINK=''
if [ -s fatal.log ]; then
FATAL_LINK='fatal.log'
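For illustration, here is the same filtering logic as the `sed | awk` pipeline above written out in C++ (a sketch, not part of the patch): print from each line containing `<Fatal>` up to the next blank line, dropping timestamped continuation lines.

```cpp
#include <iostream>
#include <string>

int main()
{
    std::string line;
    bool in_block = false;
    while (std::getline(std::cin, line))
    {
        bool fatal = line.find("<Fatal>") != std::string::npos;
        if (!in_block && fatal)
            in_block = true;       // sed: range starts at /<Fatal>/
        if (!in_block)
            continue;
        if (line.empty())
        {
            in_block = false;      // sed: range ends at /^$/
            continue;
        }
        if (fatal || line.rfind("20", 0) != 0)  // awk: /<Fatal>/ || !/^20/
            std::cout << line << '\n';
    }
}
```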
diff --git a/docs/changelogs/v23.3.20.27-lts.md b/docs/changelogs/v23.3.20.27-lts.md
new file mode 100644
index 00000000000..9f49e47f0bc
--- /dev/null
+++ b/docs/changelogs/v23.3.20.27-lts.md
@@ -0,0 +1,29 @@
+---
+sidebar_position: 1
+sidebar_label: 2024
+---
+
+# 2024 Changelog
+
+### ClickHouse release v23.3.20.27-lts (cc974ba4f81) FIXME as compared to v23.3.19.32-lts (c4d4ca8ec02)
+
+#### Improvement
+* Backported in [#58818](https://github.com/ClickHouse/ClickHouse/issues/58818): Add `SYSTEM JEMALLOC PURGE` for purging unused jemalloc pages, `SYSTEM JEMALLOC [ ENABLE | DISABLE | FLUSH ] PROFILE` for controlling the jemalloc profile if the profiler is enabled. Add jemalloc-related 4LW commands in Keeper: `jmst` for dumping jemalloc stats, `jmfp`, `jmep`, `jmdp` for controlling the jemalloc profile if the profiler is enabled. [#58665](https://github.com/ClickHouse/ClickHouse/pull/58665) ([Antonio Andelic](https://github.com/antonio2368)).
+
+#### Build/Testing/Packaging Improvement
+* Backported in [#59877](https://github.com/ClickHouse/ClickHouse/issues/59877): If you want to run initdb scripts every time the ClickHouse container starts, you should initialize the environment variable CLICKHOUSE_ALWAYS_RUN_INITDB_SCRIPTS. [#59808](https://github.com/ClickHouse/ClickHouse/pull/59808) ([Alexander Nikolaev](https://github.com/AlexNik)).
+
+#### Bug Fix (user-visible misbehavior in an official stable release)
+
+* Fix working with read buffers in StreamingFormatExecutor [#57438](https://github.com/ClickHouse/ClickHouse/pull/57438) ([Kruglov Pavel](https://github.com/Avogar)).
+* Fix double destroy call on exception throw in addBatchLookupTable8 [#58745](https://github.com/ClickHouse/ClickHouse/pull/58745) ([Raúl Marín](https://github.com/Algunenano)).
+* Fix: LIMIT BY and LIMIT in distributed query [#59153](https://github.com/ClickHouse/ClickHouse/pull/59153) ([Igor Nikonov](https://github.com/devcrafter)).
+* Fix translate() with FixedString input [#59356](https://github.com/ClickHouse/ClickHouse/pull/59356) ([Raúl Marín](https://github.com/Algunenano)).
+* Fix leftPad / rightPad function with FixedString input [#59739](https://github.com/ClickHouse/ClickHouse/pull/59739) ([Raúl Marín](https://github.com/Algunenano)).
+* Fix cosineDistance crash with Nullable [#60150](https://github.com/ClickHouse/ClickHouse/pull/60150) ([Raúl Marín](https://github.com/Algunenano)).
+
+#### NOT FOR CHANGELOG / INSIGNIFICANT
+
+* Fix possible race in ManyAggregatedData dtor. [#58624](https://github.com/ClickHouse/ClickHouse/pull/58624) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
+* Make ZooKeeper actually sequentially consistent [#59735](https://github.com/ClickHouse/ClickHouse/pull/59735) ([Alexander Tokmakov](https://github.com/tavplubix)).
+
diff --git a/docs/changelogs/v23.8.10.43-lts.md b/docs/changelogs/v23.8.10.43-lts.md
new file mode 100644
index 00000000000..0093467d129
--- /dev/null
+++ b/docs/changelogs/v23.8.10.43-lts.md
@@ -0,0 +1,39 @@
+---
+sidebar_position: 1
+sidebar_label: 2024
+---
+
+# 2024 Changelog
+
+### ClickHouse release v23.8.10.43-lts (a278225bba9) FIXME as compared to v23.8.9.54-lts (192a1d231fa)
+
+#### Improvement
+* Backported in [#58819](https://github.com/ClickHouse/ClickHouse/issues/58819): Add `SYSTEM JEMALLOC PURGE` for purging unused jemalloc pages, `SYSTEM JEMALLOC [ ENABLE | DISABLE | FLUSH ] PROFILE` for controlling the jemalloc profile if the profiler is enabled. Add jemalloc-related 4LW commands in Keeper: `jmst` for dumping jemalloc stats, `jmfp`, `jmep`, `jmdp` for controlling the jemalloc profile if the profiler is enabled. [#58665](https://github.com/ClickHouse/ClickHouse/pull/58665) ([Antonio Andelic](https://github.com/antonio2368)).
+* Backported in [#60286](https://github.com/ClickHouse/ClickHouse/issues/60286): Copy S3 file GCP fallback to buffer copy in case GCP returned `Internal Error` with `GATEWAY_TIMEOUT` HTTP error code. [#60164](https://github.com/ClickHouse/ClickHouse/pull/60164) ([Maksim Kita](https://github.com/kitaisreal)).
+
+#### Build/Testing/Packaging Improvement
+* Backported in [#59879](https://github.com/ClickHouse/ClickHouse/issues/59879): If you want to run initdb scripts every time the ClickHouse container starts, you should initialize the environment variable CLICKHOUSE_ALWAYS_RUN_INITDB_SCRIPTS. [#59808](https://github.com/ClickHouse/ClickHouse/pull/59808) ([Alexander Nikolaev](https://github.com/AlexNik)).
+
+#### Bug Fix (user-visible misbehavior in an official stable release)
+
+* Background merges correctly use temporary data storage in the cache [#57275](https://github.com/ClickHouse/ClickHouse/pull/57275) ([vdimir](https://github.com/vdimir)).
+* MergeTree mutations reuse source part index granularity [#57352](https://github.com/ClickHouse/ClickHouse/pull/57352) ([Maksim Kita](https://github.com/kitaisreal)).
+* Fix double destroy call on exception throw in addBatchLookupTable8 [#58745](https://github.com/ClickHouse/ClickHouse/pull/58745) ([Raúl Marín](https://github.com/Algunenano)).
+* Fix JSONExtract function for LowCardinality(Nullable) columns [#58808](https://github.com/ClickHouse/ClickHouse/pull/58808) ([vdimir](https://github.com/vdimir)).
+* Fix: LIMIT BY and LIMIT in distributed query [#59153](https://github.com/ClickHouse/ClickHouse/pull/59153) ([Igor Nikonov](https://github.com/devcrafter)).
+* Fix translate() with FixedString input [#59356](https://github.com/ClickHouse/ClickHouse/pull/59356) ([Raúl Marín](https://github.com/Algunenano)).
+* Fix error "Read beyond last offset" for AsynchronousBoundedReadBuffer [#59630](https://github.com/ClickHouse/ClickHouse/pull/59630) ([Vitaly Baranov](https://github.com/vitlibar)).
+* Fix query start time on non initial queries [#59662](https://github.com/ClickHouse/ClickHouse/pull/59662) ([Raúl Marín](https://github.com/Algunenano)).
+* Fix leftPad / rightPad function with FixedString input [#59739](https://github.com/ClickHouse/ClickHouse/pull/59739) ([Raúl Marín](https://github.com/Algunenano)).
+* rabbitmq: fix having neither acked nor nacked messages [#59775](https://github.com/ClickHouse/ClickHouse/pull/59775) ([Kseniia Sumarokova](https://github.com/kssenii)).
+* Fix cosineDistance crash with Nullable [#60150](https://github.com/ClickHouse/ClickHouse/pull/60150) ([Raúl Marín](https://github.com/Algunenano)).
+
+#### NOT FOR CHANGELOG / INSIGNIFICANT
+
+* Fix rare race in external sort/aggregation with temporary data in cache [#58013](https://github.com/ClickHouse/ClickHouse/pull/58013) ([Anton Popov](https://github.com/CurtizJ)).
+* Fix possible race in ManyAggregatedData dtor. [#58624](https://github.com/ClickHouse/ClickHouse/pull/58624) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
+* Fix 02720_row_policy_column_with_dots [#59453](https://github.com/ClickHouse/ClickHouse/pull/59453) ([Duc Canh Le](https://github.com/canhld94)).
+* Pin python dependencies in stateless tests [#59663](https://github.com/ClickHouse/ClickHouse/pull/59663) ([Raúl Marín](https://github.com/Algunenano)).
+* Make ZooKeeper actually sequentially consistent [#59735](https://github.com/ClickHouse/ClickHouse/pull/59735) ([Alexander Tokmakov](https://github.com/tavplubix)).
+* Remove broken test while we fix it [#60547](https://github.com/ClickHouse/ClickHouse/pull/60547) ([Raúl Marín](https://github.com/Algunenano)).
+
diff --git a/docs/en/development/build.md b/docs/en/development/build.md
index b474c445604..5cbf851b785 100644
--- a/docs/en/development/build.md
+++ b/docs/en/development/build.md
@@ -14,20 +14,6 @@ Supported platforms:
- PowerPC 64 LE (experimental)
- RISC-V 64 (experimental)
-## Building in docker
-We use the docker image `clickhouse/binary-builder` for our CI builds. It contains everything necessary to build the binary and packages. There is a script `docker/packager/packager` to ease the image usage:
-
-```bash
-# define a directory for the output artifacts
-output_dir="build_results"
-# a simplest build
-./docker/packager/packager --package-type=binary --output-dir "$output_dir"
-# build debian packages
-./docker/packager/packager --package-type=deb --output-dir "$output_dir"
-# by default, debian packages use thin LTO, so we can override it to speed up the build
-CMAKE_FLAGS='-DENABLE_THINLTO=' ./docker/packager/packager --package-type=deb --output-dir "./$(git rev-parse --show-cdup)/build_results"
-```
-
## Building on Ubuntu
The following tutorial is based on Ubuntu Linux.
@@ -37,6 +23,7 @@ The minimum recommended Ubuntu version for development is 22.04 LTS.
### Install Prerequisites {#install-prerequisites}
``` bash
+sudo apt-get update
sudo apt-get install git cmake ccache python3 ninja-build nasm yasm gawk lsb-release wget software-properties-common gnupg
```
@@ -57,7 +44,7 @@ sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
For other Linux distributions - check the availability of LLVM's [prebuild packages](https://releases.llvm.org/download.html).
-As of August 2023, clang-16 or higher will work.
+As of March 2024, clang-17 or higher will work.
GCC as a compiler is not supported.
To build with a specific Clang version:
@@ -67,8 +54,8 @@ to see what version you have installed before setting this environment variable.
:::
``` bash
-export CC=clang-17
-export CXX=clang++-17
+export CC=clang-18
+export CXX=clang++-18
```
### Checkout ClickHouse Sources {#checkout-clickhouse-sources}
@@ -133,3 +120,17 @@ mkdir build
cmake -S . -B build
cmake --build build
```
+
+## Building in docker
+We use the docker image `clickhouse/binary-builder` for our CI builds. It contains everything necessary to build the binary and packages. There is a script `docker/packager/packager` to ease the image usage:
+
+```bash
+# define a directory for the output artifacts
+output_dir="build_results"
+# a simplest build
+./docker/packager/packager --package-type=binary --output-dir "$output_dir"
+# build debian packages
+./docker/packager/packager --package-type=deb --output-dir "$output_dir"
+# by default, debian packages use thin LTO, so we can override it to speed up the build
+CMAKE_FLAGS='-DENABLE_THINLTO=' ./docker/packager/packager --package-type=deb --output-dir "./$(git rev-parse --show-cdup)/build_results"
+```
diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md
index 285737312bd..a76bb01ce9e 100644
--- a/docs/en/interfaces/formats.md
+++ b/docs/en/interfaces/formats.md
@@ -7,6 +7,7 @@ title: Formats for Input and Output Data
ClickHouse can accept and return data in various formats. A format supported for input can be used to parse the data provided to `INSERT`s, to perform `SELECT`s from a file-backed table such as File, URL or HDFS, or to read a dictionary. A format supported for output can be used to arrange the
results of a `SELECT`, and to perform `INSERT`s into a file-backed table.
+All format names are case insensitive.
The supported formats are:
diff --git a/docs/en/interfaces/schema-inference.md b/docs/en/interfaces/schema-inference.md
index 39ae69eaef4..05fae994cbe 100644
--- a/docs/en/interfaces/schema-inference.md
+++ b/docs/en/interfaces/schema-inference.md
@@ -549,6 +549,48 @@ Result:
└───────┴─────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
+##### input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects
+
+Enabling this setting allows using the String type for ambiguous paths during named tuple inference from JSON objects (when `input_format_json_try_infer_named_tuples_from_objects` is enabled) instead of throwing an exception.
+This makes it possible to read JSON objects as named Tuples even when there are ambiguous paths.
+
+Disabled by default.
+
+**Examples**
+
+With disabled setting:
+```sql
+SET input_format_json_try_infer_named_tuples_from_objects = 1;
+SET input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = 0;
+DESC format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : "Hello"}}}');
+```
+Result:
+
+```text
+Code: 636. DB::Exception: The table structure cannot be extracted from a JSONEachRow format file. Error:
+Code: 117. DB::Exception: JSON objects have ambiguous data: in some objects path 'a' has type 'Int64' and in some - 'Tuple(b String)'. You can enable setting input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type for path 'a'. (INCORRECT_DATA) (version 24.3.1.1).
+You can specify the structure manually. (CANNOT_EXTRACT_TABLE_STRUCTURE)
+```
+
+With enabled setting:
+```sql
+SET input_format_json_try_infer_named_tuples_from_objects = 1;
+SET input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = 1;
+DESC format(JSONEachRow, '{"obj" : "a" : 42}, {"obj" : {"a" : {"b" : "Hello"}}}');
+SELECT * FROM format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : "Hello"}}}');
+```
+
+Result:
+```text
+┌─name─┬─type──────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
+│ obj │ Tuple(a Nullable(String)) │ │ │ │ │ │
+└──────┴───────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
+┌─obj─────────────────┐
+│ ('42') │
+│ ('{"b" : "Hello"}') │
+└─────────────────────┘
+```
+
##### input_format_json_read_objects_as_strings
Enabling this setting allows reading nested JSON objects as strings.
@@ -1554,6 +1596,28 @@ DESC format(JSONEachRow, $$
└──────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
+#### input_format_try_infer_exponent_floats
+
+If enabled, ClickHouse will try to infer floats in exponential form for text formats (except JSON, where numbers in exponential form are always inferred).
+
+Disabled by default.
+
+**Example**
+
+```sql
+SET input_format_try_infer_exponent_floats = 1;
+DESC format(CSV,
+$$1.1E10
+2.3e-12
+42E00
+$$)
+```
+```response
+┌─name─┬─type──────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
+│ c1 │ Nullable(Float64) │ │ │ │ │ │
+└──────┴───────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
+```
+
## Self describing formats {#self-describing-formats}
Self-describing formats contain information about the structure of the data in the data itself,
diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md
index 003277c8d4f..84251812c01 100644
--- a/docs/en/operations/storing-data.md
+++ b/docs/en/operations/storing-data.md
@@ -275,6 +275,16 @@ Cache profile events:
- `CachedWriteBufferCacheWriteBytes`, `CachedWriteBufferCacheWriteMicroseconds`
+## Using in-memory cache (userspace page cache) {#userspace-page-cache}
+
+The File Cache described above stores cached data in local files. Alternatively, object-store-based disks can be configured to use the "Userspace Page Cache", which is RAM-only. The userspace page cache is recommended only if the file cache can't be used for some reason, e.g. if the machine doesn't have a local disk at all. Note that the file cache effectively uses RAM for caching too, since the OS caches the contents of local files.
+
+To enable the userspace page cache for disks that don't use the file cache, use the setting `use_page_cache_for_disks_without_file_cache`.
+
+By default, on Linux, the userspace page cache will use all available memory, similar to the OS page cache. In tools like `top` and `ps`, the clickhouse server process will typically show a resident set size near 100% of the machine's RAM - this is normal, and most of this memory is actually reclaimable by the OS under memory pressure (`MADV_FREE`). This behavior can be disabled with the server setting `page_cache_use_madv_free = 0`, making the userspace page cache use a fixed amount of memory `page_cache_size` with no special interaction with the OS. On Mac OS, `page_cache_use_madv_free` is always disabled as it doesn't have lazy `MADV_FREE`.
+
+Unfortunately, `page_cache_use_madv_free` makes it difficult to tell if the server is close to running out of memory, since the RSS metric becomes useless. The async metric `UnreclaimableRSS` shows the amount of physical memory used by the server, excluding the memory reclaimable by the OS: `select value from system.asynchronous_metrics where metric = 'UnreclaimableRSS'`. Use it for monitoring instead of RSS. This metric is only available if `page_cache_use_madv_free` is enabled.
+
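As an aside on the `MADV_FREE` mechanics this section relies on, here is a minimal standalone Linux illustration (not ClickHouse code) of why freed-but-not-yet-reclaimed memory still shows up in RSS:

```cpp
#include <sys/mman.h>
#include <cstring>
#include <cstdio>

int main()
{
    size_t size = 1ul << 30; // 1 GiB
    char * p = static_cast<char *>(
        mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0));
    if (p == MAP_FAILED)
        return 1;

    memset(p, 1, size);          // touch the pages: RSS grows by ~1 GiB
    madvise(p, size, MADV_FREE); // RSS stays ~1 GiB, but the kernel may now
                                 // reclaim these pages at any time under
                                 // memory pressure, zeroing them lazily
    getchar();                   // observe RSS in `top`, and the LazyFree
                                 // counter in /proc/self/smaps
    munmap(p, size);
}
```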
## Storing Data on Web Server {#storing-data-on-webserver}
There is a tool `clickhouse-static-files-uploader`, which prepares a data directory for a given table (`SELECT data_paths FROM system.tables WHERE name = 'table_name'`). For each table you need, you get a directory of files. These files can be uploaded to, for example, a web server with static files. After this preparation, you can load this table into any ClickHouse server via `DiskWeb`.
diff --git a/docs/zh/getting-started/example-datasets/opensky.mdx b/docs/zh/getting-started/example-datasets/opensky.mdx
index 92cd104e06e..b79c02ab780 100644
--- a/docs/zh/getting-started/example-datasets/opensky.mdx
+++ b/docs/zh/getting-started/example-datasets/opensky.mdx
@@ -1,4 +1,4 @@
----
+---
slug: /zh/getting-started/example-datasets/opensky
sidebar_label: 空中交通数据
description: 该数据集中的数据是从完整的 OpenSky 数据集中衍生而来的,对其中的数据进行了必要的清理,用以展示在 COVID-19 期间空中交通的发展。
@@ -53,12 +53,12 @@ CREATE TABLE opensky
ls -1 flightlist_*.csv.gz | xargs -P100 -I{} bash -c 'gzip -c -d "{}" | clickhouse-client --date_time_input_format best_effort --query "INSERT INTO opensky FORMAT CSVWithNames"'
```
-- 这里我们将文件列表(`ls -1 flightlist_*.csv.gz`)传递给`xargs`以进行并行处理。 `xargs -P100` 指定最多使用 100 个并行工作程序,但由于我们只有 30 个文件,工作程序的数量将只有 30 个。
-- 对于每个文件,`xargs` 将通过 `bash -c` 为每个文件运行一个脚本文件。该脚本通过使用 `{}` 表示文件名占位符,然后 `xargs` 由命令进行填充(使用 `-I{}`)。
-- 该脚本会将文件 (`gzip -c -d "{}"`) 解压缩到标准输出(`-c` 参数),并将输出重定向到 `clickhouse-client`。
-- 我们还要求使用扩展解析器解析 [DateTime](../../sql-reference/data-types/datetime.md) 字段 ([--date_time_input_format best_effort](../../operations/settings/ settings.md#settings-date_time_input_format)) 以识别具有时区偏移的 ISO-8601 格式。
+- 这里我们将文件列表(`ls -1 flightlist_*.csv.gz`)传递给`xargs`以进行并行处理。 `xargs -P100` 指定最多使用 100 个并行工作程序,但由于我们只有 30 个文件,工作程序的数量将只有 30 个。
+- 对于每个文件,`xargs` 将通过 `bash -c` 为每个文件运行一个脚本文件。该脚本通过使用 `{}` 表示文件名占位符,然后 `xargs` 由命令进行填充(使用 `-I{}`)。
+- 该脚本会将文件 (`gzip -c -d "{}"`) 解压缩到标准输出(`-c` 参数),并将输出重定向到 `clickhouse-client`。
+- 我们还要求使用扩展解析器解析 [DateTime](/docs/zh/sql-reference/data-types/datetime.md) 字段 ([--date_time_input_format best_effort](/docs/zh/operations/settings/settings.md#settings-date_time_input_format)) 以识别具有时区偏移的 ISO-8601 格式。
-最后,`clickhouse-client` 会以 [CSVWithNames](../../interfaces/formats.md#csvwithnames) 格式读取输入数据然后执行插入。
+最后,`clickhouse-client` 会以 [CSVWithNames](/docs/zh/interfaces/formats.md#csvwithnames) 格式读取输入数据然后执行插入。
并行导入需要 24 秒。
diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp
index a10f47be0b8..c45291ba52c 100644
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@@ -1228,6 +1228,13 @@ try
}
global_context->setMarkCache(mark_cache_policy, mark_cache_size, mark_cache_size_ratio);
+ size_t page_cache_size = server_settings.page_cache_size;
+ if (page_cache_size != 0)
+ global_context->setPageCache(
+ server_settings.page_cache_chunk_size, server_settings.page_cache_mmap_size,
+ page_cache_size, server_settings.page_cache_use_madv_free,
+ server_settings.page_cache_use_transparent_huge_pages);
+
String index_uncompressed_cache_policy = server_settings.index_uncompressed_cache_policy;
size_t index_uncompressed_cache_size = server_settings.index_uncompressed_cache_size;
double index_uncompressed_cache_size_ratio = server_settings.index_uncompressed_cache_size_ratio;
@@ -1874,7 +1881,6 @@ try
{
total_memory_tracker.setSampleMaxAllocationSize(server_settings.total_memory_profiler_sample_max_allocation_size);
}
-
}
#endif
@@ -1889,10 +1895,6 @@ try
" when two different stack unwinding methods will interfere with each other.");
#endif
-#if !defined(__x86_64__)
- LOG_INFO(log, "Query Profiler and TraceCollector is only tested on x86_64. It also known to not work under qemu-user.");
-#endif
-
if (!hasPHDRCache())
LOG_INFO(log, "Query Profiler and TraceCollector are disabled because they require PHDR cache to be created"
" (otherwise the function 'dl_iterate_phdr' is not lock free and not async-signal safe).");
diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h
index ea3fb123b38..87f96ca48be 100644
--- a/src/Access/Common/AccessType.h
+++ b/src/Access/Common/AccessType.h
@@ -163,6 +163,7 @@ enum class AccessType
M(SYSTEM_DROP_FILESYSTEM_CACHE, "SYSTEM DROP FILESYSTEM CACHE, DROP FILESYSTEM CACHE", GLOBAL, SYSTEM_DROP_CACHE) \
M(SYSTEM_DROP_DISTRIBUTED_CACHE, "SYSTEM DROP DISTRIBUTED CACHE, DROP DISTRIBUTED CACHE", GLOBAL, SYSTEM_DROP_CACHE) \
M(SYSTEM_SYNC_FILESYSTEM_CACHE, "SYSTEM REPAIR FILESYSTEM CACHE, REPAIR FILESYSTEM CACHE, SYNC FILESYSTEM CACHE", GLOBAL, SYSTEM) \
+ M(SYSTEM_DROP_PAGE_CACHE, "SYSTEM DROP PAGE CACHE, DROP PAGE CACHE", GLOBAL, SYSTEM_DROP_CACHE) \
M(SYSTEM_DROP_SCHEMA_CACHE, "SYSTEM DROP SCHEMA CACHE, DROP SCHEMA CACHE", GLOBAL, SYSTEM_DROP_CACHE) \
M(SYSTEM_DROP_FORMAT_SCHEMA_CACHE, "SYSTEM DROP FORMAT SCHEMA CACHE, DROP FORMAT SCHEMA CACHE", GLOBAL, SYSTEM_DROP_CACHE) \
M(SYSTEM_DROP_S3_CLIENT_CACHE, "SYSTEM DROP S3 CLIENT, DROP S3 CLIENT CACHE", GLOBAL, SYSTEM_DROP_CACHE) \
diff --git a/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp b/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp
index 8ba33a50ccf..146af7751e0 100644
--- a/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp
+++ b/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp
@@ -361,7 +361,7 @@ private:
if (table_node.hasTableExpressionModifiers() && table_node.getTableExpressionModifiers()->hasFinal())
return;
- auto column = first_argument_column_node.getColumn();
+ const auto & column = first_argument_column_node.getColumn();
auto table_name = table_node.getStorage()->getStorageID().getFullTableName();
Identifier qualified_name({table_name, column.name});
diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp
index 6931001202d..82da4c4bbad 100644
--- a/src/Common/CurrentMetrics.cpp
+++ b/src/Common/CurrentMetrics.cpp
@@ -264,7 +264,17 @@
M(RefreshingViews, "Number of materialized views currently executing a refresh") \
M(StorageBufferFlushThreads, "Number of threads for background flushes in StorageBuffer") \
M(StorageBufferFlushThreadsActive, "Number of threads for background flushes in StorageBuffer running a task") \
- M(StorageBufferFlushThreadsScheduled, "Number of queued or active threads for background flushes in StorageBuffer")
+ M(StorageBufferFlushThreadsScheduled, "Number of queued or active threads for background flushes in StorageBuffer") \
+ M(SharedMergeTreeThreads, "Number of threads in the thread pools in internals of SharedMergeTree") \
+ M(SharedMergeTreeThreadsActive, "Number of threads in the thread pools in internals of SharedMergeTree running a task") \
+ M(SharedMergeTreeThreadsScheduled, "Number of queued or active threads in the thread pools in internals of SharedMergeTree") \
+ M(SharedMergeTreeFetch, "Number of fetches in progress") \
+ M(CacheWarmerBytesInProgress, "Total size of remote file segments waiting to be asynchronously loaded into filesystem cache.") \
+ M(DistrCacheOpenedConnections, "Number of open connections to Distributed Cache") \
+ M(DistrCacheUsedConnections, "Number of currently used connections to Distributed Cache") \
+ M(DistrCacheReadRequests, "Number of executed Read requests to Distributed Cache") \
+ M(DistrCacheWriteRequests, "Number of executed Write requests to Distributed Cache") \
+ M(DistrCacheServerConnections, "Number of open connections to ClickHouse server from Distributed Cache")
#ifdef APPLY_FOR_EXTERNAL_METRICS
#define APPLY_FOR_METRICS(M) APPLY_FOR_BUILTIN_METRICS(M) APPLY_FOR_EXTERNAL_METRICS(M)
diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp
index ca00f2fd513..44463f7f437 100644
--- a/src/Common/ErrorCodes.cpp
+++ b/src/Common/ErrorCodes.cpp
@@ -379,7 +379,6 @@
M(467, CANNOT_PARSE_BOOL) \
M(468, CANNOT_PTHREAD_ATTR) \
M(469, VIOLATED_CONSTRAINT) \
- M(470, QUERY_IS_NOT_SUPPORTED_IN_LIVE_VIEW) \
M(471, INVALID_SETTING_VALUE) \
M(472, READONLY_SETTING) \
M(473, DEADLOCK_AVOIDED) \
@@ -585,6 +584,10 @@
M(703, INVALID_IDENTIFIER) \
M(704, QUERY_CACHE_USED_WITH_NONDETERMINISTIC_FUNCTIONS) \
M(705, TABLE_NOT_EMPTY) \
+ \
+ M(900, DISTRIBUTED_CACHE_ERROR) \
+ M(901, CANNOT_USE_DISTRIBUTED_CACHE) \
+ \
M(706, LIBSSH_ERROR) \
M(707, GCP_ERROR) \
M(708, ILLEGAL_STATISTIC) \
diff --git a/src/Common/FailPoint.cpp b/src/Common/FailPoint.cpp
index a23133b7522..9e551c8f2cd 100644
--- a/src/Common/FailPoint.cpp
+++ b/src/Common/FailPoint.cpp
@@ -39,6 +39,14 @@ static struct InitFiu
REGULAR(replicated_merge_tree_commit_zk_fail_when_recovering_from_hw_fault) \
REGULAR(use_delayed_remote_source) \
REGULAR(cluster_discovery_faults) \
+ ONCE(smt_commit_merge_mutate_zk_fail_after_op) \
+ ONCE(smt_commit_merge_mutate_zk_fail_before_op) \
+ ONCE(smt_commit_write_zk_fail_after_op) \
+ ONCE(smt_commit_write_zk_fail_before_op) \
+ ONCE(smt_commit_merge_change_version_before_op) \
+ ONCE(smt_merge_mutate_intention_freeze_in_destructor) \
+ ONCE(meta_in_keeper_create_metadata_failure) \
+ REGULAR(cache_warmer_stall) \
REGULAR(check_table_query_delay_for_part) \
REGULAR(dummy_failpoint) \
REGULAR(prefetched_reader_pool_failpoint) \
diff --git a/src/Common/PageCache.cpp b/src/Common/PageCache.cpp
new file mode 100644
index 00000000000..511ec23d431
--- /dev/null
+++ b/src/Common/PageCache.cpp
@@ -0,0 +1,688 @@
+#include "PageCache.h"
+
+#include <random>
+#include <unordered_set>
+#include <utility>
+#include <sys/mman.h>
+#include <base/errnoToString.h>
+#include <base/getPageSize.h>
+#include <base/hex.h>
+#include <Common/Exception.h>
+#include <Common/formatReadable.h>
+#include <Common/logger_useful.h>
+#include <Common/ProfileEvents.h>
+#include <Common/randomSeed.h>
+#include <Common/SipHash.h>
+#include <IO/ReadBufferFromFile.h>
+#include <IO/ReadHelpers.h>
+
+namespace ProfileEvents
+{
+ extern const Event PageCacheChunkMisses;
+ extern const Event PageCacheChunkShared;
+ extern const Event PageCacheChunkDataHits;
+ extern const Event PageCacheChunkDataPartialHits;
+ extern const Event PageCacheChunkDataMisses;
+ extern const Event PageCacheBytesUnpinnedRoundedToPages;
+ extern const Event PageCacheBytesUnpinnedRoundedToHugePages;
+}
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int SYSTEM_ERROR;
+ extern const int MEMORY_LIMIT_EXCEEDED;
+ extern const int CANNOT_ALLOCATE_MEMORY;
+ extern const int INVALID_SETTING_VALUE;
+ extern const int FILE_DOESNT_EXIST;
+}
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunknown-warning-option"
+#pragma clang diagnostic ignored "-Wreadability-make-member-function-const"
+
+PinnedPageChunk::PinnedPageChunk(PinnedPageChunk && c) noexcept
+ : cache(std::exchange(c.cache, nullptr)), chunk(std::exchange(c.chunk, nullptr)) {}
+
+PinnedPageChunk & PinnedPageChunk::operator=(PinnedPageChunk && c) noexcept
+{
+ if (cache)
+ cache->removeRef(chunk);
+ cache = std::exchange(c.cache, nullptr);
+ chunk = std::exchange(c.chunk, nullptr);
+ return *this;
+}
+
+PinnedPageChunk::~PinnedPageChunk() noexcept
+{
+ if (cache)
+ cache->removeRef(chunk);
+}
+
+PinnedPageChunk::PinnedPageChunk(PageCache * cache_, PageChunk * chunk_) noexcept : cache(cache_), chunk(chunk_) {}
+
+const PageChunk * PinnedPageChunk::getChunk() const { return chunk; }
+
+bool PinnedPageChunk::markPagePopulated(size_t page_idx)
+{
+ bool r = chunk->pages_populated.set(page_idx);
+ return r;
+}
+
+void PinnedPageChunk::markPrefixPopulated(size_t bytes)
+{
+ for (size_t i = 0; i < (bytes + chunk->page_size - 1) / chunk->page_size; ++i)
+ markPagePopulated(i);
+}
+
+bool PinnedPageChunk::isPrefixPopulated(size_t bytes) const
+{
+ for (size_t i = 0; i < (bytes + chunk->page_size - 1) / chunk->page_size; ++i)
+ if (!chunk->pages_populated.get(i))
+ return false;
+ return true;
+}
+
+AtomicBitSet::AtomicBitSet() = default;
+
+void AtomicBitSet::init(size_t nn)
+{
+ n = nn;
+ v = std::make_unique<std::atomic<UInt8>[]>((n + 7) / 8);
+}
+
+bool AtomicBitSet::get(size_t i) const
+{
+ return (v[i / 8] & (1 << (i % 8))) != 0;
+}
+
+bool AtomicBitSet::any() const
+{
+ for (size_t i = 0; i < (n + 7) / 8; ++i)
+ if (v[i])
+ return true;
+ return false;
+}
+
+bool AtomicBitSet::set(size_t i) const
+{
+ UInt8 prev = v[i / 8].fetch_or(1 << (i % 8));
+ return (prev & (1 << (i % 8))) == 0;
+}
+
+bool AtomicBitSet::set(size_t i, bool val) const
+{
+ if (val)
+ return set(i);
+ else
+ return unset(i);
+}
+
+bool AtomicBitSet::unset(size_t i) const
+{
+ UInt8 prev = v[i / 8].fetch_and(~(1 << (i % 8)));
+ return (prev & (1 << (i % 8))) != 0;
+}
+
+void AtomicBitSet::unsetAll() const
+{
+ for (size_t i = 0; i < (n + 7) / 8; ++i)
+ v[i].store(0, std::memory_order_relaxed);
+}
+
+PageCache::PageCache(size_t bytes_per_chunk, size_t bytes_per_mmap, size_t bytes_total, bool use_madv_free_, bool use_huge_pages_)
+ : bytes_per_page(getPageSize())
+ , use_madv_free(use_madv_free_)
+ , use_huge_pages(use_huge_pages_)
+ , rng(randomSeed())
+{
+ if (bytes_per_chunk == 0 || bytes_per_mmap == 0)
+ throw Exception(ErrorCodes::INVALID_SETTING_VALUE, "Userspace page cache chunk size and mmap size can't be zero.");
+
+ if (use_huge_pages)
+ {
+ use_huge_pages = false;
+ bool print_warning = false;
+#ifdef OS_LINUX
+ try
+ {
+ ReadBufferFromFile in("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size");
+ size_t huge_page_size;
+ readIntText(huge_page_size, in);
+
+ if (huge_page_size == 0 || huge_page_size % bytes_per_page != 0)
+ throw Exception(ErrorCodes::SYSTEM_ERROR, "Invalid huge page size reported by the OS: {}", huge_page_size);
+
+ /// THP can be configured to be 2 MiB or 1 GiB in size. 1 GiB is way too big for us.
+ if (huge_page_size <= (16 << 20))
+ {
+ pages_per_big_page = huge_page_size / bytes_per_page;
+ use_huge_pages = true;
+ }
+ else
+ {
+ LOG_WARNING(&Poco::Logger::get("PageCache"), "The OS huge page size is too large for our purposes: {} KiB. Using regular pages. Userspace page cache will be relatively slow.", huge_page_size);
+ }
+ }
+ catch (Exception & e)
+ {
+ if (e.code() != ErrorCodes::FILE_DOESNT_EXIST)
+ throw;
+ print_warning = true;
+ }
+#else
+ print_warning = true;
+#endif
+ if (print_warning)
+ LOG_WARNING(&Poco::Logger::get("PageCache"), "The OS doesn't support transparent huge pages. Userspace page cache will be relatively slow.");
+ }
+
+ pages_per_chunk = ((bytes_per_chunk - 1) / (bytes_per_page * pages_per_big_page) + 1) * pages_per_big_page;
+ chunks_per_mmap_target = (bytes_per_mmap - 1) / (bytes_per_page * pages_per_chunk) + 1;
+ max_mmaps = (bytes_total - 1) / (bytes_per_page * pages_per_chunk * chunks_per_mmap_target) + 1;
+}
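To make the rounding in the constructor concrete, here is the same arithmetic as a standalone snippet, with typical values plugged in (the 4 KiB page, 2 MiB huge page, and 2 MiB chunk match the numbers quoted elsewhere in this patch; the 1 GiB mmap and 100 GiB total are assumed for the example):

```cpp
#include <cstdio>

int main()
{
    size_t bytes_per_page = 4096;        // getPageSize()
    size_t pages_per_big_page = 512;     // 2 MiB huge page / 4 KiB page
    size_t bytes_per_chunk = 2 << 20;    // page_cache_chunk_size
    size_t bytes_per_mmap = 1ul << 30;   // page_cache_mmap_size (assumed)
    size_t bytes_total = 100ul << 30;    // page_cache_size (assumed)

    // Same formulas as in PageCache::PageCache(): round the chunk up to a
    // whole number of (huge) pages, the mmap up to a whole number of chunks.
    size_t pages_per_chunk = ((bytes_per_chunk - 1) / (bytes_per_page * pages_per_big_page) + 1) * pages_per_big_page;
    size_t chunks_per_mmap_target = (bytes_per_mmap - 1) / (bytes_per_page * pages_per_chunk) + 1;
    size_t max_mmaps = (bytes_total - 1) / (bytes_per_page * pages_per_chunk * chunks_per_mmap_target) + 1;

    printf("%zu pages/chunk, %zu chunks/mmap, %zu mmaps\n",
           pages_per_chunk, chunks_per_mmap_target, max_mmaps);
    // -> 512 pages/chunk, 512 chunks/mmap, 100 mmaps
}
```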
+
+PageCache::~PageCache()
+{
+ chassert(getPinnedSize() == 0);
+}
+
+size_t PageCache::pageSize() const { return bytes_per_page; }
+size_t PageCache::chunkSize() const { return bytes_per_page * pages_per_chunk; }
+size_t PageCache::maxChunks() const { return chunks_per_mmap_target * max_mmaps; }
+
+size_t PageCache::getPinnedSize() const
+{
+ std::unique_lock lock(global_mutex);
+ return (total_chunks - lru.size()) * bytes_per_page * pages_per_chunk;
+}
+
+PageCache::MemoryStats PageCache::getResidentSetSize() const
+{
+ MemoryStats stats;
+#ifdef OS_LINUX
+ if (use_madv_free)
+ {
+ std::unordered_set<UInt64> cache_mmap_addrs;
+ for (const auto & m : mmaps)
+ cache_mmap_addrs.insert(reinterpret_cast<UInt64>(m.ptr));
+
+ ReadBufferFromFile in("/proc/self/smaps");
+
+ /// Parse the smaps contents, which is text consisting of entries like this:
+ ///
+ /// 117ba4a00000-117be4a00000 rw-p 00000000 00:00 0
+ /// Size: 1048576 kB
+ /// KernelPageSize: 4 kB
+ /// MMUPageSize: 4 kB
+ /// Rss: 539516 kB
+ /// Pss: 539516 kB
+ /// ...
+
+ auto read_token = [&]
+ {
+ String res;
+ while (!in.eof())
+ {
+ char c = *in.position();
+ if (c == '\n' || c == '\t' || c == ' ' || c == '-')
+ break;
+ res += c;
+ ++in.position();
+ }
+ return res;
+ };
+
+ auto skip_whitespace = [&]
+ {
+ while (!in.eof())
+ {
+ char c = *in.position();
+ if (c != ' ' && c != '\t')
+ break;
+ ++in.position();
+ }
+ };
+
+ bool current_range_is_cache = false;
+ size_t total_rss = 0;
+ size_t total_lazy_free = 0;
+ while (!in.eof())
+ {
+ String s = read_token();
+ if (!in.eof() && *in.position() == '-')
+ {
+ if (s.size() < 16)
+ s.insert(0, 16 - s.size(), '0');
+ UInt64 addr = unhexUInt<UInt64>(s.c_str());
+ current_range_is_cache = cache_mmap_addrs.contains(addr);
+ }
+ else if (s == "Rss:" || s == "LazyFree")
+ {
+ skip_whitespace();
+ size_t val;
+ readIntText(val, in);
+ skip_whitespace();
+ String unit = read_token();
+ if (unit != "kB")
+ throw Exception(ErrorCodes::SYSTEM_ERROR, "Unexpected units in /proc/self/smaps: {}", unit);
+ size_t bytes = val * 1024;
+
+ if (s == "Rss:")
+ {
+ total_rss += bytes;
+ if (current_range_is_cache)
+ stats.page_cache_rss += bytes;
+ }
+ else
+ total_lazy_free += bytes;
+ }
+ skipToNextLineOrEOF(in);
+ }
+ stats.unreclaimable_rss = total_rss - std::min(total_lazy_free, total_rss);
+
+ return stats;
+ }
+#endif
+
+ stats.page_cache_rss = bytes_per_page * pages_per_chunk * total_chunks;
+ return stats;
+}
+
+PinnedPageChunk PageCache::getOrSet(PageCacheKey key, bool detached_if_missing, bool inject_eviction)
+{
+ PageChunk * chunk;
+ /// Make sure we increment exactly one of the counters about the fate of a chunk lookup.
+ bool incremented_profile_events = false;
+
+ {
+ std::unique_lock lock(global_mutex);
+
+ auto * it = chunk_by_key.find(key);
+ if (it == chunk_by_key.end())
+ {
+ chunk = getFreeChunk(lock);
+ chassert(!chunk->key.has_value());
+
+ if (!detached_if_missing)
+ {
+ chunk->key = key;
+ chunk_by_key.insert({key, chunk});
+ }
+
+ ProfileEvents::increment(ProfileEvents::PageCacheChunkMisses);
+ incremented_profile_events = true;
+ }
+ else
+ {
+ chunk = it->getMapped();
+ size_t prev_pin_count = chunk->pin_count.fetch_add(1);
+
+ if (prev_pin_count == 0)
+ {
+ /// Not eligible for LRU eviction while pinned.
+ chassert(chunk->is_linked());
+ lru.erase(lru.iterator_to(*chunk));
+
+ if (detached_if_missing)
+ {
+ /// Peek the first page to see if it's evicted.
+ /// (Why not use the full probing procedure instead, restoreChunkFromLimbo()?
+ /// Right here we can't do it because of how the two mutexes are organized.
+ /// And we want to do the check+detach before unlocking global_mutex, because
+ /// otherwise we may detach a chunk pinned by someone else, which may be unexpected
+ /// for that someone else. Or maybe the latter is fine, dropCache() already does it.)
+ if (chunk->pages_populated.get(0) && reinterpret_cast<std::atomic<char> *>(chunk->data)->load(std::memory_order_relaxed) == 0)
+ evictChunk(chunk, lock);
+ }
+
+ if (inject_eviction && chunk->key.has_value() && rng() % 10 == 0)
+ {
+ /// Simulate eviction of the chunk or some of its pages.
+ if (rng() % 2 == 0)
+ evictChunk(chunk, lock);
+ else
+ for (size_t i = 0; i < 20; ++i)
+ chunk->pages_populated.unset(rng() % (chunk->size / chunk->page_size));
+ }
+ }
+ else
+ {
+ ProfileEvents::increment(ProfileEvents::PageCacheChunkShared);
+ incremented_profile_events = true;
+ }
+ }
+ }
+
+ {
+ std::unique_lock chunk_lock(chunk->chunk_mutex);
+
+ if (chunk->pages_state == PageChunkState::Limbo)
+ {
+ auto [pages_restored, pages_evicted] = restoreChunkFromLimbo(chunk, chunk_lock);
+ chunk->pages_state = PageChunkState::Stable;
+
+ if (!incremented_profile_events)
+ {
+ if (pages_evicted == 0)
+ ProfileEvents::increment(ProfileEvents::PageCacheChunkDataHits);
+ else if (pages_evicted < pages_restored)
+ ProfileEvents::increment(ProfileEvents::PageCacheChunkDataPartialHits);
+ else
+ ProfileEvents::increment(ProfileEvents::PageCacheChunkDataMisses);
+ }
+ }
+ }
+
+ return PinnedPageChunk(this, chunk);
+}
+
+void PageCache::removeRef(PageChunk * chunk) noexcept
+{
+ /// Fast path if this is not the last reference.
+ size_t prev_pin_count = chunk->pin_count.load();
+ if (prev_pin_count > 1 && chunk->pin_count.compare_exchange_strong(prev_pin_count, prev_pin_count - 1))
+ return;
+
+ {
+ std::unique_lock lock(global_mutex);
+
+ prev_pin_count = chunk->pin_count.fetch_sub(1);
+ if (prev_pin_count > 1)
+ return;
+
+ chassert(!chunk->is_linked());
+ if (chunk->key.has_value())
+ lru.push_back(*chunk);
+ else
+ /// Unpinning detached chunk. We'd rather reuse it soon, so put it at the front.
+ lru.push_front(*chunk);
+ }
+
+ {
+ std::unique_lock chunk_lock(chunk->chunk_mutex);
+
+ /// Need to be extra careful here because we unlocked global_mutex above, so other
+ /// getOrSet()/removeRef() calls could have happened during this brief period.
+ if (use_madv_free && chunk->pages_state == PageChunkState::Stable && chunk->pin_count.load() == 0)
+ {
+ sendChunkToLimbo(chunk, chunk_lock);
+ chunk->pages_state = PageChunkState::Limbo;
+ }
+ }
+}
+
+static void logUnexpectedSyscallError(std::string name)
+{
+ std::string message = fmt::format("{} failed: {}", name, errnoToString());
+ LOG_WARNING(&Poco::Logger::get("PageCache"), "{}", message);
+#if defined(ABORT_ON_LOGICAL_ERROR)
+ volatile bool true_ = true;
+ if (true_) // suppress warning about missing [[noreturn]]
+ abortOnFailedAssertion(message);
+#endif
+}
+
+void PageCache::sendChunkToLimbo(PageChunk * chunk [[maybe_unused]], std::unique_lock<std::mutex> & /* chunk_mutex */) const noexcept
+{
+#ifdef MADV_FREE // if we're not on a very old version of Linux
+ chassert(chunk->size == bytes_per_page * pages_per_chunk);
+ size_t populated_pages = 0;
+ size_t populated_big_pages = 0;
+ for (size_t big_page_idx = 0; big_page_idx < pages_per_chunk / pages_per_big_page; ++big_page_idx)
+ {
+ bool big_page_populated = false;
+ for (size_t sub_idx = 0; sub_idx < pages_per_big_page; ++sub_idx)
+ {
+ size_t idx = big_page_idx * pages_per_big_page + sub_idx;
+ if (!chunk->pages_populated.get(idx))
+ continue;
+ big_page_populated = true;
+ populated_pages += 1;
+
+ auto & byte = reinterpret_cast<std::atomic<UInt8> &>(chunk->data[idx * bytes_per_page]);
+ chunk->first_bit_of_each_page.set(idx, (byte.load(std::memory_order_relaxed) & 1) != 0);
+ byte.fetch_or(1, std::memory_order_relaxed);
+ }
+ if (big_page_populated)
+ populated_big_pages += 1;
+ }
+ int r = madvise(chunk->data, chunk->size, MADV_FREE);
+ if (r != 0)
+ logUnexpectedSyscallError("madvise(MADV_FREE)");
+
+ ProfileEvents::increment(ProfileEvents::PageCacheBytesUnpinnedRoundedToPages, bytes_per_page * populated_pages);
+ ProfileEvents::increment(ProfileEvents::PageCacheBytesUnpinnedRoundedToHugePages, bytes_per_page * pages_per_big_page * populated_big_pages);
+#endif
+}
+
+std::pair<size_t, size_t> PageCache::restoreChunkFromLimbo(PageChunk * chunk, std::unique_lock<std::mutex> & /* chunk_mutex */) const noexcept
+{
+ static_assert(sizeof(std::atomic<char>) == 1, "char is not atomic?");
+ // Make sure our strategic memory reads/writes are not reordered or optimized out.
+ auto * data = reinterpret_cast<std::atomic<char> *>(chunk->data);
+ size_t pages_restored = 0;
+ size_t pages_evicted = 0;
+ for (size_t idx = 0; idx < chunk->size / bytes_per_page; ++idx)
+ {
+ if (!chunk->pages_populated.get(idx))
+ continue;
+
+ /// After MADV_FREE, it's guaranteed that:
+ /// * writing to the page makes it non-freeable again (reading doesn't),
+ /// * after the write, the page contents are either fully intact or fully zero-filled,
+ /// * even before the write, reads return either intact data (if the page wasn't freed) or zeroes (if it was, and the read page-faulted).
+ /// (And when doing the write there's no way to tell whether it page-faulted or not, AFAICT; that would make our life much easier!)
+ ///
+ /// With that in mind, we do the following dance to bring the page back from the MADV_FREE limbo:
+ /// 0. [in advance] Before doing MADV_FREE, make sure the page's first byte is not zero.
+ /// We do it by setting the lowest bit of the first byte to 1, after saving the original value of that bit into a bitset.
+ /// 1. Read the second byte.
+ /// 2. Write the second byte back. This makes the page non-freeable.
+ /// 3. Read the first byte.
+ /// 3a. If it's zero, the page was freed.
+ /// Set the second byte to 0, to keep the buffer zero-filled if the page was freed
+ /// between steps 1 and 2.
+ /// 3b. If it's nonzero, the page is intact.
+ /// Restore the lowest bit of the first byte to the saved original value from the bitset.
+
+ char second_byte = data[idx * bytes_per_page + 1].load(std::memory_order_relaxed);
+ data[idx * bytes_per_page + 1].store(second_byte, std::memory_order_relaxed);
+
+ char first_byte = data[idx * bytes_per_page].load(std::memory_order_relaxed);
+ if (first_byte == 0)
+ {
+ pages_evicted += 1;
+ data[idx * bytes_per_page + 1].store(0, std::memory_order_relaxed);
+ chunk->pages_populated.unset(idx);
+ }
+ else
+ {
+ pages_restored += 1;
+ chassert(first_byte & 1);
+ if (!chunk->first_bit_of_each_page.get(idx))
+ data[idx * bytes_per_page].fetch_and(~1, std::memory_order_relaxed);
+ }
+ }
+ return {pages_restored, pages_evicted};
+}
+
+PageChunk * PageCache::getFreeChunk(std::unique_lock<std::mutex> & lock /* global_mutex */)
+{
+ if (lru.empty() || (mmaps.size() < max_mmaps && lru.front().key.has_value()))
+ addMmap(lock);
+ if (lru.empty())
+ throw Exception(ErrorCodes::MEMORY_LIMIT_EXCEEDED, "All chunks in the entire page cache ({:.3} GiB) are pinned.",
+ bytes_per_page * pages_per_chunk * total_chunks * 1. / (1l << 30));
+
+ PageChunk * chunk = &lru.front();
+ lru.erase(lru.iterator_to(*chunk));
+
+ size_t prev_pin_count = chunk->pin_count.fetch_add(1);
+ chassert(prev_pin_count == 0);
+
+ evictChunk(chunk, lock);
+
+ return chunk;
+}
+
+void PageCache::evictChunk(PageChunk * chunk, std::unique_lock<std::mutex> & /* global_mutex */)
+{
+ if (chunk->key.has_value())
+ {
+ size_t erased = chunk_by_key.erase(chunk->key.value());
+ chassert(erased);
+ chunk->key.reset();
+ }
+
+ chunk->state.reset();
+
+ /// This is tricky. We're not holding the chunk_mutex, so another thread might be running
+ /// sendChunkToLimbo() or even restoreChunkFromLimbo() on this chunk right now.
+ ///
+ /// Nevertheless, it's correct and sufficient to clear pages_populated here because sendChunkToLimbo()
+ /// and restoreChunkFromLimbo() only touch pages_populated (only unsetting the bits),
+ /// first_bit_of_each_page, and the data; and we don't care about first_bit_of_each_page and the data.
+ ///
+ /// This is precarious, but I don't have better ideas. Note that this clearing (or something else)
+ /// must be done before unlocking the global_mutex because otherwise another call to getOrSet() might
+ /// return this chunk before we clear it.
+ chunk->pages_populated.unsetAll();
+}
+
+void PageCache::addMmap(std::unique_lock<std::mutex> & /* global_mutex */)
+{
+ /// ASLR by hand.
+ void * address_hint = reinterpret_cast<void *>(std::uniform_int_distribution<size_t>(0x100000000000UL, 0x700000000000UL)(rng));
+
+ mmaps.emplace_back(bytes_per_page, pages_per_chunk, pages_per_big_page, chunks_per_mmap_target, address_hint, use_huge_pages);
+
+ size_t num_chunks = mmaps.back().num_chunks;
+ total_chunks += num_chunks;
+ for (size_t i = 0; i < num_chunks; ++i)
+ /// Link in reverse order, so they get assigned in increasing order. Not important, just seems nice.
+ lru.push_front(mmaps.back().chunks[num_chunks - 1 - i]);
+}
+
+void PageCache::dropCache()
+{
+ std::unique_lock lock(global_mutex);
+
+ /// Detach and free unpinned chunks.
+ bool logged_error = false;
+ for (PageChunk & chunk : lru)
+ {
+ evictChunk(&chunk, lock);
+
+ if (use_madv_free)
+ {
+ /// This might happen in parallel with sendChunkToLimbo() or restoreChunkFromLimbo(), but it's ok.
+ int r = madvise(chunk.data, chunk.size, MADV_DONTNEED);
+ if (r != 0 && !logged_error)
+ {
+ logUnexpectedSyscallError("madvise(MADV_DONTNEED)");
+ logged_error = true;
+ }
+ }
+ }
+
+ /// Detach pinned chunks.
+ for (auto [key, chunk] : chunk_by_key)
+ {
+ chassert(chunk->key == key);
+ chassert(chunk->pin_count > 0); // otherwise it would have been evicted above
+ chunk->key.reset();
+ }
+ chunk_by_key.clear();
+}
+
+PageCache::Mmap::Mmap(size_t bytes_per_page_, size_t pages_per_chunk_, size_t pages_per_big_page_, size_t num_chunks_, void * address_hint, bool use_huge_pages_)
+{
+ num_chunks = num_chunks_;
+ size = bytes_per_page_ * pages_per_chunk_ * num_chunks;
+
+ size_t alignment = bytes_per_page_ * pages_per_big_page_;
+ address_hint = reinterpret_cast<void *>(reinterpret_cast<UInt64>(address_hint) / alignment * alignment);
+
+ auto temp_chunks = std::make_unique<PageChunk[]>(num_chunks);
+
+ int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+#ifdef OS_LINUX
+ flags |= MAP_NORESERVE;
+#endif
+ ptr = mmap(address_hint, size, PROT_READ | PROT_WRITE, flags, -1, 0);
+ if (MAP_FAILED == ptr)
+ throw ErrnoException(ErrorCodes::CANNOT_ALLOCATE_MEMORY, fmt::format("Cannot mmap {}.", ReadableSize(size)));
+ if (reinterpret_cast<UInt64>(ptr) % bytes_per_page_ != 0)
+ {
+ munmap(ptr, size);
+ throw Exception(ErrorCodes::SYSTEM_ERROR, "mmap returned unaligned address: {}", ptr);
+ }
+
+ void * chunks_start = ptr;
+
+#ifdef OS_LINUX
+ if (madvise(ptr, size, MADV_DONTDUMP) != 0)
+ logUnexpectedSyscallError("madvise(MADV_DONTDUMP)");
+ if (madvise(ptr, size, MADV_DONTFORK) != 0)
+ logUnexpectedSyscallError("madvise(MADV_DONTFORK)");
+
+ if (use_huge_pages_)
+ {
+ if (reinterpret_cast<UInt64>(ptr) % alignment != 0)
+ {
+ LOG_DEBUG(&Poco::Logger::get("PageCache"), "mmap() returned address not aligned on huge page boundary.");
+ chunks_start = reinterpret_cast<void *>((reinterpret_cast<UInt64>(ptr) / alignment + 1) * alignment);
+ chassert(reinterpret_cast<UInt64>(chunks_start) % alignment == 0);
+ num_chunks -= 1;
+ }
+
+ if (madvise(ptr, size, MADV_HUGEPAGE) != 0)
+ LOG_WARNING(&Poco::Logger::get("PageCache"),
+ "madvise(MADV_HUGEPAGE) failed: {}. Userspace page cache will be relatively slow.", errnoToString());
+ }
+#else
+ (void)use_huge_pages_;
+#endif
+
+ chunks = std::move(temp_chunks);
+ for (size_t i = 0; i < num_chunks; ++i)
+ {
+ PageChunk * chunk = &chunks[i];
+ chunk->data = reinterpret_cast<char *>(chunks_start) + bytes_per_page_ * pages_per_chunk_ * i;
+ chunk->size = bytes_per_page_ * pages_per_chunk_;
+ chunk->page_size = bytes_per_page_;
+ chunk->big_page_size = bytes_per_page_ * pages_per_big_page_;
+ chunk->pages_populated.init(pages_per_chunk_);
+ chunk->first_bit_of_each_page.init(pages_per_chunk_);
+ }
+}
+
+PageCache::Mmap::Mmap(Mmap && m) noexcept : ptr(std::exchange(m.ptr, nullptr)), size(std::exchange(m.size, 0)), chunks(std::move(m.chunks)), num_chunks(std::exchange(m.num_chunks, 0)) {}
+
+PageCache::Mmap::~Mmap() noexcept
+{
+ if (ptr && 0 != munmap(ptr, size))
+ logUnexpectedSyscallError("munmap");
+}
+
+void FileChunkState::reset() {}
+
+PageCacheKey FileChunkAddress::hash() const
+{
+ SipHash hash(offset);
+ hash.update(path.data(), path.size());
+ if (!file_version.empty())
+ {
+ hash.update("\0", 1);
+ hash.update(file_version.data(), file_version.size());
+ }
+ return hash.get128();
+}
+
+std::string FileChunkAddress::toString() const
+{
+ return fmt::format("{}:{}{}{}", path, offset, file_version.empty() ? "" : ":", file_version);
+}
+
+#pragma clang diagnostic pop
+
+}
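To see how these pieces fit together from a caller's perspective, here is a hypothetical usage sketch against the API declared in PageCache.h below. The helper `readFromRemote` and the alignment assumptions are illustrative, not from the patch; the real read path also serializes concurrent fillers through `FileChunkState::download_mutex`.

```cpp
#include <Common/PageCache.h>

#include <cstring>
#include <string>

/// Placeholder for the caller's own remote I/O routine (not part of the patch).
void readFromRemote(const std::string & path, size_t offset, size_t bytes, char * out);

/// Single chunk, single thread; `offset` is assumed aligned to cache.chunkSize()
/// and `bytes` assumed <= cache.chunkSize().
void readThroughCache(DB::PageCache & cache, const std::string & path, size_t offset, size_t bytes, char * out)
{
    DB::FileChunkAddress addr;
    addr.path = "s3:" + path; /// prefix identifies the storage system
    addr.offset = offset;

    /// Pin the chunk: neither the entry nor its pages can be evicted while pinned.
    auto pinned = cache.getOrSet(addr.hash(), /*detached_if_missing=*/ false, /*inject_eviction=*/ false);

    if (!pinned.isPrefixPopulated(bytes))
    {
        /// Miss, or some pages were reclaimed by the OS: refill the buffer.
        readFromRemote(path, offset, bytes, pinned.getChunk()->data);
        pinned.markPrefixPopulated(bytes);
    }

    std::memcpy(out, pinned.getChunk()->data, bytes);
} /// `pinned` is destroyed here: the chunk is unpinned and becomes evictable again
```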
diff --git a/src/Common/PageCache.h b/src/Common/PageCache.h
new file mode 100644
index 00000000000..7ff376baa6b
--- /dev/null
+++ b/src/Common/PageCache.h
@@ -0,0 +1,299 @@
+#pragma once
+
+#include <atomic>
+#include <mutex>
+#include <optional>
+#include <boost/intrusive/list.hpp>
+#include <pcg_random.hpp>
+#include <Common/HashTable/HashMap.h>
+
+/// "Userspace page cache"
+/// A cache for contents of remote files.
+/// Uses MADV_FREE to allow Linux to evict pages from our cache under memory pressure.
+/// Typically takes up almost all of the available memory, similar to the actual page cache.
+///
+/// Intended for caching data retrieved from distributed cache, but can be used for other things too,
+/// just replace FileChunkState with a discriminated union, or something, if needed.
+///
+/// There are two fixed-size units of caching here:
+/// * OS pages, typically 4 KiB each.
+/// * Page chunks, 2 MiB each (configurable with page_cache_chunk_size setting).
+///
+/// Each file is logically split into aligned 2 MiB blocks, which are mapped to page chunks inside the cache.
+/// They are cached independently from each other.
+///
+/// Each page chunk has a contiguous 2 MiB buffer that can be pinned and directly used e.g. by ReadBuffers.
+/// While pinned (by at least one PinnedPageChunk), the pages are not reclaimable by the OS.
+///
+/// Inside each page chunk, any subset of pages may be populated. Unpopulated pages may or may not be
+/// mapped to any physical RAM. We maintain a bitmask that keeps track of which pages are populated.
+/// Pages become unpopulated if they're reclaimed by the OS (when the page chunk is not pinned),
+/// or if we just never populate them in the first place (e.g. if a file is shorter than 2 MiB we
+/// still create a 2 MiB page chunk, but use only a prefix of it).
+///
+/// There are two separate eviction mechanisms at play:
+/// * LRU eviction of page chunks in PageCache.
+/// * OS reclaiming pages on memory pressure. We have no control over the eviction policy.
+/// It probably picks the pages in the same order in which they were marked with MADV_FREE, so
+/// effectively in the same LRU order as our policy in PageCache.
+/// When using PageCache in oversubscribed fashion, using all available memory and relying on OS eviction,
+/// the PageCache's eviction policy mostly doesn't matter. It just needs to be similar enough to the OS's
+/// policy that we rarely evict chunks with unevicted pages.
+///
+/// We mmap memory directly instead of using allocator because this enables:
+/// * knowing how much RAM the cache is using, via /proc/self/smaps,
+/// * MADV_HUGEPAGE (use transparent huge pages - this makes MADV_FREE 10x less slow),
+/// * MAP_NORESERVE (don't reserve swap space - otherwise large mmaps usually fail),
+/// * MADV_DONTDUMP (don't include in core dumps),
+/// * page-aligned addresses without padding.
+///
+/// madvise(MADV_FREE) call is slow: ~6 GiB/s (doesn't scale with more threads). Enabling transparent
+/// huge pages (MADV_HUGEPAGE) makes it 10x less slow, so we do that. That makes the physical RAM allocation
+/// work at 2 MiB granularity instead of 4 KiB, so the cache becomes less suitable for small files.
+/// If this turns out to be a problem, we may consider allowing different mmaps to have different flags,
+/// some having no huge pages.
+/// Note that we do our bookkeeping at small-page granularity even if huge pages are enabled.
+///
+/// It's unfortunate that Linux's MADV_FREE eviction doesn't use the two-list strategy like the real
+/// page cache (IIUC, MADV_FREE puts the pages at the head of the inactive list, and they can never
+/// get to the active list).
+/// If this turns out to be a problem, we could make PageCache do chunk eviction based on observed
+/// system memory usage, so that most eviction is done by us, and the MADV_FREE eviction kicks in
+/// only as a last resort. Then we can make PageCache's eviction policy arbitrarily more sophisticated.
+
+namespace DB
+{
+
+/// Hash of FileChunkAddress.
+using PageCacheKey = UInt128;
+
+/// Identifies a chunk of a file or object.
+/// We assume that contents of such file/object don't change (without file_version changing), so
+/// no cache invalidation is needed.
+struct FileChunkAddress
+{
+ /// Path, usually prefixed with storage system name and anything else needed to make it unique.
+/// E.g. "s3:<bucket>/<path>"
+ std::string path;
+ /// Optional string with ETag, or file modification time, or anything else.
+ std::string file_version;
+ size_t offset = 0;
+
+ PageCacheKey hash() const;
+
+ std::string toString() const;
+};
+
+struct AtomicBitSet
+{
+ size_t n = 0;
+ std::unique_ptr<std::atomic<UInt8>[]> v;
+
+ AtomicBitSet();
+
+ void init(size_t n);
+
+ bool get(size_t i) const;
+ bool any() const;
+ /// These return true if the bit was changed, false if it already had the target value.
+ /// (These methods are logically not const, but clang insists that I make them const, and
+ /// '#pragma clang diagnostic ignored' doesn't seem to work.)
+ bool set(size_t i) const;
+ bool set(size_t i, bool val) const;
+ bool unset(size_t i) const;
+ void unsetAll() const;
+};
+
+enum class PageChunkState
+{
+ /// Pages are not reclaimable by the OS, the buffer has correct contents.
+ Stable,
+ /// Pages are reclaimable by the OS, the buffer contents are altered (first bit of each page set to 1).
+ Limbo,
+};
+
+/// (This is a separate struct just in case we want to use this cache for other things in future.
+/// Then this struct would be the customization point, while the rest of PageChunk can stay unchanged.)
+struct FileChunkState
+{
+ std::mutex download_mutex;
+
+ void reset();
+};
+
+using PageChunkLRUListHook = boost::intrusive::list_base_hook<>;
+
+/// Cache entry.
+struct PageChunk : public PageChunkLRUListHook
+{
+ char * data;
+ size_t size; // in bytes
+ /// Page size for use in pages_populated and first_bit_of_each_page. Same as PageCache::pageSize().
+ size_t page_size;
+
+ /// Actual eviction granularity. Just for information. If huge pages are used, huge page size, otherwise page_size.
+ size_t big_page_size;
+
+ mutable FileChunkState state;
+
+ AtomicBitSet pages_populated;
+
+private:
+ friend class PinnedPageChunk;
+ friend class PageCache;
+
+ /// If nullopt, the chunk is "detached", i.e. not associated with any key.
+ /// Detached chunks may still be pinned. Chunk may get detached even while pinned, in particular when dropping cache.
+ /// Protected by global_mutex.
+ std::optional<PageCacheKey> key;
+
+ /// Refcount for usage of this chunk. When zero, the pages are reclaimable by the OS, and
+ /// the PageChunk itself is evictable (linked into PageCache::lru).
+ std::atomic<size_t> pin_count {0};
+
+ /// Bit mask containing the first bit of data from each page. Needed for the weird probing procedure when un-MADV_FREE-ing the pages.
+ AtomicBitSet first_bit_of_each_page;
+
+ /// Locked when changing pages_state, along with the corresponding expensive MADV_FREE/un-MADV_FREE operation.
+ mutable std::mutex chunk_mutex;
+
+ /// Normally pin_count == 0 <=> state == PageChunkState::Limbo,
+ /// pin_count > 0 <=> state == PageChunkState::Stable.
+ /// This separate field is needed because of synchronization: pin_count is changed with global_mutex locked,
+ /// this field is changed with chunk_mutex locked, and we never have to lock both mutexes at once.
+ PageChunkState pages_state = PageChunkState::Stable;
+};
+
+class PageCache;
+
+/// Handle for a cache entry. Neither the entry nor its pages can get evicted while there's at least one PinnedPageChunk pointing to it.
+class PinnedPageChunk
+{
+public:
+ const PageChunk * getChunk() const;
+
+ /// Sets the bit in pages_populated. Returns true if it actually changed (i.e. was previously 0).
+ bool markPagePopulated(size_t page_idx);
+
+ /// Calls markPagePopulated() for pages 0..ceil(bytes/page_size).
+ void markPrefixPopulated(size_t bytes);
+
+ bool isPrefixPopulated(size_t bytes) const;
+
+ PinnedPageChunk() = default;
+ ~PinnedPageChunk() noexcept;
+
+ PinnedPageChunk(PinnedPageChunk &&) noexcept;
+ PinnedPageChunk & operator=(PinnedPageChunk &&) noexcept;
+
+private:
+ friend class PageCache;
+
+ PageCache * cache = nullptr;
+ PageChunk * chunk = nullptr;
+
+ PinnedPageChunk(PageCache * cache_, PageChunk * chunk_) noexcept;
+};
+
+class PageCache
+{
+public:
+ PageCache(size_t bytes_per_chunk, size_t bytes_per_mmap, size_t bytes_total, bool use_madv_free, bool use_huge_pages);
+ ~PageCache();
+
+ /// Get or insert a chunk for the given key.
+ ///
+ /// If detached_if_missing = true, and the key is not present in the cache, the returned chunk
+ /// won't be associated with the key and will be evicted as soon as it's unpinned.
+ /// It's like "get if exists, otherwise return null", but instead of null we return a usable
+ /// temporary buffer, for convenience. Pinning and page eviction make the story more complicated:
+ /// * If the chunk for this key is pinned, we return it even if it's not fully populated
+ /// (because PageCache doesn't know what "fully populated" means).
+ /// * If the chunk exists, but some of its pages were evicted, we detach it. (Currently we only
+ /// check the first page here.)
+ PinnedPageChunk getOrSet(PageCacheKey key, bool detached_if_missing, bool inject_eviction);
+
+ /// OS page size, e.g. 4 KiB on x86, 4 KiB or 64 KiB on aarch64.
+ ///
+ /// If transparent huge pages are enabled, this is still the regular page size, and all our bookkeeping
+ /// is still based on regular page size (e.g. pages_populated), because (a) it's cheap anyway,
+ /// and (b) I'm not sure if Linux guarantees that MADV_FREE reclamation always happens at huge page
+ /// granularity, and wouldn't want to rely on this even if it does.
+ size_t pageSize() const;
+ size_t chunkSize() const;
+ size_t maxChunks() const;
+
+ struct MemoryStats
+ {
+ /// How many bytes of actual RAM are used for the cache pages. Doesn't include metadata
+ /// and overhead (e.g. PageChunk structs).
+ size_t page_cache_rss = 0;
+ /// Resident set size for the whole process, excluding any MADV_FREE pages (PageCache's or not).
+ /// This can be used as a more useful memory usage number for clickhouse server, instead of RSS.
+ /// Populated only if MADV_FREE is used, otherwise zero.
+ std::optional<size_t> unreclaimable_rss;
+ };
+
+ /// Reads /proc/self/smaps, so not very fast.
+ MemoryStats getResidentSetSize() const;
+
+ /// Total length of memory ranges currently pinned by PinnedPageChunk-s, including unpopulated pages.
+ size_t getPinnedSize() const;
+
+ /// Clears the key -> chunk mapping. Frees memory (MADV_DONTNEED) of all chunks that are not pinned.
+ /// Doesn't unmap any virtual memory. Detaches but doesn't free the pinned chunks.
+ /// Locks the global mutex for the duration of the operation, which may block queries for hundreds of milliseconds.
+ void dropCache();
+
+private:
+ friend class PinnedPageChunk;
+
+ struct Mmap
+ {
+ void * ptr = nullptr;
+ size_t size = 0;
+
+ std::unique_ptr<PageChunk[]> chunks;
+ size_t num_chunks = 0; // might be smaller than chunks_per_mmap_target because of alignment
+
+ Mmap(Mmap &&) noexcept;
+ Mmap(size_t bytes_per_page, size_t pages_per_chunk, size_t pages_per_big_page, size_t num_chunks, void * address_hint, bool use_huge_pages_);
+ ~Mmap() noexcept;
+ };
+
+ size_t bytes_per_page;
+ size_t pages_per_chunk;
+ size_t chunks_per_mmap_target;
+ size_t max_mmaps;
+ size_t pages_per_big_page = 1; // if huge pages are used, huge_page_size/page_size, otherwise 1
+ bool use_madv_free = true;
+ bool use_huge_pages = true;
+
+ mutable std::mutex global_mutex;
+
+ pcg64 rng;
+
+ std::vector<Mmap> mmaps;
+ size_t total_chunks = 0;
+
+ /// All non-pinned chunks, including ones not assigned to any file. Least recently used is begin().
+ boost::intrusive::list<PageChunk, boost::intrusive::base_hook<PageChunkLRUListHook>, boost::intrusive::constant_time_size<true>> lru;
+
+ HashMap<PageCacheKey, PageChunk *> chunk_by_key;
+
+ /// Get a usable chunk, doing eviction or allocation if needed.
+ /// Caller is responsible for clearing pages_populated.
+ PageChunk * getFreeChunk(std::unique_lock<std::mutex> & /* global_mutex */);
+ void addMmap(std::unique_lock<std::mutex> & /* global_mutex */);
+ void evictChunk(PageChunk * chunk, std::unique_lock<std::mutex> & /* global_mutex */);
+
+ void removeRef(PageChunk * chunk) noexcept;
+
+ /// These may run in parallel with getFreeChunk(), so be very careful about which fields of the PageChunk we touch here.
+ void sendChunkToLimbo(PageChunk * chunk, std::unique_lock<std::mutex> & /* chunk_mutex */) const noexcept;
+ /// Returns {pages_restored, pages_evicted}.
+ std::pair<size_t, size_t> restoreChunkFromLimbo(PageChunk * chunk, std::unique_lock<std::mutex> & /* chunk_mutex */) const noexcept;
+};
+
+using PageCachePtr = std::shared_ptr<PageCache>;
+
+}
diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp
index d8ca1ab9e93..87fcf220ff0 100644
--- a/src/Common/ProfileEvents.cpp
+++ b/src/Common/ProfileEvents.cpp
@@ -63,6 +63,15 @@
M(MarkCacheMisses, "Number of times an entry has not been found in the mark cache, so we had to load a mark file in memory, which is a costly operation, adding to query latency.") \
M(QueryCacheHits, "Number of times a query result has been found in the query cache (and query computation was avoided). Only updated for SELECT queries with SETTING use_query_cache = 1.") \
M(QueryCacheMisses, "Number of times a query result has not been found in the query cache (and required query computation). Only updated for SELECT queries with SETTING use_query_cache = 1.") \
+ /* Each page cache chunk access increments exactly one of the following 5 PageCacheChunk* counters. */ \
+ /* Something like hit rate: (PageCacheChunkShared + PageCacheChunkDataHits) / [sum of all 5]. */ \
+ M(PageCacheChunkMisses, "Number of times a chunk has not been found in the userspace page cache.") \
+ M(PageCacheChunkShared, "Number of times a chunk has been found in the userspace page cache, already in use by another thread.") \
+ M(PageCacheChunkDataHits, "Number of times a chunk has been found in the userspace page cache, not in use, with all pages intact.") \
+ M(PageCacheChunkDataPartialHits, "Number of times a chunk has been found in the userspace page cache, not in use, but some of its pages were evicted by the OS.") \
+ M(PageCacheChunkDataMisses, "Number of times a chunk has been found in the userspace page cache, not in use, but all its pages were evicted by the OS.") \
+ M(PageCacheBytesUnpinnedRoundedToPages, "Total size of populated pages in chunks that became evictable in PageCache. Rounded up to whole pages.") \
+ M(PageCacheBytesUnpinnedRoundedToHugePages, "See PageCacheBytesUnpinnedRoundedToPages, but rounded to huge pages. Use the ratio between the two as a measure of memory waste from using huge pages.") \
M(CreatedReadBufferOrdinary, "Number of times ordinary read buffer was created for reading data (while choosing among other read methods).") \
M(CreatedReadBufferDirectIO, "Number of times a read buffer with O_DIRECT was created for reading data (while choosing among other read methods).") \
M(CreatedReadBufferDirectIOFailed, "Number of times a read buffer with O_DIRECT was attempted to be created for reading data (while choosing among other read methods), but the OS did not allow it (due to lack of filesystem support or other reasons) and we fallen back to the ordinary reading method.") \
@@ -92,6 +101,8 @@
M(LocalWriteThrottlerBytes, "Bytes passed through 'max_local_write_bandwidth_for_server'/'max_local_write_bandwidth' throttler.") \
M(LocalWriteThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_local_write_bandwidth_for_server'/'max_local_write_bandwidth' throttling.") \
M(ThrottlerSleepMicroseconds, "Total time a query was sleeping to conform all throttling settings.") \
+ M(PartsWithAppliedMutationsOnFly, "Total number of parts for which there was any mutation applied on the fly") \
+ M(MutationsAppliedOnFlyInAllParts, "The sum of the number of mutations applied on the fly, over all read parts") \
\
M(QueryMaskingRulesMatch, "Number of times query masking rules was successfully matched.") \
\
@@ -311,6 +322,12 @@ The server successfully detected this situation and will download merged part fr
M(ParallelReplicasProcessingPartsMicroseconds, "Time spent processing data parts") \
M(ParallelReplicasStealingLeftoversMicroseconds, "Time spent collecting orphaned segments") \
M(ParallelReplicasCollectingOwnedSegmentsMicroseconds, "Time spent collecting segments meant by hash") \
+ M(ParallelReplicasNumRequests, "Number of requests to the initiator.") \
+ M(ParallelReplicasDeniedRequests, "Number of completely denied requests to the initiator") \
+ M(CacheWarmerBytesDownloaded, "Amount of data fetched into filesystem cache by dedicated background threads.") \
+ M(CacheWarmerDataPartsDownloaded, "Number of data parts that were fully fetched by CacheWarmer.") \
+ M(IgnoredColdParts, "See setting ignore_cold_parts_seconds. Number of times read queries ignored very new parts that weren't pulled into cache by CacheWarmer yet.") \
+ M(PreferredWarmedUnmergedParts, "See setting prefer_warmed_unmerged_parts_seconds. Number of times read queries used outdated pre-merge parts that are in cache instead of the merged part that wasn't pulled into cache by CacheWarmer yet.") \
\
M(PerfCPUCycles, "Total cycles. Be wary of what happens during CPU frequency scaling.") \
M(PerfInstructions, "Retired instructions. Be careful, these can be affected by various issues, most notably hardware interrupt counts.") \
@@ -516,6 +533,21 @@ The server successfully detected this situation and will download merged part fr
M(AggregationPreallocatedElementsInHashTables, "How many elements were preallocated in hash tables for aggregation.") \
M(AggregationHashTablesInitializedAsTwoLevel, "How many hash tables were inited as two-level for aggregation.") \
\
+ M(MetadataFromKeeperCacheHit, "Number of times an object storage metadata request was answered from cache without making a request to Keeper") \
+ M(MetadataFromKeeperCacheMiss, "Number of times an object storage metadata request had to be answered from Keeper") \
+ M(MetadataFromKeeperCacheUpdateMicroseconds, "Total time spent in updating the cache including waiting for responses from Keeper") \
+ M(MetadataFromKeeperUpdateCacheOneLevel, "Number of times a cache update for one level of directory tree was done") \
+ M(MetadataFromKeeperTransactionCommit, "Number of times metadata transaction commit was attempted") \
+ M(MetadataFromKeeperTransactionCommitRetry, "Number of times metadata transaction commit was retried") \
+ M(MetadataFromKeeperCleanupTransactionCommit, "Number of times metadata transaction commit for deleted objects cleanup was attempted") \
+ M(MetadataFromKeeperCleanupTransactionCommitRetry, "Number of times metadata transaction commit for deleted objects cleanup was retried") \
+ M(MetadataFromKeeperOperations, "Number of times a request was made to Keeper") \
+ M(MetadataFromKeeperIndividualOperations, "Number of paths read or written by single or multi requests to Keeper") \
+ M(MetadataFromKeeperReconnects, "Number of times a reconnect to Keeper was done") \
+ M(MetadataFromKeeperBackgroundCleanupObjects, "Number of times an old deleted object cleanup was performed by the background task") \
+ M(MetadataFromKeeperBackgroundCleanupTransactions, "Number of times an old transaction idempotency token was cleaned up by the background task") \
+ M(MetadataFromKeeperBackgroundCleanupErrors, "Number of times an error was encountered in the background cleanup task") \
+ \
M(KafkaRebalanceRevocations, "Number of partition revocations (the first stage of consumer group rebalance)") \
M(KafkaRebalanceAssignments, "Number of partition assignments (the final stage of consumer group rebalance)") \
M(KafkaRebalanceErrors, "Number of failed consumer group rebalances") \
@@ -607,9 +639,32 @@ The server successfully detected this situation and will download merged part fr
M(MergeTreeAllRangesAnnouncementsSentElapsedMicroseconds, "Time spent in sending the announcement from the remote server to the initiator server about the set of data parts (for MergeTree tables). Measured on the remote server side.") \
\
M(ConnectionPoolIsFullMicroseconds, "Total time spent waiting for a slot in connection pool.") \
- \
M(AsyncLoaderWaitMicroseconds, "Total time a query was waiting for async loader jobs.") \
\
+ M(DistrCacheServerSwitches, "Number of server switches between distributed cache servers in read/write-through cache") \
+ M(DistrCacheReadMicroseconds, "Time spent reading from distributed cache") \
+ M(DistrCacheFallbackReadMicroseconds, "Time spent reading from fallback buffer instead of distributed cache") \
+ M(DistrCachePrecomputeRangesMicroseconds, "Time spent to precompute read ranges") \
+ M(DistrCacheNextImplMicroseconds, "Time spent in ReadBufferFromDistributedCache::nextImpl") \
+ M(DistrCacheOpenedConnections, "The number of open connections to distributed cache") \
+ M(DistrCacheReusedConnections, "The number of reused connections to distributed cache") \
+ M(DistrCacheHoldConnections, "The number of connections to distributed cache currently held") \
+ \
+ M(DistrCacheGetResponseMicroseconds, "Time spent waiting for a response from distributed cache") \
+ M(DistrCacheStartRangeMicroseconds, "Time spent to start a new read range with distributed cache") \
+ M(DistrCacheLockRegistryMicroseconds, "Time spent to take DistributedCacheRegistry lock") \
+ M(DistrCacheUnusedPackets, "Number of skipped unused packets from distributed cache") \
+ M(DistrCachePackets, "Total number of packets received from distributed cache") \
+ M(DistrCacheUnusedPacketsBytes, "The number of bytes in Data packets which were ignored") \
+ M(DistrCacheRegistryUpdateMicroseconds, "Time spent updating distributed cache registry") \
+ M(DistrCacheRegistryUpdates, "Number of distributed cache registry updates") \
+ \
+ M(DistrCacheConnectMicroseconds, "The time spent to connect to distributed cache") \
+ M(DistrCacheConnectAttempts, "The number of connection attempts to distributed cache") \
+ M(DistrCacheGetClient, "Number of times a distributed cache client was acquired") \
+ \
+ M(DistrCacheServerProcessRequestMicroseconds, "Time spent processing request on DistributedCache server side") \
+ \
M(LogTest, "Number of log messages with level Test") \
M(LogTrace, "Number of log messages with level Trace") \
M(LogDebug, "Number of log messages with level Debug") \
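
Aside: the hit-rate comment at the top of the PageCacheChunk* counters translates directly into code. A hedged sketch follows; the struct and its field names are invented, and the inputs are the five counter values as read from the server.

/// "Something like hit rate": chunks found and fully usable, over all chunk accesses.
struct PageCacheChunkStats
{
    double misses = 0;            // PageCacheChunkMisses
    double shared = 0;            // PageCacheChunkShared
    double data_hits = 0;         // PageCacheChunkDataHits
    double data_partial_hits = 0; // PageCacheChunkDataPartialHits
    double data_misses = 0;       // PageCacheChunkDataMisses

    double hitRate() const
    {
        double total = misses + shared + data_hits + data_partial_hits + data_misses;
        return total == 0 ? 0.0 : (shared + data_hits) / total;
    }
};
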
diff --git a/src/Common/ThreadStatus.cpp b/src/Common/ThreadStatus.cpp
index 05524a5d6b9..cf50d305e95 100644
--- a/src/Common/ThreadStatus.cpp
+++ b/src/Common/ThreadStatus.cpp
@@ -196,8 +196,9 @@ bool ThreadStatus::isQueryCanceled() const
if (!thread_group)
return false;
- chassert(local_data.query_is_canceled_predicate);
- return local_data.query_is_canceled_predicate();
+ if (local_data.query_is_canceled_predicate)
+ return local_data.query_is_canceled_predicate();
+ return false;
}
ThreadStatus::~ThreadStatus()
diff --git a/src/Common/ZooKeeper/IKeeper.h b/src/Common/ZooKeeper/IKeeper.h
index 00db4e60afc..c7b902ea03a 100644
--- a/src/Common/ZooKeeper/IKeeper.h
+++ b/src/Common/ZooKeeper/IKeeper.h
@@ -8,6 +8,7 @@
#include
#include
#include
+#include <span>
#include
/** Generic interface for ZooKeeper-like services.
@@ -622,6 +623,10 @@ public:
int32_t version,
ReconfigCallback callback) = 0;
+ virtual void multi(
+ std::span<const RequestPtr> requests,
+ MultiCallback callback) = 0;
+
virtual void multi(
const Requests & requests,
MultiCallback callback) = 0;
diff --git a/src/Common/ZooKeeper/TestKeeper.cpp b/src/Common/ZooKeeper/TestKeeper.cpp
index a25329ad7c0..fce29a21e15 100644
--- a/src/Common/ZooKeeper/TestKeeper.cpp
+++ b/src/Common/ZooKeeper/TestKeeper.cpp
@@ -157,6 +157,10 @@ struct TestKeeperReconfigRequest final : ReconfigRequest, TestKeeperRequest
struct TestKeeperMultiRequest final : MultiRequest, TestKeeperRequest
{
explicit TestKeeperMultiRequest(const Requests & generic_requests)
+ : TestKeeperMultiRequest(std::span(generic_requests))
+ {}
+
+ explicit TestKeeperMultiRequest(std::span<const RequestPtr> generic_requests)
{
requests.reserve(generic_requests.size());
@@ -883,6 +887,13 @@ void TestKeeper::reconfig(
void TestKeeper::multi(
const Requests & requests,
MultiCallback callback)
+{
+ return multi(std::span(requests), std::move(callback));
+}
+
+void TestKeeper::multi(
+ std::span<const RequestPtr> requests,
+ MultiCallback callback)
{
TestKeeperMultiRequest request(requests);
diff --git a/src/Common/ZooKeeper/TestKeeper.h b/src/Common/ZooKeeper/TestKeeper.h
index 36db5accff1..2774055652c 100644
--- a/src/Common/ZooKeeper/TestKeeper.h
+++ b/src/Common/ZooKeeper/TestKeeper.h
@@ -101,6 +101,10 @@ public:
const Requests & requests,
MultiCallback callback) override;
+ void multi(
+ std::span<const RequestPtr> requests,
+ MultiCallback callback) override;
+
void finalize(const String & reason) override;
bool isFeatureEnabled(DB::KeeperFeatureFlag) const override
diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp
index 93568909041..ca0a211c716 100644
--- a/src/Common/ZooKeeper/ZooKeeper.cpp
+++ b/src/Common/ZooKeeper/ZooKeeper.cpp
@@ -1266,6 +1266,11 @@ std::future<Coordination::MultiResponse> ZooKeeper::asyncTryMultiNoThrow(const
}
std::future<Coordination::MultiResponse> ZooKeeper::asyncTryMultiNoThrow(const Coordination::Requests & ops)
+{
+ return asyncTryMultiNoThrow(std::span(ops));
+}
+
+std::future<Coordination::MultiResponse> ZooKeeper::asyncTryMultiNoThrow(std::span<const Coordination::RequestPtr> ops)
{
auto promise = std::make_shared<std::promise<Coordination::MultiResponse>>();
auto future = promise->get_future();
diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h
index 5d43a1159f6..42c77e5ca72 100644
--- a/src/Common/ZooKeeper/ZooKeeper.h
+++ b/src/Common/ZooKeeper/ZooKeeper.h
@@ -550,6 +550,7 @@ public:
FutureMulti asyncMulti(const Coordination::Requests & ops);
/// Like the previous one but don't throw any exceptions on future.get()
FutureMulti asyncTryMultiNoThrow(const Coordination::Requests & ops);
+ FutureMulti asyncTryMultiNoThrow(std::span<const Coordination::RequestPtr> ops);
using FutureSync = std::future;
FutureSync asyncSync(const std::string & path);
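
Aside: the point of the new std::span overloads is that a caller can hand Keeper any contiguous slice of an existing Coordination::Requests vector without copying requests into a fresh vector. A hedged usage sketch; the function and batch size are invented for illustration.

#include <algorithm>
#include <future>
#include <span>
#include <vector>
#include <Common/ZooKeeper/ZooKeeper.h>

/// Submit a large list of ops in fixed-size sub-batches, reusing the storage of `ops`.
void submitInBatches(zkutil::ZooKeeper & zk, const Coordination::Requests & ops)
{
    constexpr size_t batch_size = 100;
    std::vector<std::future<Coordination::MultiResponse>> futures;
    for (size_t i = 0; i < ops.size(); i += batch_size)
    {
        size_t len = std::min(batch_size, ops.size() - i);
        /// std::span(ops) deduces std::span<const Coordination::RequestPtr>;
        /// subspan() is a view, so no requests are copied.
        futures.push_back(zk.asyncTryMultiNoThrow(std::span(ops).subspan(i, len)));
    }
    for (auto & future : futures)
        future.get(); // NoThrow variant: errors are reported inside MultiResponse, not thrown
}
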
diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.cpp b/src/Common/ZooKeeper/ZooKeeperCommon.cpp
index 660ae59e81e..4634eae7759 100644
--- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp
+++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp
@@ -156,6 +156,12 @@ std::string ZooKeeperAuthRequest::toStringImpl() const
void ZooKeeperCreateRequest::writeImpl(WriteBuffer & out) const
{
+ /// See https://github.com/ClickHouse/clickhouse-private/issues/3029
+ if (path.starts_with("/clickhouse/tables/") && path.find("/parts/") != std::string::npos)
+ {
+ LOG_TRACE(getLogger(__PRETTY_FUNCTION__), "Creating part at path {}", path);
+ }
+
Coordination::write(path, out);
Coordination::write(data, out);
Coordination::write(acls, out);
@@ -480,6 +486,10 @@ OpNum ZooKeeperMultiRequest::getOpNum() const
}
ZooKeeperMultiRequest::ZooKeeperMultiRequest(const Requests & generic_requests, const ACLs & default_acls)
+ : ZooKeeperMultiRequest(std::span{generic_requests}, default_acls)
+{}
+
+ZooKeeperMultiRequest::ZooKeeperMultiRequest(std::span<const Coordination::RequestPtr> generic_requests, const ACLs & default_acls)
{
/// Convert nested Requests to ZooKeeperRequests.
/// Note that deep copy is required to avoid modifying path in presence of chroot prefix.
diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.h b/src/Common/ZooKeeper/ZooKeeperCommon.h
index 5289be7a816..a1bd9b582e9 100644
--- a/src/Common/ZooKeeper/ZooKeeperCommon.h
+++ b/src/Common/ZooKeeper/ZooKeeperCommon.h
@@ -7,17 +7,13 @@
#include
#include
#include
-#include