Merge remote-tracking branch 'upstream/master' into HEAD

This commit is contained in:
Anton Popov 2024-03-05 16:18:44 +00:00
commit 73d78e8ec2
155 changed files with 3041 additions and 685 deletions


@@ -2,6 +2,7 @@
 .hidden __syscall
 .type __syscall,%function
 __syscall:
+.cfi_startproc
 uxtw x8,w0
 mov x0,x1
 mov x1,x2
@@ -12,3 +13,4 @@ __syscall:
 mov x6,x7
 svc 0
 ret
+.cfi_endproc

contrib/cctz vendored

@@ -1 +1 @@
-Subproject commit 8529bcef5cd996b7c0f4d7475286b76b5d126c4c
+Subproject commit 7918cb7afe82e53428e39a045a437fdfd4f3df47


@@ -386,7 +386,8 @@ if [ -f core.zst ]; then
 CORE_LINK='<a href="core.zst">core.zst</a>'
 fi
-rg --text -F '<Fatal>' server.log > fatal.log ||:
+# Keep all the lines in the paragraphs containing <Fatal> that either contain <Fatal> or don't start with 20... (year)
+sed -n '/<Fatal>/,/^$/p' server.log | awk '/<Fatal>/ || !/^20/' > fatal.log ||:
 FATAL_LINK=''
 if [ -s fatal.log ]; then
 FATAL_LINK='<a href="fatal.log">fatal.log</a>'


@@ -0,0 +1,29 @@
---
sidebar_position: 1
sidebar_label: 2024
---
# 2024 Changelog
### ClickHouse release v23.3.20.27-lts (cc974ba4f81) FIXME as compared to v23.3.19.32-lts (c4d4ca8ec02)
#### Improvement
* Backported in [#58818](https://github.com/ClickHouse/ClickHouse/issues/58818): Add `SYSTEM JEMALLOC PURGE` for purging unused jemalloc pages and `SYSTEM JEMALLOC [ ENABLE | DISABLE | FLUSH ] PROFILE` for controlling the jemalloc profile if the profiler is enabled. Also add jemalloc-related 4LW commands in Keeper: `jmst` for dumping jemalloc stats, and `jmfp`, `jmep`, `jmdp` for controlling the jemalloc profile if the profiler is enabled. [#58665](https://github.com/ClickHouse/ClickHouse/pull/58665) ([Antonio Andelic](https://github.com/antonio2368)).
#### Build/Testing/Packaging Improvement
* Backported in [#59877](https://github.com/ClickHouse/ClickHouse/issues/59877): If you want initdb scripts to run every time the ClickHouse container starts, set the environment variable CLICKHOUSE_ALWAYS_RUN_INITDB_SCRIPTS. [#59808](https://github.com/ClickHouse/ClickHouse/pull/59808) ([Alexander Nikolaev](https://github.com/AlexNik)).
#### Bug Fix (user-visible misbehavior in an official stable release)
* Fix working with read buffers in StreamingFormatExecutor [#57438](https://github.com/ClickHouse/ClickHouse/pull/57438) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix double destroy call on exception throw in addBatchLookupTable8 [#58745](https://github.com/ClickHouse/ClickHouse/pull/58745) ([Raúl Marín](https://github.com/Algunenano)).
* Fix: LIMIT BY and LIMIT in distributed query [#59153](https://github.com/ClickHouse/ClickHouse/pull/59153) ([Igor Nikonov](https://github.com/devcrafter)).
* Fix translate() with FixedString input [#59356](https://github.com/ClickHouse/ClickHouse/pull/59356) ([Raúl Marín](https://github.com/Algunenano)).
* Fix leftPad / rightPad function with FixedString input [#59739](https://github.com/ClickHouse/ClickHouse/pull/59739) ([Raúl Marín](https://github.com/Algunenano)).
* Fix cosineDistance crash with Nullable [#60150](https://github.com/ClickHouse/ClickHouse/pull/60150) ([Raúl Marín](https://github.com/Algunenano)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Fix possible race in ManyAggregatedData dtor. [#58624](https://github.com/ClickHouse/ClickHouse/pull/58624) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Make ZooKeeper actually sequentially consistent [#59735](https://github.com/ClickHouse/ClickHouse/pull/59735) ([Alexander Tokmakov](https://github.com/tavplubix)).


@@ -0,0 +1,39 @@
---
sidebar_position: 1
sidebar_label: 2024
---
# 2024 Changelog
### ClickHouse release v23.8.10.43-lts (a278225bba9) FIXME as compared to v23.8.9.54-lts (192a1d231fa)
#### Improvement
* Backported in [#58819](https://github.com/ClickHouse/ClickHouse/issues/58819): Add `SYSTEM JEMALLOC PURGE` for purging unused jemalloc pages and `SYSTEM JEMALLOC [ ENABLE | DISABLE | FLUSH ] PROFILE` for controlling the jemalloc profile if the profiler is enabled. Also add jemalloc-related 4LW commands in Keeper: `jmst` for dumping jemalloc stats, and `jmfp`, `jmep`, `jmdp` for controlling the jemalloc profile if the profiler is enabled. [#58665](https://github.com/ClickHouse/ClickHouse/pull/58665) ([Antonio Andelic](https://github.com/antonio2368)).
* Backported in [#60286](https://github.com/ClickHouse/ClickHouse/issues/60286): When copying S3 files on GCP, fall back to a buffer copy if GCP returns `Internal Error` with the `GATEWAY_TIMEOUT` HTTP error code. [#60164](https://github.com/ClickHouse/ClickHouse/pull/60164) ([Maksim Kita](https://github.com/kitaisreal)).
#### Build/Testing/Packaging Improvement
* Backported in [#59879](https://github.com/ClickHouse/ClickHouse/issues/59879): If you want initdb scripts to run every time the ClickHouse container starts, set the environment variable CLICKHOUSE_ALWAYS_RUN_INITDB_SCRIPTS. [#59808](https://github.com/ClickHouse/ClickHouse/pull/59808) ([Alexander Nikolaev](https://github.com/AlexNik)).
#### Bug Fix (user-visible misbehavior in an official stable release)
* Background merges correctly use temporary data storage in the cache [#57275](https://github.com/ClickHouse/ClickHouse/pull/57275) ([vdimir](https://github.com/vdimir)).
* MergeTree mutations reuse source part index granularity [#57352](https://github.com/ClickHouse/ClickHouse/pull/57352) ([Maksim Kita](https://github.com/kitaisreal)).
* Fix double destroy call on exception throw in addBatchLookupTable8 [#58745](https://github.com/ClickHouse/ClickHouse/pull/58745) ([Raúl Marín](https://github.com/Algunenano)).
* Fix JSONExtract function for LowCardinality(Nullable) columns [#58808](https://github.com/ClickHouse/ClickHouse/pull/58808) ([vdimir](https://github.com/vdimir)).
* Fix: LIMIT BY and LIMIT in distributed query [#59153](https://github.com/ClickHouse/ClickHouse/pull/59153) ([Igor Nikonov](https://github.com/devcrafter)).
* Fix translate() with FixedString input [#59356](https://github.com/ClickHouse/ClickHouse/pull/59356) ([Raúl Marín](https://github.com/Algunenano)).
* Fix error "Read beyond last offset" for AsynchronousBoundedReadBuffer [#59630](https://github.com/ClickHouse/ClickHouse/pull/59630) ([Vitaly Baranov](https://github.com/vitlibar)).
* Fix query start time on non initial queries [#59662](https://github.com/ClickHouse/ClickHouse/pull/59662) ([Raúl Marín](https://github.com/Algunenano)).
* Fix leftPad / rightPad function with FixedString input [#59739](https://github.com/ClickHouse/ClickHouse/pull/59739) ([Raúl Marín](https://github.com/Algunenano)).
* rabbitmq: fix having neither acked nor nacked messages [#59775](https://github.com/ClickHouse/ClickHouse/pull/59775) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix cosineDistance crash with Nullable [#60150](https://github.com/ClickHouse/ClickHouse/pull/60150) ([Raúl Marín](https://github.com/Algunenano)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Fix rare race in external sort/aggregation with temporary data in cache [#58013](https://github.com/ClickHouse/ClickHouse/pull/58013) ([Anton Popov](https://github.com/CurtizJ)).
* Fix possible race in ManyAggregatedData dtor. [#58624](https://github.com/ClickHouse/ClickHouse/pull/58624) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Fix 02720_row_policy_column_with_dots [#59453](https://github.com/ClickHouse/ClickHouse/pull/59453) ([Duc Canh Le](https://github.com/canhld94)).
* Pin python dependencies in stateless tests [#59663](https://github.com/ClickHouse/ClickHouse/pull/59663) ([Raúl Marín](https://github.com/Algunenano)).
* Make ZooKeeper actually sequentially consistent [#59735](https://github.com/ClickHouse/ClickHouse/pull/59735) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Remove broken test while we fix it [#60547](https://github.com/ClickHouse/ClickHouse/pull/60547) ([Raúl Marín](https://github.com/Algunenano)).


@@ -14,20 +14,6 @@ Supported platforms:
 - PowerPC 64 LE (experimental)
 - RISC-V 64 (experimental)
-## Building in docker
-We use the docker image `clickhouse/binary-builder` for our CI builds. It contains everything necessary to build the binary and packages. There is a script `docker/packager/packager` to ease using the image:
-```bash
-# define a directory for the output artifacts
-output_dir="build_results"
-# the simplest build
-./docker/packager/packager --package-type=binary --output-dir "$output_dir"
-# build debian packages
-./docker/packager/packager --package-type=deb --output-dir "$output_dir"
-# by default, debian packages use thin LTO, so we can override it to speed up the build
-CMAKE_FLAGS='-DENABLE_THINLTO=' ./docker/packager/packager --package-type=deb --output-dir "./$(git rev-parse --show-cdup)/build_results"
-```
 ## Building on Ubuntu
 The following tutorial is based on Ubuntu Linux.
@@ -37,6 +23,7 @@ The minimum recommended Ubuntu version for development is 22.04 LTS.
 ### Install Prerequisites {#install-prerequisites}
 ``` bash
+sudo apt-get update
 sudo apt-get install git cmake ccache python3 ninja-build nasm yasm gawk lsb-release wget software-properties-common gnupg
 ```
@@ -57,7 +44,7 @@ sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
 For other Linux distributions - check the availability of LLVM's [prebuilt packages](https://releases.llvm.org/download.html).
-As of August 2023, clang-16 or higher will work.
+As of March 2024, clang-17 or higher will work.
 GCC as a compiler is not supported.
 To build with a specific Clang version:
@@ -67,8 +54,8 @@ to see what version you have installed before setting this environment variable.
 :::
 ``` bash
-export CC=clang-17
-export CXX=clang++-17
+export CC=clang-18
+export CXX=clang++-18
 ```
 ### Checkout ClickHouse Sources {#checkout-clickhouse-sources}
@@ -133,3 +120,17 @@ mkdir build
 cmake -S . -B build
 cmake --build build
 ```
+## Building in docker
+We use the docker image `clickhouse/binary-builder` for our CI builds. It contains everything necessary to build the binary and packages. There is a script `docker/packager/packager` to ease using the image:
+```bash
+# define a directory for the output artifacts
+output_dir="build_results"
+# the simplest build
+./docker/packager/packager --package-type=binary --output-dir "$output_dir"
+# build debian packages
+./docker/packager/packager --package-type=deb --output-dir "$output_dir"
+# by default, debian packages use thin LTO, so we can override it to speed up the build
+CMAKE_FLAGS='-DENABLE_THINLTO=' ./docker/packager/packager --package-type=deb --output-dir "./$(git rev-parse --show-cdup)/build_results"
+```


@@ -7,6 +7,7 @@ title: Formats for Input and Output Data
 ClickHouse can accept and return data in various formats. A format supported for input can be used to parse the data provided to `INSERT`s, to perform `SELECT`s from a file-backed table such as File, URL or HDFS, or to read a dictionary. A format supported for output can be used to arrange the
 results of a `SELECT`, and to perform `INSERT`s into a file-backed table.
+All format names are case insensitive.
 The supported formats are:
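As a small illustration of the case-insensitivity note added above (a minimal sketch; any capitalization of a format name is accepted):

```sql
-- Both statements request the same output format:
SELECT 1 AS x FORMAT JSONEachRow;
SELECT 1 AS x FORMAT jsoneachrow;
```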


@@ -549,6 +549,48 @@ Result:
 └───────┴─────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
 ```
##### input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects
Enabling this setting allows using the String type for ambiguous paths during named tuple inference from JSON objects (when `input_format_json_try_infer_named_tuples_from_objects` is enabled) instead of throwing an exception.
This makes it possible to read JSON objects as named Tuples even when there are ambiguous paths.
Disabled by default.
**Examples**
With disabled setting:
```sql
SET input_format_json_try_infer_named_tuples_from_objects = 1;
SET input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = 0;
DESC format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : "Hello"}}}');
```
Result:
```text
Code: 636. DB::Exception: The table structure cannot be extracted from a JSONEachRow format file. Error:
Code: 117. DB::Exception: JSON objects have ambiguous data: in some objects path 'a' has type 'Int64' and in some - 'Tuple(b String)'. You can enable setting input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type for path 'a'. (INCORRECT_DATA) (version 24.3.1.1).
You can specify the structure manually. (CANNOT_EXTRACT_TABLE_STRUCTURE)
```
With enabled setting:
```sql
SET input_format_json_try_infer_named_tuples_from_objects = 1;
SET input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = 1;
DESC format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : "Hello"}}}');
SELECT * FROM format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : "Hello"}}}');
```
Result:
```text
┌─name─┬─type──────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ obj │ Tuple(a Nullable(String)) │ │ │ │ │ │
└──────┴───────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
┌─obj─────────────────┐
│ ('42') │
│ ('{"b" : "Hello"}') │
└─────────────────────┘
```
##### input_format_json_read_objects_as_strings
Enabling this setting allows reading nested JSON objects as strings.
@@ -1554,6 +1596,28 @@ DESC format(JSONEachRow, $$
 └──────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
 ```
#### input_format_try_infer_exponent_floats
If enabled, ClickHouse will try to infer floats in exponential form for text formats (except JSON where numbers in exponential form are always inferred).
Disabled by default.
**Example**
```sql
SET input_format_try_infer_exponent_floats = 1;
DESC format(CSV,
$$1.1E10
2.3e-12
42E00
$$)
```
```response
┌─name─┬─type──────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ c1 │ Nullable(Float64) │ │ │ │ │ │
└──────┴───────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
 ## Self describing formats {#self-describing-formats}
 Self-describing formats contain information about the structure of the data in the data itself,


@@ -275,6 +275,16 @@ Cache profile events:
 - `CachedWriteBufferCacheWriteBytes`, `CachedWriteBufferCacheWriteMicroseconds`
+## Using in-memory cache (userspace page cache) {#userspace-page-cache}
+The File Cache described above stores cached data in local files. Alternatively, object-store-based disks can be configured to use the "userspace page cache", which is RAM-only. The userspace page cache is recommended only if the file cache can't be used for some reason, e.g. if the machine doesn't have a local disk at all. Note that the file cache effectively uses RAM for caching too, since the OS caches the contents of local files.
+To enable the userspace page cache for disks that don't use the file cache, use the setting `use_page_cache_for_disks_without_file_cache`.
+By default, on Linux, the userspace page cache will use all available memory, similar to the OS page cache. In tools like `top` and `ps`, the clickhouse server process will typically show a resident set size near 100% of the machine's RAM - this is normal, since most of this memory is actually reclaimable by the OS under memory pressure (`MADV_FREE`). This behavior can be disabled with the server setting `page_cache_use_madv_free = 0`, making the userspace page cache use a fixed amount of memory (`page_cache_size`) with no special interaction with the OS. On macOS, `page_cache_use_madv_free` is always disabled, as macOS doesn't have lazy `MADV_FREE`.
+Unfortunately, `page_cache_use_madv_free` makes it difficult to tell whether the server is close to running out of memory, since the RSS metric becomes useless. The asynchronous metric `UnreclaimableRSS` shows the amount of physical memory used by the server, excluding the memory reclaimable by the OS: `select value from system.asynchronous_metrics where metric = 'UnreclaimableRSS'`. Use it for monitoring instead of RSS. This metric is only available if `page_cache_use_madv_free` is enabled.
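A short, hedged sketch of putting these knobs together (setting and metric names are the ones introduced above; note that `page_cache_size` and `page_cache_use_madv_free` are server-level settings, not session settings):

```sql
-- Opt a query's object-store disks without file cache into the userspace page cache:
SET use_page_cache_for_disks_without_file_cache = 1;

-- Monitor real memory usage instead of RSS when page_cache_use_madv_free is enabled:
SELECT value
FROM system.asynchronous_metrics
WHERE metric = 'UnreclaimableRSS';
```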
 ## Storing Data on Web Server {#storing-data-on-webserver}
 There is a tool `clickhouse-static-files-uploader`, which prepares a data directory for a given table (`SELECT data_paths FROM system.tables WHERE name = 'table_name'`). For each table you need, you get a directory of files. These files can be uploaded to, for example, a web server with static files. After this preparation, you can load this table into any ClickHouse server via `DiskWeb`.


@@ -56,9 +56,9 @@ ls -1 flightlist_*.csv.gz | xargs -P100 -I{} bash -c 'gzip -c -d "{}" | clickhou
 - Here we pass the file list (`ls -1 flightlist_*.csv.gz`) to `xargs` for parallel processing. `xargs -P100` specifies up to 100 parallel workers, but since we only have 30 files, there will be only 30 workers.
 - For each file, `xargs` runs a script via `bash -c`. The script uses `{}` as a placeholder for the file name, which `xargs` then substitutes (because of `-I{}`).
 - The script decompresses the file (`gzip -c -d "{}"`) to standard output (the `-c` flag) and redirects the output to `clickhouse-client`.
-- We also ask for the [DateTime](../../sql-reference/data-types/datetime.md) fields to be parsed with the extended parser ([--date_time_input_format best_effort](../../operations/settings/settings.md#settings-date_time_input_format)) so that ISO-8601 timestamps with timezone offsets are recognized.
+- We also ask for the [DateTime](/docs/zh/sql-reference/data-types/datetime.md) fields to be parsed with the extended parser ([--date_time_input_format best_effort](/docs/zh/operations/settings/settings.md#settings-date_time_input_format)) so that ISO-8601 timestamps with timezone offsets are recognized.
-Finally, `clickhouse-client` reads the input data in [CSVWithNames](../../interfaces/formats.md#csvwithnames) format and performs the insert.
+Finally, `clickhouse-client` reads the input data in [CSVWithNames](/docs/zh/interfaces/formats.md#csvwithnames) format and performs the insert.
 The parallel import takes 24 seconds.
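A quick sanity check after the import (hedged: `opensky` is the table name this guide creates earlier, outside the lines shown in this hunk):

```sql
-- Verify that all rows arrived:
SELECT count() FROM opensky;
```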


@@ -1228,6 +1228,13 @@ try
 }
 global_context->setMarkCache(mark_cache_policy, mark_cache_size, mark_cache_size_ratio);
+size_t page_cache_size = server_settings.page_cache_size;
+if (page_cache_size != 0)
+    global_context->setPageCache(
+        server_settings.page_cache_chunk_size, server_settings.page_cache_mmap_size,
+        page_cache_size, server_settings.page_cache_use_madv_free,
+        server_settings.page_cache_use_transparent_huge_pages);
 String index_uncompressed_cache_policy = server_settings.index_uncompressed_cache_policy;
 size_t index_uncompressed_cache_size = server_settings.index_uncompressed_cache_size;
 double index_uncompressed_cache_size_ratio = server_settings.index_uncompressed_cache_size_ratio;
@@ -1874,7 +1881,6 @@ try
 {
 total_memory_tracker.setSampleMaxAllocationSize(server_settings.total_memory_profiler_sample_max_allocation_size);
 }
 }
 #endif
@@ -1889,10 +1895,6 @@ try
 " when two different stack unwinding methods will interfere with each other.");
 #endif
-#if !defined(__x86_64__)
-LOG_INFO(log, "Query Profiler and TraceCollector is only tested on x86_64. It also known to not work under qemu-user.");
-#endif
 if (!hasPHDRCache())
 LOG_INFO(log, "Query Profiler and TraceCollector are disabled because they require PHDR cache to be created"
 " (otherwise the function 'dl_iterate_phdr' is not lock free and not async-signal safe).");


@@ -163,6 +163,7 @@ enum class AccessType
 M(SYSTEM_DROP_FILESYSTEM_CACHE, "SYSTEM DROP FILESYSTEM CACHE, DROP FILESYSTEM CACHE", GLOBAL, SYSTEM_DROP_CACHE) \
 M(SYSTEM_DROP_DISTRIBUTED_CACHE, "SYSTEM DROP DISTRIBUTED CACHE, DROP DISTRIBUTED CACHE", GLOBAL, SYSTEM_DROP_CACHE) \
 M(SYSTEM_SYNC_FILESYSTEM_CACHE, "SYSTEM REPAIR FILESYSTEM CACHE, REPAIR FILESYSTEM CACHE, SYNC FILESYSTEM CACHE", GLOBAL, SYSTEM) \
+M(SYSTEM_DROP_PAGE_CACHE, "SYSTEM DROP PAGE CACHE, DROP PAGE CACHE", GLOBAL, SYSTEM_DROP_CACHE) \
 M(SYSTEM_DROP_SCHEMA_CACHE, "SYSTEM DROP SCHEMA CACHE, DROP SCHEMA CACHE", GLOBAL, SYSTEM_DROP_CACHE) \
 M(SYSTEM_DROP_FORMAT_SCHEMA_CACHE, "SYSTEM DROP FORMAT SCHEMA CACHE, DROP FORMAT SCHEMA CACHE", GLOBAL, SYSTEM_DROP_CACHE) \
 M(SYSTEM_DROP_S3_CLIENT_CACHE, "SYSTEM DROP S3 CLIENT, DROP S3 CLIENT CACHE", GLOBAL, SYSTEM_DROP_CACHE) \

@@ -361,7 +361,7 @@ private:
 if (table_node.hasTableExpressionModifiers() && table_node.getTableExpressionModifiers()->hasFinal())
 return;
-auto column = first_argument_column_node.getColumn();
+const auto & column = first_argument_column_node.getColumn();
 auto table_name = table_node.getStorage()->getStorageID().getFullTableName();
 Identifier qualified_name({table_name, column.name});


@@ -264,7 +264,17 @@
 M(RefreshingViews, "Number of materialized views currently executing a refresh") \
 M(StorageBufferFlushThreads, "Number of threads for background flushes in StorageBuffer") \
 M(StorageBufferFlushThreadsActive, "Number of threads for background flushes in StorageBuffer running a task") \
-M(StorageBufferFlushThreadsScheduled, "Number of queued or active threads for background flushes in StorageBuffer")
+M(StorageBufferFlushThreadsScheduled, "Number of queued or active threads for background flushes in StorageBuffer") \
+M(SharedMergeTreeThreads, "Number of threads in the thread pools in internals of SharedMergeTree") \
+M(SharedMergeTreeThreadsActive, "Number of threads in the thread pools in internals of SharedMergeTree running a task") \
+M(SharedMergeTreeThreadsScheduled, "Number of queued or active threads in the thread pools in internals of SharedMergeTree") \
+M(SharedMergeTreeFetch, "Number of fetches in progress") \
+M(CacheWarmerBytesInProgress, "Total size of remote file segments waiting to be asynchronously loaded into filesystem cache.") \
+M(DistrCacheOpenedConnections, "Number of open connections to Distributed Cache") \
+M(DistrCacheUsedConnections, "Number of currently used connections to Distributed Cache") \
+M(DistrCacheReadRequests, "Number of executed Read requests to Distributed Cache") \
+M(DistrCacheWriteRequests, "Number of executed Write requests to Distributed Cache") \
+M(DistrCacheServerConnections, "Number of open connections to ClickHouse server from Distributed Cache")
 #ifdef APPLY_FOR_EXTERNAL_METRICS
 #define APPLY_FOR_METRICS(M) APPLY_FOR_BUILTIN_METRICS(M) APPLY_FOR_EXTERNAL_METRICS(M)
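Once registered, these counters surface through the `system.metrics` table; a minimal query sketch (the `metric`, `value` and `description` columns are standard):

```sql
-- Inspect the newly added metrics at runtime:
SELECT metric, value, description
FROM system.metrics
WHERE metric LIKE 'DistrCache%' OR metric LIKE 'SharedMergeTree%';
```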


@@ -379,7 +379,6 @@
 M(467, CANNOT_PARSE_BOOL) \
 M(468, CANNOT_PTHREAD_ATTR) \
 M(469, VIOLATED_CONSTRAINT) \
-M(470, QUERY_IS_NOT_SUPPORTED_IN_LIVE_VIEW) \
 M(471, INVALID_SETTING_VALUE) \
 M(472, READONLY_SETTING) \
 M(473, DEADLOCK_AVOIDED) \
@@ -585,6 +584,10 @@
 M(703, INVALID_IDENTIFIER) \
 M(704, QUERY_CACHE_USED_WITH_NONDETERMINISTIC_FUNCTIONS) \
 M(705, TABLE_NOT_EMPTY) \
+\
+M(900, DISTRIBUTED_CACHE_ERROR) \
+M(901, CANNOT_USE_DISTRIBUTED_CACHE) \
+\
 M(706, LIBSSH_ERROR) \
 M(707, GCP_ERROR) \
 M(708, ILLEGAL_STATISTIC) \


@@ -39,6 +39,14 @@ static struct InitFiu
 REGULAR(replicated_merge_tree_commit_zk_fail_when_recovering_from_hw_fault) \
 REGULAR(use_delayed_remote_source) \
 REGULAR(cluster_discovery_faults) \
+ONCE(smt_commit_merge_mutate_zk_fail_after_op) \
+ONCE(smt_commit_merge_mutate_zk_fail_before_op) \
+ONCE(smt_commit_write_zk_fail_after_op) \
+ONCE(smt_commit_write_zk_fail_before_op) \
+ONCE(smt_commit_merge_change_version_before_op) \
+ONCE(smt_merge_mutate_intention_freeze_in_destructor) \
+ONCE(meta_in_keeper_create_metadata_failure) \
+REGULAR(cache_warmer_stall) \
 REGULAR(check_table_query_delay_for_part) \
 REGULAR(dummy_failpoint) \
 REGULAR(prefetched_reader_pool_failpoint) \
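For context, a hedged sketch of how a registered failpoint is toggled from SQL in tests (assuming the `SYSTEM ENABLE FAILPOINT` / `SYSTEM DISABLE FAILPOINT` syntax of recent releases):

```sql
-- Activate and later deactivate one of the failpoints added above:
SYSTEM ENABLE FAILPOINT cache_warmer_stall;
SYSTEM DISABLE FAILPOINT cache_warmer_stall;
```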

src/Common/PageCache.cpp Normal file

@@ -0,0 +1,688 @@
#include "PageCache.h"
#include <unistd.h>
#include <sys/mman.h>
#include <Common/logger_useful.h>
#include <Common/formatReadable.h>
#include <Common/ProfileEvents.h>
#include <Common/SipHash.h>
#include <base/hex.h>
#include <base/errnoToString.h>
#include <base/getPageSize.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadHelpers.h>
namespace ProfileEvents
{
extern const Event PageCacheChunkMisses;
extern const Event PageCacheChunkShared;
extern const Event PageCacheChunkDataHits;
extern const Event PageCacheChunkDataPartialHits;
extern const Event PageCacheChunkDataMisses;
extern const Event PageCacheBytesUnpinnedRoundedToPages;
extern const Event PageCacheBytesUnpinnedRoundedToHugePages;
}
namespace DB
{
namespace ErrorCodes
{
extern const int SYSTEM_ERROR;
extern const int MEMORY_LIMIT_EXCEEDED;
extern const int CANNOT_ALLOCATE_MEMORY;
extern const int INVALID_SETTING_VALUE;
extern const int FILE_DOESNT_EXIST;
}
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunknown-warning-option"
#pragma clang diagnostic ignored "-Wreadability-make-member-function-const"
PinnedPageChunk::PinnedPageChunk(PinnedPageChunk && c) noexcept
: cache(std::exchange(c.cache, nullptr)), chunk(std::exchange(c.chunk, nullptr)) {}
PinnedPageChunk & PinnedPageChunk::operator=(PinnedPageChunk && c) noexcept
{
if (cache)
cache->removeRef(chunk);
cache = std::exchange(c.cache, nullptr);
chunk = std::exchange(c.chunk, nullptr);
return *this;
}
PinnedPageChunk::~PinnedPageChunk() noexcept
{
if (cache)
cache->removeRef(chunk);
}
PinnedPageChunk::PinnedPageChunk(PageCache * cache_, PageChunk * chunk_) noexcept : cache(cache_), chunk(chunk_) {}
const PageChunk * PinnedPageChunk::getChunk() const { return chunk; }
bool PinnedPageChunk::markPagePopulated(size_t page_idx)
{
bool r = chunk->pages_populated.set(page_idx);
return r;
}
void PinnedPageChunk::markPrefixPopulated(size_t bytes)
{
for (size_t i = 0; i < (bytes + chunk->page_size - 1) / chunk->page_size; ++i)
markPagePopulated(i);
}
bool PinnedPageChunk::isPrefixPopulated(size_t bytes) const
{
for (size_t i = 0; i < (bytes + chunk->page_size - 1) / chunk->page_size; ++i)
if (!chunk->pages_populated.get(i))
return false;
return true;
}
AtomicBitSet::AtomicBitSet() = default;
void AtomicBitSet::init(size_t nn)
{
n = nn;
v = std::make_unique<std::atomic<UInt8>[]>((n + 7) / 8);
}
bool AtomicBitSet::get(size_t i) const
{
return (v[i / 8] & (1 << (i % 8))) != 0;
}
bool AtomicBitSet::any() const
{
for (size_t i = 0; i < (n + 7) / 8; ++i)
if (v[i])
return true;
return false;
}
bool AtomicBitSet::set(size_t i) const
{
UInt8 prev = v[i / 8].fetch_or(1 << (i % 8));
return (prev & (1 << (i % 8))) == 0;
}
bool AtomicBitSet::set(size_t i, bool val) const
{
if (val)
return set(i);
else
return unset(i);
}
bool AtomicBitSet::unset(size_t i) const
{
UInt8 prev = v[i / 8].fetch_and(~(1 << (i % 8)));
return (prev & (1 << (i % 8))) != 0;
}
void AtomicBitSet::unsetAll() const
{
for (size_t i = 0; i < (n + 7) / 8; ++i)
v[i].store(0, std::memory_order_relaxed);
}
PageCache::PageCache(size_t bytes_per_chunk, size_t bytes_per_mmap, size_t bytes_total, bool use_madv_free_, bool use_huge_pages_)
: bytes_per_page(getPageSize())
, use_madv_free(use_madv_free_)
, use_huge_pages(use_huge_pages_)
, rng(randomSeed())
{
if (bytes_per_chunk == 0 || bytes_per_mmap == 0)
throw Exception(ErrorCodes::INVALID_SETTING_VALUE, "Userspace page cache chunk size and mmap size can't be zero.");
if (use_huge_pages)
{
use_huge_pages = false;
bool print_warning = false;
#ifdef OS_LINUX
try
{
ReadBufferFromFile in("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size");
size_t huge_page_size;
readIntText(huge_page_size, in);
if (huge_page_size == 0 || huge_page_size % bytes_per_page != 0)
throw Exception(ErrorCodes::SYSTEM_ERROR, "Invalid huge page size reported by the OS: {}", huge_page_size);
/// THP can be configured to be 2 MiB or 1 GiB in size. 1 GiB is way too big for us.
if (huge_page_size <= (16 << 20))
{
pages_per_big_page = huge_page_size / bytes_per_page;
use_huge_pages = true;
}
else
{
LOG_WARNING(&Poco::Logger::get("PageCache"), "The OS huge page size is too large for our purposes: {} bytes. Using regular pages. Userspace page cache will be relatively slow.", huge_page_size);
}
}
catch (Exception & e)
{
if (e.code() != ErrorCodes::FILE_DOESNT_EXIST)
throw;
print_warning = true;
}
#else
print_warning = true;
#endif
if (print_warning)
LOG_WARNING(&Poco::Logger::get("PageCache"), "The OS doesn't support transparent huge pages. Userspace page cache will be relatively slow.");
}
pages_per_chunk = ((bytes_per_chunk - 1) / (bytes_per_page * pages_per_big_page) + 1) * pages_per_big_page;
chunks_per_mmap_target = (bytes_per_mmap - 1) / (bytes_per_page * pages_per_chunk) + 1;
max_mmaps = (bytes_total - 1) / (bytes_per_page * pages_per_chunk * chunks_per_mmap_target) + 1;
}
PageCache::~PageCache()
{
chassert(getPinnedSize() == 0);
}
size_t PageCache::pageSize() const { return bytes_per_page; }
size_t PageCache::chunkSize() const { return bytes_per_page * pages_per_chunk; }
size_t PageCache::maxChunks() const { return chunks_per_mmap_target * max_mmaps; }
size_t PageCache::getPinnedSize() const
{
std::unique_lock lock(global_mutex);
return (total_chunks - lru.size()) * bytes_per_page * pages_per_chunk;
}
PageCache::MemoryStats PageCache::getResidentSetSize() const
{
MemoryStats stats;
#ifdef OS_LINUX
if (use_madv_free)
{
std::unordered_set<UInt64> cache_mmap_addrs;
for (const auto & m : mmaps)
cache_mmap_addrs.insert(reinterpret_cast<UInt64>(m.ptr));
ReadBufferFromFile in("/proc/self/smaps");
/// Parse the smaps contents, which is text consisting of entries like this:
///
/// 117ba4a00000-117be4a00000 rw-p 00000000 00:00 0
/// Size: 1048576 kB
/// KernelPageSize: 4 kB
/// MMUPageSize: 4 kB
/// Rss: 539516 kB
/// Pss: 539516 kB
/// ...
auto read_token = [&]
{
String res;
while (!in.eof())
{
char c = *in.position();
if (c == '\n' || c == '\t' || c == ' ' || c == '-')
break;
res += c;
++in.position();
}
return res;
};
auto skip_whitespace = [&]
{
while (!in.eof())
{
char c = *in.position();
if (c != ' ' && c != '\t')
break;
++in.position();
}
};
bool current_range_is_cache = false;
size_t total_rss = 0;
size_t total_lazy_free = 0;
while (!in.eof())
{
String s = read_token();
if (!in.eof() && *in.position() == '-')
{
if (s.size() < 16)
s.insert(0, 16 - s.size(), '0');
UInt64 addr = unhexUInt<UInt64>(s.c_str());
current_range_is_cache = cache_mmap_addrs.contains(addr);
}
else if (s == "Rss:" || s == "LazyFree:") /// read_token() doesn't break on ':', so tokens include the trailing colon.
{
skip_whitespace();
size_t val;
readIntText(val, in);
skip_whitespace();
String unit = read_token();
if (unit != "kB")
throw Exception(ErrorCodes::SYSTEM_ERROR, "Unexpected units in /proc/self/smaps: {}", unit);
size_t bytes = val * 1024;
if (s == "Rss:")
{
total_rss += bytes;
if (current_range_is_cache)
stats.page_cache_rss += bytes;
}
else
total_lazy_free += bytes;
}
skipToNextLineOrEOF(in);
}
stats.unreclaimable_rss = total_rss - std::min(total_lazy_free, total_rss);
return stats;
}
#endif
stats.page_cache_rss = bytes_per_page * pages_per_chunk * total_chunks;
return stats;
}
PinnedPageChunk PageCache::getOrSet(PageCacheKey key, bool detached_if_missing, bool inject_eviction)
{
PageChunk * chunk;
/// Make sure we increment exactly one of the counters about the fate of a chunk lookup.
bool incremented_profile_events = false;
{
std::unique_lock lock(global_mutex);
auto * it = chunk_by_key.find(key);
if (it == chunk_by_key.end())
{
chunk = getFreeChunk(lock);
chassert(!chunk->key.has_value());
if (!detached_if_missing)
{
chunk->key = key;
chunk_by_key.insert({key, chunk});
}
ProfileEvents::increment(ProfileEvents::PageCacheChunkMisses);
incremented_profile_events = true;
}
else
{
chunk = it->getMapped();
size_t prev_pin_count = chunk->pin_count.fetch_add(1);
if (prev_pin_count == 0)
{
/// Not eligible for LRU eviction while pinned.
chassert(chunk->is_linked());
lru.erase(lru.iterator_to(*chunk));
if (detached_if_missing)
{
/// Peek the first page to see if it's evicted.
/// (Why not use the full probing procedure instead, restoreChunkFromLimbo()?
/// Right here we can't do it because of how the two mutexes are organized.
/// And we want to do the check+detach before unlocking global_mutex, because
/// otherwise we may detach a chunk pinned by someone else, which may be unexpected
/// for that someone else. Or maybe the latter is fine, dropCache() already does it.)
if (chunk->pages_populated.get(0) && reinterpret_cast<volatile std::atomic<char>*>(chunk->data)->load(std::memory_order_relaxed) == 0)
evictChunk(chunk, lock);
}
if (inject_eviction && chunk->key.has_value() && rng() % 10 == 0)
{
/// Simulate eviction of the chunk or some of its pages.
if (rng() % 2 == 0)
evictChunk(chunk, lock);
else
for (size_t i = 0; i < 20; ++i)
chunk->pages_populated.unset(rng() % (chunk->size / chunk->page_size));
}
}
else
{
ProfileEvents::increment(ProfileEvents::PageCacheChunkShared);
incremented_profile_events = true;
}
}
}
{
std::unique_lock chunk_lock(chunk->chunk_mutex);
if (chunk->pages_state == PageChunkState::Limbo)
{
auto [pages_restored, pages_evicted] = restoreChunkFromLimbo(chunk, chunk_lock);
chunk->pages_state = PageChunkState::Stable;
if (!incremented_profile_events)
{
if (pages_evicted == 0)
ProfileEvents::increment(ProfileEvents::PageCacheChunkDataHits);
else if (pages_evicted < pages_restored)
ProfileEvents::increment(ProfileEvents::PageCacheChunkDataPartialHits);
else
ProfileEvents::increment(ProfileEvents::PageCacheChunkDataMisses);
}
}
}
return PinnedPageChunk(this, chunk);
}
void PageCache::removeRef(PageChunk * chunk) noexcept
{
/// Fast path if this is not the last reference.
size_t prev_pin_count = chunk->pin_count.load();
if (prev_pin_count > 1 && chunk->pin_count.compare_exchange_strong(prev_pin_count, prev_pin_count - 1))
return;
{
std::unique_lock lock(global_mutex);
prev_pin_count = chunk->pin_count.fetch_sub(1);
if (prev_pin_count > 1)
return;
chassert(!chunk->is_linked());
if (chunk->key.has_value())
lru.push_back(*chunk);
else
/// Unpinning detached chunk. We'd rather reuse it soon, so put it at the front.
lru.push_front(*chunk);
}
{
std::unique_lock chunk_lock(chunk->chunk_mutex);
/// Need to be extra careful here because we unlocked global_mutex above, so other
/// getOrSet()/removeRef() calls could have happened during this brief period.
if (use_madv_free && chunk->pages_state == PageChunkState::Stable && chunk->pin_count.load() == 0)
{
sendChunkToLimbo(chunk, chunk_lock);
chunk->pages_state = PageChunkState::Limbo;
}
}
}
static void logUnexpectedSyscallError(std::string name)
{
std::string message = fmt::format("{} failed: {}", name, errnoToString());
LOG_WARNING(&Poco::Logger::get("PageCache"), "{}", message);
#if defined(ABORT_ON_LOGICAL_ERROR)
volatile bool true_ = true;
if (true_) // suppress warning about missing [[noreturn]]
abortOnFailedAssertion(message);
#endif
}
void PageCache::sendChunkToLimbo(PageChunk * chunk [[maybe_unused]], std::unique_lock<std::mutex> & /* chunk_mutex */) const noexcept
{
#ifdef MADV_FREE // if we're not on a very old version of Linux
chassert(chunk->size == bytes_per_page * pages_per_chunk);
size_t populated_pages = 0;
size_t populated_big_pages = 0;
for (size_t big_page_idx = 0; big_page_idx < pages_per_chunk / pages_per_big_page; ++big_page_idx)
{
bool big_page_populated = false;
for (size_t sub_idx = 0; sub_idx < pages_per_big_page; ++sub_idx)
{
size_t idx = big_page_idx * pages_per_big_page + sub_idx;
if (!chunk->pages_populated.get(idx))
continue;
big_page_populated = true;
populated_pages += 1;
auto & byte = reinterpret_cast<volatile std::atomic<char> &>(chunk->data[idx * bytes_per_page]);
chunk->first_bit_of_each_page.set(idx, (byte.load(std::memory_order_relaxed) & 1) != 0);
byte.fetch_or(1, std::memory_order_relaxed);
}
if (big_page_populated)
populated_big_pages += 1;
}
int r = madvise(chunk->data, chunk->size, MADV_FREE);
if (r != 0)
logUnexpectedSyscallError("madvise(MADV_FREE)");
ProfileEvents::increment(ProfileEvents::PageCacheBytesUnpinnedRoundedToPages, bytes_per_page * populated_pages);
ProfileEvents::increment(ProfileEvents::PageCacheBytesUnpinnedRoundedToHugePages, bytes_per_page * pages_per_big_page * populated_big_pages);
#endif
}
std::pair<size_t, size_t> PageCache::restoreChunkFromLimbo(PageChunk * chunk, std::unique_lock<std::mutex> & /* chunk_mutex */) const noexcept
{
static_assert(sizeof(std::atomic<char>) == 1, "char is not atomic?");
// Make sure our strategic memory reads/writes are not reordered or optimized out.
auto * data = reinterpret_cast<volatile std::atomic<char> *>(chunk->data);
size_t pages_restored = 0;
size_t pages_evicted = 0;
for (size_t idx = 0; idx < chunk->size / bytes_per_page; ++idx)
{
if (!chunk->pages_populated.get(idx))
continue;
/// After MADV_FREE, it's guaranteed that:
/// * writing to the page makes it non-freeable again (reading doesn't),
/// * after the write, the page contents are either fully intact or fully zero-filled,
/// * even before the write, reads return either intact data (if the page wasn't freed) or zeroes (if it was, and the read page-faulted).
/// (And when doing the write there's no way to tell whether it page-faulted or not, AFAICT; that would make our life much easier!)
///
/// With that in mind, we do the following dance to bring the page back from the MADV_FREE limbo:
/// 0. [in advance] Before doing MADV_FREE, make sure the page's first byte is not zero.
/// We do it by setting the lowest bit of the first byte to 1, after saving the original value of that bit into a bitset.
/// 1. Read the second byte.
/// 2. Write the second byte back. This makes the page non-freeable.
/// 3. Read the first byte.
/// 3a. If it's zero, the page was freed.
/// Set the second byte to 0, to keep the buffer zero-filled if the page was freed
/// between steps 1 and 2.
/// 3b. If it's nonzero, the page is intact.
/// Restore the lowest bit of the first byte to the saved original value from the bitset.
char second_byte = data[idx * bytes_per_page + 1].load(std::memory_order_relaxed);
data[idx * bytes_per_page + 1].store(second_byte, std::memory_order_relaxed);
char first_byte = data[idx * bytes_per_page].load(std::memory_order_relaxed);
if (first_byte == 0)
{
pages_evicted += 1;
data[idx * bytes_per_page + 1].store(0, std::memory_order_relaxed);
chunk->pages_populated.unset(idx);
}
else
{
pages_restored += 1;
chassert(first_byte & 1);
if (!chunk->first_bit_of_each_page.get(idx))
data[idx * bytes_per_page].fetch_and(~1, std::memory_order_relaxed);
}
}
return {pages_restored, pages_evicted};
}
PageChunk * PageCache::getFreeChunk(std::unique_lock<std::mutex> & lock /* global_mutex */)
{
if (lru.empty() || (mmaps.size() < max_mmaps && lru.front().key.has_value()))
addMmap(lock);
if (lru.empty())
throw Exception(ErrorCodes::MEMORY_LIMIT_EXCEEDED, "All chunks in the entire page cache ({:.3} GiB) are pinned.",
bytes_per_page * pages_per_chunk * total_chunks * 1. / (1l << 30));
PageChunk * chunk = &lru.front();
lru.erase(lru.iterator_to(*chunk));
size_t prev_pin_count = chunk->pin_count.fetch_add(1);
chassert(prev_pin_count == 0);
evictChunk(chunk, lock);
return chunk;
}
void PageCache::evictChunk(PageChunk * chunk, std::unique_lock<std::mutex> & /* global_mutex */)
{
if (chunk->key.has_value())
{
size_t erased = chunk_by_key.erase(chunk->key.value());
chassert(erased);
chunk->key.reset();
}
chunk->state.reset();
/// This is tricky. We're not holding the chunk_mutex, so another thread might be running
/// sendChunkToLimbo() or even restoreChunkFromLimbo() on this chunk right now.
///
/// Nevertheless, it's correct and sufficient to clear pages_populated here because sendChunkToLimbo()
/// and restoreChunkFromLimbo() only touch pages_populated (only unsetting the bits),
/// first_bit_of_each_page, and the data; and we don't care about first_bit_of_each_page and the data.
///
/// This is precarious, but I don't have better ideas. Note that this clearing (or something else)
/// must be done before unlocking the global_mutex because otherwise another call to getOrSet() might
/// return this chunk before we clear it.
chunk->pages_populated.unsetAll();
}
void PageCache::addMmap(std::unique_lock<std::mutex> & /* global_mutex */)
{
/// ASLR by hand.
void * address_hint = reinterpret_cast<void *>(std::uniform_int_distribution<size_t>(0x100000000000UL, 0x700000000000UL)(rng));
mmaps.emplace_back(bytes_per_page, pages_per_chunk, pages_per_big_page, chunks_per_mmap_target, address_hint, use_huge_pages);
size_t num_chunks = mmaps.back().num_chunks;
total_chunks += num_chunks;
for (size_t i = 0; i < num_chunks; ++i)
/// Link in reverse order, so they get assigned in increasing order. Not important, just seems nice.
lru.push_front(mmaps.back().chunks[num_chunks - 1 - i]);
}
void PageCache::dropCache()
{
std::unique_lock lock(global_mutex);
/// Detach and free unpinned chunks.
bool logged_error = false;
for (PageChunk & chunk : lru)
{
evictChunk(&chunk, lock);
if (use_madv_free)
{
/// This might happen in parallel with sendChunkToLimbo() or restoreChunkFromLimbo(), but it's ok.
int r = madvise(chunk.data, chunk.size, MADV_DONTNEED);
if (r != 0 && !logged_error)
{
logUnexpectedSyscallError("madvise(MADV_DONTNEED)");
logged_error = true;
}
}
}
/// Detach pinned chunks.
for (auto [key, chunk] : chunk_by_key)
{
chassert(chunk->key == key);
chassert(chunk->pin_count > 0); // otherwise it would have been evicted above
chunk->key.reset();
}
chunk_by_key.clear();
}
PageCache::Mmap::Mmap(size_t bytes_per_page_, size_t pages_per_chunk_, size_t pages_per_big_page_, size_t num_chunks_, void * address_hint, bool use_huge_pages_)
{
num_chunks = num_chunks_;
size = bytes_per_page_ * pages_per_chunk_ * num_chunks;
size_t alignment = bytes_per_page_ * pages_per_big_page_;
address_hint = reinterpret_cast<void*>(reinterpret_cast<UInt64>(address_hint) / alignment * alignment);
auto temp_chunks = std::make_unique<PageChunk[]>(num_chunks);
int flags = MAP_PRIVATE | MAP_ANONYMOUS;
#ifdef OS_LINUX
flags |= MAP_NORESERVE;
#endif
ptr = mmap(address_hint, size, PROT_READ | PROT_WRITE, flags, -1, 0);
if (MAP_FAILED == ptr)
throw ErrnoException(ErrorCodes::CANNOT_ALLOCATE_MEMORY, fmt::format("Cannot mmap {}.", ReadableSize(size)));
if (reinterpret_cast<UInt64>(ptr) % bytes_per_page_ != 0)
{
munmap(ptr, size);
throw Exception(ErrorCodes::SYSTEM_ERROR, "mmap returned unaligned address: {}", ptr);
}
void * chunks_start = ptr;
#ifdef OS_LINUX
if (madvise(ptr, size, MADV_DONTDUMP) != 0)
logUnexpectedSyscallError("madvise(MADV_DONTDUMP)");
if (madvise(ptr, size, MADV_DONTFORK) != 0)
logUnexpectedSyscallError("madvise(MADV_DONTFORK)");
if (use_huge_pages_)
{
if (reinterpret_cast<UInt64>(ptr) % alignment != 0)
{
LOG_DEBUG(&Poco::Logger::get("PageCache"), "mmap() returned address not aligned on huge page boundary.");
chunks_start = reinterpret_cast<void*>((reinterpret_cast<UInt64>(ptr) / alignment + 1) * alignment);
chassert(reinterpret_cast<UInt64>(chunks_start) % alignment == 0);
num_chunks -= 1;
}
if (madvise(ptr, size, MADV_HUGEPAGE) != 0)
LOG_WARNING(&Poco::Logger::get("PageCache"),
"madvise(MADV_HUGEPAGE) failed: {}. Userspace page cache will be relatively slow.", errnoToString());
}
#else
(void)use_huge_pages_;
#endif
chunks = std::move(temp_chunks);
for (size_t i = 0; i < num_chunks; ++i)
{
PageChunk * chunk = &chunks[i];
chunk->data = reinterpret_cast<char *>(chunks_start) + bytes_per_page_ * pages_per_chunk_ * i;
chunk->size = bytes_per_page_ * pages_per_chunk_;
chunk->page_size = bytes_per_page_;
chunk->big_page_size = bytes_per_page_ * pages_per_big_page_;
chunk->pages_populated.init(pages_per_chunk_);
chunk->first_bit_of_each_page.init(pages_per_chunk_);
}
}
PageCache::Mmap::Mmap(Mmap && m) noexcept : ptr(std::exchange(m.ptr, nullptr)), size(std::exchange(m.size, 0)), chunks(std::move(m.chunks)), num_chunks(std::exchange(m.num_chunks, 0)) {}
PageCache::Mmap::~Mmap() noexcept
{
if (ptr && 0 != munmap(ptr, size))
logUnexpectedSyscallError("munmap");
}
void FileChunkState::reset() {}
PageCacheKey FileChunkAddress::hash() const
{
SipHash hash(offset);
hash.update(path.data(), path.size());
if (!file_version.empty())
{
hash.update("\0", 1);
hash.update(file_version.data(), file_version.size());
}
return hash.get128();
}
std::string FileChunkAddress::toString() const
{
return fmt::format("{}:{}{}{}", path, offset, file_version.empty() ? "" : ":", file_version);
}
#pragma clang diagnostic pop
}

src/Common/PageCache.h Normal file

@@ -0,0 +1,299 @@
#pragma once
#include <boost/intrusive/list.hpp>
#include <pcg_random.hpp>
#include <Common/randomSeed.h>
#include <Core/Types.h>
#include <Common/HashTable/HashMap.h>
/// "Userspace page cache"
/// A cache for contents of remote files.
/// Uses MADV_FREE to allow Linux to evict pages from our cache under memory pressure.
/// Typically takes up almost all of the available memory, similar to the actual page cache.
///
/// Intended for caching data retrieved from distributed cache, but can be used for other things too,
/// just replace FileChunkState with a discriminated union, or something, if needed.
///
/// There are two fixed-size units of caching here:
/// * OS pages, typically 4 KiB each.
/// * Page chunks, 2 MiB each (configurable with the page_cache_chunk_size setting).
///
/// Each file is logically split into aligned 2 MiB blocks, which are mapped to page chunks inside the cache.
/// They are cached independently from each other.
///
/// Each page chunk has a contiguous 2 MiB buffer that can be pinned and directly used e.g. by ReadBuffers.
/// While pinned (by at least one PinnedPageChunk), the pages are not reclaimable by the OS.
///
/// Inside each page chunk, any subset of pages may be populated. Unpopulated pages may or not be
/// mapped to any physical RAM. We maintain a bitmask that keeps track of which pages are populated.
/// Pages become unpopulated if they're reclaimed by the OS (when the page chunk is not pinned),
/// or if we just never populate them in the first place (e.g. if a file is shorter than 2 MiB we
/// still create a 2 MiB page chunk, but use only a prefix of it).
///
/// There are two separate eviction mechanisms at play:
/// * LRU eviction of page chunks in PageCache.
/// * OS reclaiming pages on memory pressure. We have no control over the eviction policy.
/// It probably picks the pages in the same order in which they were marked with MADV_FREE, so
/// effectively in the same LRU order as our policy in PageCache.
/// When using PageCache in oversubscribed fashion, using all available memory and relying on OS eviction,
/// the PageCache's eviction policy mostly doesn't matter. It just needs to be similar enough to the OS's
/// policy that we rarely evict chunks with unevicted pages.
///
/// We mmap memory directly instead of using allocator because this enables:
/// * knowing how much RAM the cache is using, via /proc/self/smaps,
/// * MADV_HUGEPAGE (use transparent huge pages - this makes MADV_FREE 10x less slow),
/// * MAP_NORESERVE (don't reserve swap space - otherwise large mmaps usually fail),
/// * MADV_DONTDUMP (don't include in core dumps),
/// * page-aligned addresses without padding.
///
/// madvise(MADV_FREE) call is slow: ~6 GiB/s (doesn't scale with more threads). Enabling transparent
/// huge pages (MADV_HUGEPAGE) makes it 10x less slow, so we do that. That makes the physical RAM allocation
/// work at 2 MiB granularity instead of 4 KiB, so the cache becomes less suitable for small files.
/// If this turns out to be a problem, we may consider allowing different mmaps to have different flags,
/// some having no huge pages.
/// Note that we do our bookkeeping at small-page granularity even if huge pages are enabled.
///
/// It's unfortunate that Linux's MADV_FREE eviction doesn't use the two-list strategy like the real
/// page cache (IIUC, MADV_FREE puts the pages at the head of the inactive list, and they can never
/// get to the active list).
/// If this turns out to be a problem, we could make PageCache do chunk eviction based on observed
/// system memory usage, so that most eviction is done by us, and the MADV_FREE eviction kicks in
/// only as a last resort. Then we can make PageCache's eviction policy arbitrarily more sophisticated.
namespace DB
{
/// Hash of FileChunkAddress.
using PageCacheKey = UInt128;
/// Identifies a chunk of a file or object.
/// We assume that contents of such file/object don't change (without file_version changing), so
/// no cache invalidation is needed.
struct FileChunkAddress
{
/// Path, usually prefixed with storage system name and anything else needed to make it unique.
/// E.g. "s3:<bucket>/<path>"
std::string path;
/// Optional string with ETag, or file modification time, or anything else.
std::string file_version;
size_t offset = 0;
PageCacheKey hash() const;
std::string toString() const;
};
struct AtomicBitSet
{
size_t n = 0;
std::unique_ptr<std::atomic<UInt8>[]> v;
AtomicBitSet();
void init(size_t n);
bool get(size_t i) const;
bool any() const;
/// These return true if the bit was changed, false if it already had the target value.
/// (These methods are logically not const, but clang insists that I make them const, and
/// '#pragma clang diagnostic ignored' doesn't seem to work.)
bool set(size_t i) const;
bool set(size_t i, bool val) const;
bool unset(size_t i) const;
void unsetAll() const;
};
enum class PageChunkState
{
/// Pages are not reclaimable by the OS, the buffer has correct contents.
Stable,
/// Pages are reclaimable by the OS, the buffer contents are altered (first bit of each page set to 1).
Limbo,
};
/// (This is a separate struct just in case we want to use this cache for other things in future.
/// Then this struct would be the customization point, while the rest of PageChunk can stay unchanged.)
struct FileChunkState
{
std::mutex download_mutex;
void reset();
};
using PageChunkLRUListHook = boost::intrusive::list_base_hook<>;
/// Cache entry.
struct PageChunk : public PageChunkLRUListHook
{
char * data;
size_t size; // in bytes
/// Page size for use in pages_populated and first_bit_of_each_page. Same as PageCache::pageSize().
size_t page_size;
/// Actual eviction granularity. Just for information. If huge pages are used, huge page size, otherwise page_size.
size_t big_page_size;
mutable FileChunkState state;
AtomicBitSet pages_populated;
private:
friend class PinnedPageChunk;
friend class PageCache;
/// If nullopt, the chunk is "detached", i.e. not associated with any key.
/// Detached chunks may still be pinned. Chunk may get detached even while pinned, in particular when dropping cache.
/// Protected by global_mutex.
std::optional<PageCacheKey> key;
/// Refcount for usage of this chunk. When zero, the pages are reclaimable by the OS, and
/// the PageChunk itself is evictable (linked into PageCache::lru).
std::atomic<size_t> pin_count {0};
/// Bit mask containing the first bit of data from each page. Needed for the weird probing procedure when un-MADV_FREE-ing the pages.
AtomicBitSet first_bit_of_each_page;
/// Locked when changing pages_state, along with the corresponding expensive MADV_FREE/un-MADV_FREE operation.
mutable std::mutex chunk_mutex;
/// Normally pin_count == 0 <=> state == PageChunkState::Limbo,
/// pin_count > 0 <=> state == PageChunkState::Stable.
/// This separate field is needed because of synchronization: pin_count is changed with global_mutex locked,
/// this field is changed with chunk_mutex locked, and we never have to lock both mutexes at once.
PageChunkState pages_state = PageChunkState::Stable;
};
class PageCache;
/// Handle for a cache entry. Neither the entry nor its pages can get evicted while there's at least one PinnedPageChunk pointing to it.
class PinnedPageChunk
{
public:
const PageChunk * getChunk() const;
/// Sets the bit in pages_populated. Returns true if it actually changed (i.e. was previously 0).
bool markPagePopulated(size_t page_idx);
/// Calls markPagePopulated() for pages 0..ceil(bytes/page_size).
void markPrefixPopulated(size_t bytes);
bool isPrefixPopulated(size_t bytes) const;
PinnedPageChunk() = default;
~PinnedPageChunk() noexcept;
PinnedPageChunk(PinnedPageChunk &&) noexcept;
PinnedPageChunk & operator=(PinnedPageChunk &&) noexcept;
private:
friend class PageCache;
PageCache * cache = nullptr;
PageChunk * chunk = nullptr;
PinnedPageChunk(PageCache * cache_, PageChunk * chunk_) noexcept;
};
class PageCache
{
public:
PageCache(size_t bytes_per_chunk, size_t bytes_per_mmap, size_t bytes_total, bool use_madv_free, bool use_huge_pages);
~PageCache();
/// Get or insert a chunk for the given key.
///
/// If detached_if_missing = true, and the key is not present in the cache, the returned chunk
/// won't be associated with the key and will be evicted as soon as it's unpinned.
/// It's like "get if exists, otherwise return null", but instead of null we return a usable
/// temporary buffer, for convenience. Pinning and page eviction make the story more complicated:
/// * If the chunk for this key is pinned, we return it even if it's not fully populated
/// (because PageCache doesn't know what "fully populated" means).
/// * If the chunk exists, but some of its pages were evicted, we detach it. (Currently we only
/// check the first page here.)
PinnedPageChunk getOrSet(PageCacheKey key, bool detached_if_missing, bool inject_eviction);
/// OS page size, e.g. 4 KiB on x86, 4 KiB or 64 KiB on aarch64.
///
/// If transparent huge pages are enabled, this is still the regular page size, and all our bookkeeping
/// is still based on regular page size (e.g. pages_populated), because (a) it's cheap anyway,
/// and (b) I'm not sure if Linux guarantees that MADV_FREE reclamation always happens at huge page
/// granularity, and wouldn't want to rely on this even if it does.
size_t pageSize() const;
size_t chunkSize() const;
size_t maxChunks() const;
struct MemoryStats
{
/// How many bytes of actual RAM are used for the cache pages. Doesn't include metadata
/// and overhead (e.g. PageChunk structs).
size_t page_cache_rss = 0;
/// Resident set size for the whole process, excluding any MADV_FREE pages (PageCache's or not).
/// This can be used as a more useful memory usage number for clickhouse server, instead of RSS.
/// Populated only if MADV_FREE is used, otherwise nullopt.
std::optional<size_t> unreclaimable_rss;
};
/// Reads /proc/self/smaps, so not very fast.
MemoryStats getResidentSetSize() const;
/// Total length of memory ranges currently pinned by PinnedPageChunk-s, including unpopulated pages.
size_t getPinnedSize() const;
/// Clears the key -> chunk mapping. Frees memory (MADV_DONTNEED) of all chunks that are not pinned.
/// Doesn't unmap any virtual memory. Detaches but doesn't free the pinned chunks.
/// Locks the global mutex for the duration of the operation, which may block queries for hundreds of milliseconds.
void dropCache();
private:
friend class PinnedPageChunk;
struct Mmap
{
void * ptr = nullptr;
size_t size = 0;
std::unique_ptr<PageChunk[]> chunks;
size_t num_chunks = 0; // might be smaller than chunks_per_mmap_target because of alignment
Mmap(Mmap &&) noexcept;
Mmap(size_t bytes_per_page, size_t pages_per_chunk, size_t pages_per_big_page, size_t num_chunks, void * address_hint, bool use_huge_pages_);
~Mmap() noexcept;
};
size_t bytes_per_page;
size_t pages_per_chunk;
size_t chunks_per_mmap_target;
size_t max_mmaps;
size_t pages_per_big_page = 1; // if huge pages are used, huge_page_size/page_size, otherwise 1
bool use_madv_free = true;
bool use_huge_pages = true;
mutable std::mutex global_mutex;
pcg64 rng;
std::vector<Mmap> mmaps;
size_t total_chunks = 0;
/// All non-pinned chunks, including ones not assigned to any file. Least recently used is begin().
boost::intrusive::list<PageChunk, boost::intrusive::base_hook<PageChunkLRUListHook>, boost::intrusive::constant_time_size<true>> lru;
HashMap<PageCacheKey, PageChunk *> chunk_by_key;
/// Get a usable chunk, doing eviction or allocation if needed.
/// Caller is responsible for clearing pages_populated.
PageChunk * getFreeChunk(std::unique_lock<std::mutex> & /* global_mutex */);
void addMmap(std::unique_lock<std::mutex> & /* global_mutex */);
void evictChunk(PageChunk * chunk, std::unique_lock<std::mutex> & /* global_mutex */);
void removeRef(PageChunk * chunk) noexcept;
/// These may run in parallel with getFreeChunk(), so be very careful about which fields of the PageChunk we touch here.
void sendChunkToLimbo(PageChunk * chunk, std::unique_lock<std::mutex> & /* chunk_mutex */) const noexcept;
/// Returns {pages_restored, pages_evicted}.
std::pair<size_t, size_t> restoreChunkFromLimbo(PageChunk * chunk, std::unique_lock<std::mutex> & /* chunk_mutex */) const noexcept;
};
using PageCachePtr = std::shared_ptr<PageCache>;
}
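[Editor's note: a hedged usage sketch, not part of the header above, assuming the classes live in namespace DB. It shows the intended flow: pin a chunk, fill it under the per-chunk download mutex, mark the prefix populated, and let the pin lapse with the handle. readChunkThroughCache() and downloadFromStorage() are invented names; <mutex> is assumed included.]

void downloadFromStorage(char * dest, size_t bytes); /// assumed helper, reads from S3/disk

void readChunkThroughCache(DB::PageCache & cache, DB::PageCacheKey key, size_t bytes_needed)
{
    /// Pin the chunk: neither the entry nor its pages can be evicted while pinned.
    DB::PinnedPageChunk chunk = cache.getOrSet(key, /*detached_if_missing=*/ false, /*inject_eviction=*/ false);

    if (!chunk.isPrefixPopulated(bytes_needed))
    {
        /// Serialize downloads of the same chunk across threads.
        std::lock_guard lock(chunk.getChunk()->state.download_mutex);
        if (!chunk.isPrefixPopulated(bytes_needed))
        {
            downloadFromStorage(chunk.getChunk()->data, bytes_needed);
            chunk.markPrefixPopulated(bytes_needed);
        }
    }
    /// Read from chunk.getChunk()->data here; the pin is released by ~PinnedPageChunk().
}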

View File

@ -63,6 +63,15 @@
M(MarkCacheMisses, "Number of times an entry has not been found in the mark cache, so we had to load a mark file in memory, which is a costly operation, adding to query latency.") \
M(QueryCacheHits, "Number of times a query result has been found in the query cache (and query computation was avoided). Only updated for SELECT queries with SETTING use_query_cache = 1.") \
M(QueryCacheMisses, "Number of times a query result has not been found in the query cache (and required query computation). Only updated for SELECT queries with SETTING use_query_cache = 1.") \
/* Each page cache chunk access increments exactly one of the following 5 PageCacheChunk* counters. */ \
/* Something like hit rate: (PageCacheChunkShared + PageCacheChunkDataHits) / [sum of all 5]. */ \
M(PageCacheChunkMisses, "Number of times a chunk has not been found in the userspace page cache.") \
M(PageCacheChunkShared, "Number of times a chunk has been found in the userspace page cache, already in use by another thread.") \
M(PageCacheChunkDataHits, "Number of times a chunk has been found in the userspace page cache, not in use, with all pages intact.") \
M(PageCacheChunkDataPartialHits, "Number of times a chunk has been found in the userspace page cache, not in use, but some of its pages were evicted by the OS.") \
M(PageCacheChunkDataMisses, "Number of times a chunk has been found in the userspace page cache, not in use, but all its pages were evicted by the OS.") \
M(PageCacheBytesUnpinnedRoundedToPages, "Total size of populated pages in chunks that became evictable in PageCache. Rounded up to whole pages.") \
M(PageCacheBytesUnpinnedRoundedToHugePages, "See PageCacheBytesUnpinnedRoundedToPages, but rounded to huge pages. Use the ratio between the two as a measure of memory waste from using huge pages.") \
M(CreatedReadBufferOrdinary, "Number of times ordinary read buffer was created for reading data (while choosing among other read methods).") \
M(CreatedReadBufferDirectIO, "Number of times a read buffer with O_DIRECT was created for reading data (while choosing among other read methods).") \
M(CreatedReadBufferDirectIOFailed, "Number of times a read buffer with O_DIRECT was attempted to be created for reading data (while choosing among other read methods), but the OS did not allow it (due to lack of filesystem support or other reasons) and we fell back to the ordinary reading method.") \
@ -92,6 +101,8 @@
M(LocalWriteThrottlerBytes, "Bytes passed through 'max_local_write_bandwidth_for_server'/'max_local_write_bandwidth' throttler.") \
M(LocalWriteThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_local_write_bandwidth_for_server'/'max_local_write_bandwidth' throttling.") \
M(ThrottlerSleepMicroseconds, "Total time a query was sleeping to conform all throttling settings.") \
M(PartsWithAppliedMutationsOnFly, "Total number of parts for which there was any mutation applied on fly") \
M(MutationsAppliedOnFlyInAllParts, "The sum of number of applied mutations on-fly for part among all read parts") \
\
M(QueryMaskingRulesMatch, "Number of times query masking rules were successfully matched.") \
\
@ -311,6 +322,12 @@ The server successfully detected this situation and will download merged part fr
M(ParallelReplicasProcessingPartsMicroseconds, "Time spent processing data parts") \
M(ParallelReplicasStealingLeftoversMicroseconds, "Time spent collecting orphaned segments") \
M(ParallelReplicasCollectingOwnedSegmentsMicroseconds, "Time spent collecting segments meant by hash") \
M(ParallelReplicasNumRequests, "Number of requests to the initiator.") \
M(ParallelReplicasDeniedRequests, "Number of completely denied requests to the initiator") \
M(CacheWarmerBytesDownloaded, "Amount of data fetched into filesystem cache by dedicated background threads.") \
M(CacheWarmerDataPartsDownloaded, "Number of data parts that were fully fetched by CacheWarmer.") \
M(IgnoredColdParts, "See setting ignore_cold_parts_seconds. Number of times read queries ignored very new parts that weren't pulled into cache by CacheWarmer yet.") \
M(PreferredWarmedUnmergedParts, "See setting prefer_warmed_unmerged_parts_seconds. Number of times read queries used outdated pre-merge parts that are in cache instead of merged part that wasn't pulled into cache by CacheWarmer yet.") \
\
M(PerfCPUCycles, "Total cycles. Be wary of what happens during CPU frequency scaling.") \
M(PerfInstructions, "Retired instructions. Be careful, these can be affected by various issues, most notably hardware interrupt counts.") \
@ -516,6 +533,21 @@ The server successfully detected this situation and will download merged part fr
M(AggregationPreallocatedElementsInHashTables, "How many elements were preallocated in hash tables for aggregation.") \
M(AggregationHashTablesInitializedAsTwoLevel, "How many hash tables were inited as two-level for aggregation.") \
\
M(MetadataFromKeeperCacheHit, "Number of times an object storage metadata request was answered from cache without making request to Keeper") \
M(MetadataFromKeeperCacheMiss, "Number of times an object storage metadata request had to be answered from Keeper") \
M(MetadataFromKeeperCacheUpdateMicroseconds, "Total time spent in updating the cache including waiting for responses from Keeper") \
M(MetadataFromKeeperUpdateCacheOneLevel, "Number of times a cache update for one level of directory tree was done") \
M(MetadataFromKeeperTransactionCommit, "Number of times metadata transaction commit was attempted") \
M(MetadataFromKeeperTransactionCommitRetry, "Number of times metadata transaction commit was retried") \
M(MetadataFromKeeperCleanupTransactionCommit, "Number of times metadata transaction commit for deleted objects cleanup was attempted") \
M(MetadataFromKeeperCleanupTransactionCommitRetry, "Number of times metadata transaction commit for deleted objects cleanup was retried") \
M(MetadataFromKeeperOperations, "Number of times a request was made to Keeper") \
M(MetadataFromKeeperIndividualOperations, "Number of paths read or written by single or multi requests to Keeper") \
M(MetadataFromKeeperReconnects, "Number of times a reconnect to Keeper was done") \
M(MetadataFromKeeperBackgroundCleanupObjects, "Number of times an old deleted object cleanup was performed by a background task") \
M(MetadataFromKeeperBackgroundCleanupTransactions, "Number of times an old transaction idempotency token was cleaned up by a background task") \
M(MetadataFromKeeperBackgroundCleanupErrors, "Number of times an error was encountered in a background cleanup task") \
\
M(KafkaRebalanceRevocations, "Number of partition revocations (the first stage of consumer group rebalance)") \
M(KafkaRebalanceAssignments, "Number of partition assignments (the final stage of consumer group rebalance)") \
M(KafkaRebalanceErrors, "Number of failed consumer group rebalances") \
@ -607,9 +639,32 @@ The server successfully detected this situation and will download merged part fr
M(MergeTreeAllRangesAnnouncementsSentElapsedMicroseconds, "Time spent in sending the announcement from the remote server to the initiator server about the set of data parts (for MergeTree tables). Measured on the remote server side.") \
\
M(ConnectionPoolIsFullMicroseconds, "Total time spent waiting for a slot in connection pool.") \
\
M(AsyncLoaderWaitMicroseconds, "Total time a query was waiting for async loader jobs.") \
\
M(DistrCacheServerSwitches, "Number of server switches between distributed cache servers in read/write-through cache") \
M(DistrCacheReadMicroseconds, "Time spent reading from distributed cache") \
M(DistrCacheFallbackReadMicroseconds, "Time spent reading from the fallback buffer instead of distributed cache") \
M(DistrCachePrecomputeRangesMicroseconds, "Time spent to precompute read ranges") \
M(DistrCacheNextImplMicroseconds, "Time spent in ReadBufferFromDistributedCache::nextImpl") \
M(DistrCacheOpenedConnections, "The number of open connections to distributed cache") \
M(DistrCacheReusedConnections, "The number of reused connections to distributed cache") \
M(DistrCacheHoldConnections, "The number of used connections to distributed cache") \
\
M(DistrCacheGetResponseMicroseconds, "Time spent waiting for a response from distributed cache") \
M(DistrCacheStartRangeMicroseconds, "Time spent starting a new read range with distributed cache") \
M(DistrCacheLockRegistryMicroseconds, "Time spent taking the DistributedCacheRegistry lock") \
M(DistrCacheUnusedPackets, "Number of skipped unused packets from distributed cache") \
M(DistrCachePackets, "Total number of packets received from distributed cache") \
M(DistrCacheUnusedPacketsBytes, "The number of bytes in Data packets which were ignored") \
M(DistrCacheRegistryUpdateMicroseconds, "Time spent updating distributed cache registry") \
M(DistrCacheRegistryUpdates, "Number of distributed cache registry updates") \
\
M(DistrCacheConnectMicroseconds, "The time spent to connect to distributed cache") \
M(DistrCacheConnectAttempts, "The number of connection attempts to distributed cache") \
M(DistrCacheGetClient, "Number of times the distributed cache client was accessed") \
\
M(DistrCacheServerProcessRequestMicroseconds, "Time spent processing request on DistributedCache server side") \
\
M(LogTest, "Number of log messages with level Test") \
M(LogTrace, "Number of log messages with level Trace") \
M(LogDebug, "Number of log messages with level Debug") \

View File

@ -196,8 +196,9 @@ bool ThreadStatus::isQueryCanceled() const
if (!thread_group)
    return false;

chassert(local_data.query_is_canceled_predicate);
if (local_data.query_is_canceled_predicate)
    return local_data.query_is_canceled_predicate();
return false;
}

ThreadStatus::~ThreadStatus()

View File

@ -8,6 +8,7 @@
#include <vector>
#include <memory>
#include <cstdint>
#include <span>
#include <functional>

/** Generic interface for ZooKeeper-like services.
@ -622,6 +623,10 @@ public:
int32_t version,
ReconfigCallback callback) = 0;
virtual void multi(
std::span<const RequestPtr> requests,
MultiCallback callback) = 0;
virtual void multi(
    const Requests & requests,
    MultiCallback callback) = 0;

View File

@ -157,6 +157,10 @@ struct TestKeeperReconfigRequest final : ReconfigRequest, TestKeeperRequest
struct TestKeeperMultiRequest final : MultiRequest, TestKeeperRequest
{
    explicit TestKeeperMultiRequest(const Requests & generic_requests)
: TestKeeperMultiRequest(std::span(generic_requests))
{}
explicit TestKeeperMultiRequest(std::span<const RequestPtr> generic_requests)
    {
        requests.reserve(generic_requests.size());
@ -883,6 +887,13 @@ void TestKeeper::reconfig(
void TestKeeper::multi(
    const Requests & requests,
    MultiCallback callback)
{
return multi(std::span(requests), std::move(callback));
}
void TestKeeper::multi(
std::span<const RequestPtr> requests,
MultiCallback callback)
{
    TestKeeperMultiRequest request(requests);

View File

@ -101,6 +101,10 @@ public:
const Requests & requests,
MultiCallback callback) override;
void multi(
std::span<const RequestPtr> requests,
MultiCallback callback) override;
void finalize(const String & reason) override;

bool isFeatureEnabled(DB::KeeperFeatureFlag) const override

View File

@ -1266,6 +1266,11 @@ std::future<Coordination::RemoveResponse> ZooKeeper::asyncTryRemoveNoThrow(const
}

std::future<Coordination::MultiResponse> ZooKeeper::asyncTryMultiNoThrow(const Coordination::Requests & ops)
{
return asyncTryMultiNoThrow(std::span(ops));
}
std::future<Coordination::MultiResponse> ZooKeeper::asyncTryMultiNoThrow(std::span<const Coordination::RequestPtr> ops)
{
    auto promise = std::make_shared<std::promise<Coordination::MultiResponse>>();
    auto future = promise->get_future();

View File

@ -550,6 +550,7 @@ public:
FutureMulti asyncMulti(const Coordination::Requests & ops);
/// Like the previous one but don't throw any exceptions on future.get()
FutureMulti asyncTryMultiNoThrow(const Coordination::Requests & ops);
FutureMulti asyncTryMultiNoThrow(std::span<const Coordination::RequestPtr> ops);
using FutureSync = std::future<Coordination::SyncResponse>;
FutureSync asyncSync(const std::string & path);

View File

@ -156,6 +156,12 @@ std::string ZooKeeperAuthRequest::toStringImpl() const
void ZooKeeperCreateRequest::writeImpl(WriteBuffer & out) const
{
/// See https://github.com/ClickHouse/clickhouse-private/issues/3029
if (path.starts_with("/clickhouse/tables/") && path.find("/parts/") != std::string::npos)
{
LOG_TRACE(getLogger(__PRETTY_FUNCTION__), "Creating part at path {}", path);
}
    Coordination::write(path, out);
    Coordination::write(data, out);
    Coordination::write(acls, out);
@ -480,6 +486,10 @@ OpNum ZooKeeperMultiRequest::getOpNum() const
}

ZooKeeperMultiRequest::ZooKeeperMultiRequest(const Requests & generic_requests, const ACLs & default_acls)
: ZooKeeperMultiRequest(std::span{generic_requests}, default_acls)
{}
ZooKeeperMultiRequest::ZooKeeperMultiRequest(std::span<const Coordination::RequestPtr> generic_requests, const ACLs & default_acls)
{
    /// Convert nested Requests to ZooKeeperRequests.
    /// Note that deep copy is required to avoid modifying path in presence of chroot prefix.

View File

@ -7,17 +7,13 @@
#include <boost/noncopyable.hpp>
#include <IO/ReadBuffer.h>
#include <IO/WriteBuffer.h>
#include <map>
#include <unordered_map>
#include <mutex>
#include <chrono>
#include <vector>
#include <memory>
#include <thread>
#include <atomic>
#include <cstdint>
#include <optional>
#include <functional>
#include <span>
namespace Coordination
@ -516,6 +512,7 @@ struct ZooKeeperMultiRequest final : MultiRequest, ZooKeeperRequest
ZooKeeperMultiRequest() = default;
ZooKeeperMultiRequest(const Requests & generic_requests, const ACLs & default_acls);
ZooKeeperMultiRequest(std::span<const Coordination::RequestPtr> generic_requests, const ACLs & default_acls);
void writeImpl(WriteBuffer & out) const override;
void readImpl(ReadBuffer & in) override;

View File

@ -1454,6 +1454,13 @@ void ZooKeeper::reconfig(
void ZooKeeper::multi(
    const Requests & requests,
    MultiCallback callback)
{
multi(std::span(requests), std::move(callback));
}
void ZooKeeper::multi(
std::span<const RequestPtr> requests,
MultiCallback callback)
{
    ZooKeeperMultiRequest request(requests, default_acls);

View File

@ -194,6 +194,10 @@ public:
int32_t version,
ReconfigCallback callback) final;
void multi(
std::span<const RequestPtr> requests,
MultiCallback callback) override;
void multi(
    const Requests & requests,
    MultiCallback callback) override;

View File

@ -147,6 +147,11 @@ public:
    user_error = UserError{};
}
void setKeeperError(const zkutil::KeeperException & exception)
{
setKeeperError(std::make_exception_ptr(exception), exception.code, exception.message());
}
void stopRetries() { stop_retries = true; }

bool isLastRetry() const { return total_failures >= retries_info.max_retries; }
@ -180,6 +185,12 @@ private:
bool canTry()
{
if (unconditional_retry)
{
unconditional_retry = false;
return true;
}
    if (iteration_succeeded)
    {
        if (logger && total_failures > 0)
@ -275,6 +286,10 @@ private:
UInt64 current_iteration = 0;
UInt64 current_backoff_ms = 0;
public:
/// This is used in SharedMergeTree
bool unconditional_retry = false;
};

}
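[Editor's note: a hedged sketch, not from this diff, of how unconditional_retry and the new setKeeperError(KeeperException) overload fit into a retry loop. ZooKeeperRetriesControl::retryLoop exists in ClickHouse, but the constructor arguments and surrounding names here are approximations.]

ZooKeeperRetriesControl retries_ctl("commitPart", log, retries_info, query_status);
/// Let the first canTry() pass even if a previous control instance already recorded failures.
retries_ctl.unconditional_retry = true;
retries_ctl.retryLoop([&]
{
    auto zookeeper = get_zookeeper(); /// invented helper
    /// On KeeperException the control records the error (via setKeeperError) and may retry.
    zookeeper->multi(ops);
});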

View File

@ -5,6 +5,8 @@
#include <syscall.h>
#include <unistd.h>
#include <linux/capability.h>
#include <cstdint>
#include <base/types.h>
#include <Common/Exception.h>
@ -16,25 +18,48 @@ namespace ErrorCodes
extern const int NETLINK_ERROR;
}

static __user_cap_data_struct getCapabilities()
struct Capabilities
{
UInt64 effective;
UInt64 permitted;
UInt64 inheritable;
};
static Capabilities getCapabilities()
{
    /// See man getcap.
    __user_cap_header_struct request{};
    request.version = _LINUX_CAPABILITY_VERSION_1; /// It's enough to check just single CAP_NET_ADMIN capability we are interested.
    request.version = _LINUX_CAPABILITY_VERSION_3;
    request.pid = getpid();

    __user_cap_data_struct response{};
    Capabilities ret{};
    __user_cap_data_struct response[2] = {};

    /// Avoid dependency on 'libcap'.
    if (0 != syscall(SYS_capget, &request, &response))
    if (0 == syscall(SYS_capget, &request, response))
        throw ErrnoException(ErrorCodes::NETLINK_ERROR, "Cannot do 'capget' syscall");
    {
ret.effective = static_cast<UInt64>(response[1].effective) << 32 | response[0].effective;
ret.permitted = static_cast<UInt64>(response[1].permitted) << 32 | response[0].permitted;
ret.inheritable = static_cast<UInt64>(response[1].inheritable) << 32 | response[0].inheritable;
return ret;
}
    return response;
    /// Does not support V3, fall back to V1.
/// It's enough to check just single CAP_NET_ADMIN capability we are interested.
if (errno == EINVAL && 0 == syscall(SYS_capget, &request, response))
{
ret.effective = response[0].effective;
ret.permitted = response[0].permitted;
ret.inheritable = response[0].inheritable;
return ret;
}
throw ErrnoException(ErrorCodes::NETLINK_ERROR, "Cannot do 'capget' syscall");
}

bool hasLinuxCapability(int cap)
{
    static __user_cap_data_struct capabilities = getCapabilities();
    static Capabilities capabilities = getCapabilities();
    return (1 << cap) & capabilities.effective;
}
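[Editor's note: a hedged usage example, not from this diff — in ClickHouse this helper typically guards netlink taskstats usage, which requires CAP_NET_ADMIN:]

#include <linux/capability.h>

bool canUseTaskStats()
{
    /// Without CAP_NET_ADMIN the netlink taskstats socket cannot be opened,
    /// so callers fall back to a code path without per-thread OS metrics.
    return hasLinuxCapability(CAP_NET_ADMIN);
}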

View File

@ -1,14 +1,9 @@
#include <Compression/CompressionCodecMultiple.h>
#include <Compression/CompressionInfo.h>
#include <Common/PODArray.h>
#include <base/unaligned.h>
#include <Compression/CompressionFactory.h>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTFunction.h>
#include <IO/WriteHelpers.h>
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>
#include <base/hex.h>

namespace DB
@ -88,14 +83,34 @@ void CompressionCodecMultiple::doDecompressData(const char * source, UInt32 sour
const auto codec = CompressionCodecFactory::instance().get(compression_method);
auto additional_size_at_the_end_of_buffer = codec->getAdditionalSizeAtTheEndOfBuffer();

compressed_buf.resize(compressed_buf.size() + additional_size_at_the_end_of_buffer);
if (compressed_buf.size() >= 1_GiB)
throw Exception(decompression_error_code, "Too large compressed size: {}", compressed_buf.size());
{
UInt32 bytes_to_resize;
if (common::addOverflow(static_cast<UInt32>(compressed_buf.size()), additional_size_at_the_end_of_buffer, bytes_to_resize))
throw Exception(decompression_error_code, "Too large compressed size: {}", compressed_buf.size());
compressed_buf.resize(compressed_buf.size() + additional_size_at_the_end_of_buffer);
}
UInt32 uncompressed_size = readDecompressedBlockSize(compressed_buf.data());
if (uncompressed_size >= 1_GiB)
throw Exception(decompression_error_code, "Too large uncompressed size: {}", uncompressed_size);
if (idx == 0 && uncompressed_size != decompressed_size)
    throw Exception(decompression_error_code, "Wrong final decompressed size in codec Multiple, got {}, expected {}",
        uncompressed_size, decompressed_size);

uncompressed_buf.resize(uncompressed_size + additional_size_at_the_end_of_buffer);
{
UInt32 bytes_to_resize;
if (common::addOverflow(uncompressed_size, additional_size_at_the_end_of_buffer, bytes_to_resize))
throw Exception(decompression_error_code, "Too large uncompressed size: {}", uncompressed_size);
uncompressed_buf.resize(bytes_to_resize);
}
codec->decompress(compressed_buf.data(), source_size, uncompressed_buf.data());
uncompressed_buf.swap(compressed_buf);
source_size = uncompressed_size;

View File

@ -27,8 +27,12 @@ UInt32 CompressionCodecNone::doCompressData(const char * source, UInt32 source_s
    return source_size;
}

void CompressionCodecNone::doDecompressData(const char * source, UInt32 /*source_size*/, char * dest, UInt32 uncompressed_size) const
void CompressionCodecNone::doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const
{
if (source_size != uncompressed_size)
throw Exception(decompression_error_code, "Wrong data for compression codec NONE: source_size ({}) != uncompressed_size ({})",
source_size, uncompressed_size);
    memcpy(dest, source, uncompressed_size);
}

View File

@ -18,9 +18,7 @@ public:
void updateHash(SipHash & hash) const override;

protected:
UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override;
void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override;
bool isCompression() const override { return false; }

View File

@ -217,6 +217,7 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const SnapshotFileInfo & snapsh
}

/// To avoid reference to binding
const auto & snapshot_path_ref = snapshot_path;
SCOPE_EXIT( SCOPE_EXIT(

View File

@ -440,10 +440,11 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
}

ProfileEvents::increment(ProfileEvents::KeeperCommits);
keeper_context->setLastCommitIndex(log_idx);

if (commit_callback)
    commit_callback(log_idx, *request_for_session);
keeper_context->setLastCommitIndex(log_idx);
}
catch (...)
{

View File

@ -382,4 +382,9 @@ std::shared_ptr<zkutil::ZooKeeper> Context::getZooKeeper() const
    throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Cannot connect to ZooKeeper from Keeper");
}
const ServerSettings & Context::getServerSettings() const
{
return shared->server_settings;
}
}

View File

@ -11,6 +11,7 @@
#include <Disks/IO/getThreadPoolReader.h>
#include <Core/Settings.h>
#include <Core/ServerSettings.h>
#include <Core/BackgroundSchedulePool.h>
#include <IO/AsyncReadCounters.h>
@ -160,6 +161,8 @@ public:
void updateKeeperConfiguration(const Poco::Util::AbstractConfiguration & config);
zkutil::ZooKeeperPtr getZooKeeper() const;

const ServerSettings & getServerSettings() const;
};

}

View File

@ -177,7 +177,6 @@ using BlockPtr = std::shared_ptr<Block>;
using Blocks = std::vector<Block>;
using BlocksList = std::list<Block>;
using BlocksPtr = std::shared_ptr<Blocks>;
using BlocksPtrs = std::shared_ptr<std::vector<BlocksPtr>>;

/// Extends block with extra data in derived classes
struct ExtraBlock

View File

@ -70,6 +70,15 @@ static constexpr auto DBMS_DEFAULT_MAX_QUERY_SIZE = 262144;
/// Max depth of hierarchical dictionary
static constexpr auto DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH = 1000;
#ifdef OS_LINUX
#define DBMS_DEFAULT_PAGE_CACHE_USE_MADV_FREE true
#else
/// On Mac OS, MADV_FREE is not lazy, so page_cache_use_madv_free should be disabled.
/// On FreeBSD, it may work but we haven't tested it.
#define DBMS_DEFAULT_PAGE_CACHE_USE_MADV_FREE false
#endif
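[Editor's note: a hedged background sketch, not from this diff — the mechanism these defaults gate. With page_cache_use_madv_free enabled, unpinned chunks are advised away with MADV_FREE; the kernel may lazily reclaim the pages, and reclaimed pages read back zero-filled, which is how PageCache's page-probing detects eviction.]

#include <sys/mman.h>

/// Returns false if the kernel rejected the hint (e.g. MADV_FREE unsupported).
bool markReclaimable(char * data, size_t size)
{
    /// Reads still succeed after this call; a later write cancels the hint.
    return 0 == madvise(data, size, MADV_FREE);
}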
/// Default maximum (total and entry) sizes and policies of various caches
static constexpr auto DEFAULT_UNCOMPRESSED_CACHE_POLICY = "SLRU";
static constexpr auto DEFAULT_UNCOMPRESSED_CACHE_MAX_SIZE = 0_MiB;

View File

@ -9,6 +9,11 @@
#include <Common/OpenSSLHelpers.h>
#include <base/scope_guard.h>
#include <base/defines.h>
#include <string_view>
using namespace std::literals;
namespace DB
{

View File

@ -65,7 +65,7 @@ namespace DB
M(UInt64, max_concurrent_insert_queries, 0, "Maximum number of concurrently INSERT queries. Zero means unlimited.", 0) \
M(UInt64, max_concurrent_select_queries, 0, "Maximum number of concurrently SELECT queries. Zero means unlimited.", 0) \
\
M(Double, cache_size_to_ram_max_ratio, 0.5, "Set cache size ro RAM max ratio. Allows to lower cache size on low-memory systems.", 0) \
M(Double, cache_size_to_ram_max_ratio, 0.5, "Set cache size to RAM max ratio. Allows to lower cache size on low-memory systems.", 0) \
M(String, uncompressed_cache_policy, DEFAULT_UNCOMPRESSED_CACHE_POLICY, "Uncompressed cache policy name.", 0) \
M(UInt64, uncompressed_cache_size, DEFAULT_UNCOMPRESSED_CACHE_MAX_SIZE, "Size of cache for uncompressed blocks. Zero means disabled.", 0) \
M(Double, uncompressed_cache_size_ratio, DEFAULT_UNCOMPRESSED_CACHE_SIZE_RATIO, "The size of the protected queue in the uncompressed cache relative to the cache's total size.", 0) \
@ -78,6 +78,11 @@ namespace DB
M(String, index_mark_cache_policy, DEFAULT_INDEX_MARK_CACHE_POLICY, "Secondary index mark cache policy name.", 0) \
M(UInt64, index_mark_cache_size, DEFAULT_INDEX_MARK_CACHE_MAX_SIZE, "Size of cache for secondary index marks. Zero means disabled.", 0) \
M(Double, index_mark_cache_size_ratio, DEFAULT_INDEX_MARK_CACHE_SIZE_RATIO, "The size of the protected queue in the secondary index mark cache relative to the cache's total size.", 0) \
M(UInt64, page_cache_chunk_size, 2 << 20, "Bytes per chunk in userspace page cache. Rounded up to a multiple of page size (typically 4 KiB) or huge page size (typically 2 MiB, only if page_cache_use_thp is enabled).", 0) \
M(UInt64, page_cache_mmap_size, 1 << 30, "Bytes per memory mapping in userspace page cache. Not important.", 0) \
M(UInt64, page_cache_size, 10ul << 30, "Amount of virtual memory to map for userspace page cache. If page_cache_use_madv_free is enabled, it's recommended to set this higher than the machine's RAM size. Use 0 to disable userspace page cache.", 0) \
M(Bool, page_cache_use_madv_free, DBMS_DEFAULT_PAGE_CACHE_USE_MADV_FREE, "If true, the userspace page cache will allow the OS to automatically reclaim memory from the cache on memory pressure (using MADV_FREE).", 0) \
M(Bool, page_cache_use_transparent_huge_pages, true, "Userspace will attempt to use transparent huge pages on Linux. This is best-effort.", 0) \
M(UInt64, mmap_cache_size, DEFAULT_MMAP_CACHE_MAX_SIZE, "A cache for mmapped files.", 0) \
\
M(Bool, disable_internal_dns_cache, false, "Disable internal DNS caching at all.", 0) \

View File

@ -777,6 +777,10 @@ class IColumn;
M(Bool, throw_on_error_from_cache_on_write_operations, false, "Ignore error from cache when caching on write operations (INSERT, merges)", 0) \
M(UInt64, filesystem_cache_segments_batch_size, 20, "Limit on size of a single batch of file segments that a read buffer can request from cache. Too low value will lead to excessive requests to cache, too large may slow down eviction from cache", 0) \
\
M(Bool, use_page_cache_for_disks_without_file_cache, false, "Use userspace page cache for remote disks that don't have filesystem cache enabled.", 0) \
M(Bool, read_from_page_cache_if_exists_otherwise_bypass_cache, false, "Use userspace page cache in passive mode, similar to read_from_filesystem_cache_if_exists_otherwise_bypass_cache.", 0) \
M(Bool, page_cache_inject_eviction, false, "Userspace page cache will sometimes invalidate some pages at random. Intended for testing.", 0) \
\
M(Bool, load_marks_asynchronously, false, "Load MergeTree marks asynchronously", 0) \
M(Bool, enable_filesystem_read_prefetches_log, false, "Log to system.filesystem prefetch_log during query. Should be used only for testing or debugging, not recommended to be turned on by default", 0) \
M(Bool, allow_prefetched_read_pool_for_remote_filesystem, true, "Prefer prefetched threadpool if all parts are on remote filesystem", 0) \
@ -875,10 +879,9 @@ class IColumn;
M(SQLSecurityType, default_normal_view_sql_security, SQLSecurityType::INVOKER, "Allows to set a default value for SQL SECURITY option when creating a normal view.", 0) \
M(SQLSecurityType, default_materialized_view_sql_security, SQLSecurityType::DEFINER, "Allows to set a default value for SQL SECURITY option when creating a materialized view.", 0) \
M(String, default_view_definer, "CURRENT_USER", "Allows to set a default value for DEFINER option when creating view.", 0) \
M(Bool, allow_experimental_shared_merge_tree, false, "Only available in ClickHouse Cloud", 0) \
M(UInt64, cache_warmer_threads, 4, "Only available in ClickHouse Cloud", 0) \
M(Int64, ignore_cold_parts_seconds, 0, "Only available in ClickHouse Cloud", 0) \
M(Int64, prefer_warmed_unmerged_parts_seconds, 0, "Only available in ClickHouse Cloud", 0) \
M(UInt64, cache_warmer_threads, 4, "Only available in ClickHouse Cloud. Number of background threads for speculatively downloading new data parts into file cache, when cache_populated_by_fetch is enabled. Zero to disable.", 0) \
M(Int64, ignore_cold_parts_seconds, 0, "Only available in ClickHouse Cloud. Exclude new data parts from SELECT queries until they're either pre-warmed (see cache_populated_by_fetch) or this many seconds old. Only for Replicated-/SharedMergeTree.", 0) \
M(Int64, prefer_warmed_unmerged_parts_seconds, 0, "Only available in ClickHouse Cloud. If a merged part is less than this many seconds old and is not pre-warmed (see cache_populated_by_fetch), but all its source parts are available and pre-warmed, SELECT queries will read from those parts instead. Only for ReplicatedMergeTree. Note that this only checks whether CacheWarmer processed the part; if the part was fetched into cache by something else, it'll still be considered cold until CacheWarmer gets to it; if it was warmed, then evicted from cache, it'll still be considered warm.", 0) \
M(Bool, iceberg_engine_ignore_schema_evolution, false, "Ignore schema evolution in Iceberg table engine and read all data using latest schema saved on table creation. Note that it can lead to incorrect result", 0) \
// End of COMMON_SETTINGS // End of COMMON_SETTINGS
@ -902,6 +905,7 @@ class IColumn;
MAKE_OBSOLETE(M, Bool, allow_experimental_geo_types, true) \
MAKE_OBSOLETE(M, Bool, allow_experimental_query_cache, true) \
MAKE_OBSOLETE(M, Bool, allow_experimental_alter_materialized_view_structure, true) \
MAKE_OBSOLETE(M, Bool, allow_experimental_shared_merge_tree, true) \
\
MAKE_OBSOLETE(M, Milliseconds, async_insert_stale_timeout_ms, 0) \
MAKE_OBSOLETE(M, StreamingHandleErrorMode, handle_kafka_error_mode, StreamingHandleErrorMode::DEFAULT) \
@ -1018,6 +1022,7 @@ class IColumn;
M(Bool, input_format_json_read_objects_as_strings, true, "Allow to parse JSON objects as strings in JSON input formats", 0) \
M(Bool, input_format_json_read_arrays_as_strings, true, "Allow to parse JSON arrays as strings in JSON input formats", 0) \
M(Bool, input_format_json_try_infer_named_tuples_from_objects, true, "Try to infer named tuples from JSON objects in JSON input formats", 0) \
M(Bool, input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects, false, "Use String type instead of an exception in case of ambiguous paths in JSON objects during named tuples inference", 0) \
M(Bool, input_format_json_infer_incomplete_types_as_strings, true, "Use type String for keys that contains only Nulls or empty objects/arrays during schema inference in JSON input formats", 0) \
M(Bool, input_format_json_named_tuples_as_objects, true, "Deserialize named tuple columns as JSON objects", 0) \
M(Bool, input_format_json_ignore_unknown_keys_in_named_tuple, true, "Ignore unknown keys in json object for named tuples", 0) \
@ -1025,7 +1030,7 @@ class IColumn;
M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_exponent_floats, false, "Try to infer floats in exponential notation while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_exponent_floats, false, "Try to infer floats in exponential notation while schema inference in text formats (except JSON, where exponent numbers are always inferred)", 0) \
M(Bool, output_format_markdown_escape_special_characters, false, "Escape special characters in Markdown", 0) \
M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \
M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized", 0) \

View File

@ -85,11 +85,14 @@ namespace SettingsChangesHistory
/// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972)
static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> settings_changes_history =
{
    {"24.3", {
    {"24.3", {{"allow_experimental_shared_merge_tree", false, true, "The setting is obsolete"},
{"use_page_cache_for_disks_without_file_cache", false, false, "Added userspace page cache"},
{"read_from_page_cache_if_exists_otherwise_bypass_cache", false, false, "Added userspace page cache"},
{"page_cache_inject_eviction", false, false, "Added userspace page cache"},
{"input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects", false, false, "Allow to use String type for ambiguous paths during named tuple inference from JSON objects"},
{"optimize_functions_to_subcolumns", false, true, "Enable optimization by default"}, {"optimize_functions_to_subcolumns", false, true, "Enable optimization by default"},
}}, }},
{"24.2", { {"24.2", {{"allow_suspicious_variant_types", true, false, "Don't allow creating Variant type with suspicious variants by default"},
{"allow_suspicious_variant_types", true, false, "Don't allow creating Variant type with suspicious variants by default"},
{"validate_experimental_and_suspicious_types_inside_nested_types", false, true, "Validate usage of experimental and suspicious types inside nested types"}, {"validate_experimental_and_suspicious_types_inside_nested_types", false, true, "Validate usage of experimental and suspicious types inside nested types"},
{"output_format_values_escape_quote_with_quote", false, false, "If true escape ' with '', otherwise quoted with \\'"}, {"output_format_values_escape_quote_with_quote", false, false, "If true escape ' with '', otherwise quoted with \\'"},
{"output_format_pretty_single_large_number_tip_threshold", 0, 1'000'000, "Print a readable number tip on the right side of the table if the block consists of a single number which exceeds this value (except 0)"}, {"output_format_pretty_single_large_number_tip_threshold", 0, 1'000'000, "Print a readable number tip on the right side of the table if the block consists of a single number which exceeds this value (except 0)"},

View File

@ -141,6 +141,7 @@ enum class DefaultTableEngine
DECLARE_SETTING_ENUM(DefaultTableEngine)

enum class CleanDeletedRows
{
    Never = 0, /// Disable.

View File

@ -8,7 +8,9 @@
#include <IO/ReadHelpers.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteHelpers.h>

#include <boost/algorithm/string/predicate.hpp>
#include <cctz/time_zone.h>

#include <cmath>
@ -544,6 +546,13 @@ void SettingFieldTimezone::readBinary(ReadBuffer & in)
    *this = std::move(str);
}
void SettingFieldTimezone::validateTimezone(const std::string & tz_str)
{
cctz::time_zone validated_tz;
if (!tz_str.empty() && !cctz::load_time_zone(tz_str, &validated_tz))
throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Invalid time zone: {}", tz_str);
}
String SettingFieldCustom::toString() const
{
    return value.dump();

View File

@ -6,7 +6,6 @@
#include <Core/Field.h>
#include <Core/MultiEnum.h>
#include <boost/range/adaptor/map.hpp>
#include <cctz/time_zone.h>
#include <chrono>
#include <unordered_map>
#include <string_view>
@ -608,12 +607,7 @@ struct SettingFieldTimezone
void readBinary(ReadBuffer & in);

private:
    void validateTimezone(const std::string & tz_str)
    void validateTimezone(const std::string & tz_str);
{
cctz::time_zone validated_tz;
if (!tz_str.empty() && !cctz::load_time_zone(tz_str, &validated_tz))
throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Invalid time zone: {}", tz_str);
}
};

/// Can keep a value of any type. Used for user-defined settings.

View File

@ -417,4 +417,3 @@ void ISerialization::throwUnexpectedDataAfterParsedValue(IColumn & column, ReadB
}

}

View File

@ -474,16 +474,18 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
type_ids.insert(type->getTypeId());

/// For String and FixedString, or for different FixedStrings, the common type is String.
/// No other types are compatible with Strings. TODO Enums?
/// If there are Enums and any type of Strings, the common type is String.
/// No other types are compatible with Strings.
{
    size_t have_string = type_ids.count(TypeIndex::String);
    size_t have_fixed_string = type_ids.count(TypeIndex::FixedString);
    size_t have_enums = type_ids.count(TypeIndex::Enum8) + type_ids.count(TypeIndex::Enum16);

    if (have_string || have_fixed_string)
    {
        bool all_strings = type_ids.size() == (have_string + have_fixed_string);
        bool all_compatible_with_string = type_ids.size() == (have_string + have_fixed_string + have_enums);
        if (!all_strings)
        if (!all_compatible_with_string)
            return throwOrReturn<on_error>(types, "because some of them are String/FixedString and some of them are not", ErrorCodes::NO_COMMON_TYPE);
            return throwOrReturn<on_error>(types, "because some of them are String/FixedString/Enum and some of them are not", ErrorCodes::NO_COMMON_TYPE);

        return std::make_shared<DataTypeString>();
    }
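[Editor's note: a hedged illustration, not part of the diff, of the behavioral change — mixing an Enum with a String now yields String instead of throwing NO_COMMON_TYPE. Namespace qualification is assumed.]

DB::DataTypes types = {
    std::make_shared<DB::DataTypeEnum8>(DB::DataTypeEnum8::Values{{"a", 1}, {"b", 2}}),
    std::make_shared<DB::DataTypeString>(),
};
DB::DataTypePtr common = DB::getLeastSupertype(types); /// now DataTypeString; previously an error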

View File

@ -129,6 +129,7 @@ void AsynchronousBoundedReadBuffer::setReadUntilPosition(size_t position)
/// new read until position is after the current position in the working buffer
file_offset_of_buffer_end = position;
working_buffer.resize(working_buffer.size() - (file_offset_of_buffer_end - position));
pos = std::min(pos, working_buffer.end());
}
else
{
@ -235,9 +236,6 @@ bool AsynchronousBoundedReadBuffer::nextImpl()
file_offset_of_buffer_end = impl->getFileOffsetOfBufferEnd(); file_offset_of_buffer_end = impl->getFileOffsetOfBufferEnd();
/// In case of multiple files for the same file in clickhouse (i.e. log family)
/// file_offset_of_buffer_end will not match getImplementationBufferOffset()
/// so we use [impl->getImplementationBufferOffset(), impl->getFileSize()]
chassert(file_offset_of_buffer_end <= impl->getFileSize()); chassert(file_offset_of_buffer_end <= impl->getFileSize());
if (read_until_position && (file_offset_of_buffer_end > *read_until_position)) if (read_until_position && (file_offset_of_buffer_end > *read_until_position))
@ -264,7 +262,7 @@ off_t AsynchronousBoundedReadBuffer::seek(off_t offset, int whence)
size_t new_pos; size_t new_pos;
if (whence == SEEK_SET) if (whence == SEEK_SET)
{ {
assert(offset >= 0); chassert(offset >= 0);
new_pos = offset; new_pos = offset;
} }
else if (whence == SEEK_CUR) else if (whence == SEEK_CUR)
@ -290,8 +288,8 @@ off_t AsynchronousBoundedReadBuffer::seek(off_t offset, int whence)
/// Position is still inside the buffer. /// Position is still inside the buffer.
/// Probably it is at the end of the buffer - then we will load data on the following 'next' call. /// Probably it is at the end of the buffer - then we will load data on the following 'next' call.
pos = working_buffer.end() - file_offset_of_buffer_end + new_pos; pos = working_buffer.end() - file_offset_of_buffer_end + new_pos;
assert(pos >= working_buffer.begin()); chassert(pos >= working_buffer.begin());
assert(pos <= working_buffer.end()); chassert(pos <= working_buffer.end());
return new_pos; return new_pos;
} }
@ -317,7 +315,7 @@ off_t AsynchronousBoundedReadBuffer::seek(off_t offset, int whence)
break; break;
} }
assert(!prefetch_future.valid()); chassert(!prefetch_future.valid());
/// First reset the buffer so the next read will fetch new data to the buffer. /// First reset the buffer so the next read will fetch new data to the buffer.
resetWorkingBuffer(); resetWorkingBuffer();
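Shrinking the working buffer can leave pos dangling past the new end, which the added std::min clamp prevents; the assert-to-chassert swaps keep the same invariants checked in builds where ClickHouse's consistency checks are enabled. A minimal sketch of the clamped invariant (raw pointers instead of BufferBase, names illustrative):

#include <algorithm>
#include <cassert>
#include <cstddef>

struct Buf
{
    char * begin;
    char * end;   // one past the last valid byte
    char * pos;   // current read position; invariant: begin <= pos <= end

    // Shrink the valid region to `new_size` bytes and keep the invariant.
    void resize(size_t new_size)
    {
        end = begin + new_size;
        pos = std::min(pos, end);  // same idea as pos = std::min(pos, working_buffer.end())
        assert(begin <= pos && pos <= end);
    }
};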
@@ -1215,7 +1215,7 @@ size_t CachedOnDiskReadBufferFromFile::getRemainingSizeToRead()
 
 void CachedOnDiskReadBufferFromFile::setReadUntilPosition(size_t position)
 {
-    if (!allow_seeks_after_first_read)
+    if (initialized && !allow_seeks_after_first_read)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "Method `setReadUntilPosition()` not allowed");
 
     if (read_until_position == position)
@@ -5,6 +5,7 @@
 #include <Disks/IO/CachedOnDiskReadBufferFromFile.h>
 #include <Disks/ObjectStorages/Cached/CachedObjectStorage.h>
 #include <Interpreters/Cache/FileCache.h>
+#include <IO/CachedInMemoryReadBufferFromFile.h>
 #include <IO/ReadSettings.h>
 #include <IO/SwapHelper.h>
 #include <Interpreters/FilesystemCacheLog.h>
@@ -16,12 +17,16 @@ using namespace DB;
 
 namespace
 {
-    bool withCache(const ReadSettings & settings)
+    bool withFileCache(const ReadSettings & settings)
     {
         return settings.remote_fs_cache && settings.enable_filesystem_cache
             && (!CurrentThread::getQueryId().empty() || settings.read_from_filesystem_cache_if_exists_otherwise_bypass_cache
                 || !settings.avoid_readthrough_cache_outside_query_context);
     }
+
+    bool withPageCache(const ReadSettings & settings, bool with_file_cache)
+    {
+        return settings.page_cache && !with_file_cache && settings.use_page_cache_for_disks_without_file_cache;
+    }
 }
 
 namespace DB
@@ -34,7 +39,7 @@ namespace ErrorCodes
 size_t chooseBufferSizeForRemoteReading(const DB::ReadSettings & settings, size_t file_size)
 {
     /// Only when cache is used we could download bigger portions of FileSegments than what we actually gonna read within particular task.
-    if (!withCache(settings))
+    if (!withFileCache(settings))
        return settings.remote_fs_buffer_size;
 
     /// Buffers used for prefetch and pre-download better to have enough size, but not bigger than the whole file.
@@ -44,27 +49,30 @@ size_t chooseBufferSizeForRemoteReading(const DB::ReadSettings & settings, size_
 ReadBufferFromRemoteFSGather::ReadBufferFromRemoteFSGather(
     ReadBufferCreator && read_buffer_creator_,
     const StoredObjects & blobs_to_read_,
+    const std::string & cache_path_prefix_,
     const ReadSettings & settings_,
     std::shared_ptr<FilesystemCacheLog> cache_log_,
     bool use_external_buffer_)
-    : ReadBufferFromFileBase(
-        use_external_buffer_ ? 0 : chooseBufferSizeForRemoteReading(settings_, getTotalSize(blobs_to_read_)), nullptr, 0)
+    : ReadBufferFromFileBase(use_external_buffer_ ? 0 : chooseBufferSizeForRemoteReading(
+        settings_, getTotalSize(blobs_to_read_)), nullptr, 0)
     , settings(settings_)
     , blobs_to_read(blobs_to_read_)
     , read_buffer_creator(std::move(read_buffer_creator_))
+    , cache_path_prefix(cache_path_prefix_)
     , cache_log(settings.enable_filesystem_cache_log ? cache_log_ : nullptr)
     , query_id(CurrentThread::getQueryId())
     , use_external_buffer(use_external_buffer_)
-    , with_cache(withCache(settings))
+    , with_file_cache(withFileCache(settings))
+    , with_page_cache(withPageCache(settings, with_file_cache))
     , log(getLogger("ReadBufferFromRemoteFSGather"))
 {
     if (!blobs_to_read.empty())
         current_object = blobs_to_read.front();
 }
 
-SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(const StoredObject & object)
+SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(const StoredObject & object, size_t start_offset)
 {
-    if (current_buf && !with_cache)
+    if (current_buf && !with_file_cache)
     {
         appendUncachedReadInfo();
     }
@@ -72,30 +80,45 @@ SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(c
     current_object = object;
     const auto & object_path = object.remote_path;
 
-    size_t current_read_until_position = read_until_position ? read_until_position : object.bytes_size;
-    auto current_read_buffer_creator = [=, this]() { return read_buffer_creator(object_path, current_read_until_position); };
+    std::unique_ptr<ReadBufferFromFileBase> buf;
 
 #ifndef CLICKHOUSE_KEEPER_STANDALONE_BUILD
-    if (with_cache)
+    if (with_file_cache)
     {
         auto cache_key = settings.remote_fs_cache->createKeyForPath(object_path);
-        return std::make_shared<CachedOnDiskReadBufferFromFile>(
+        buf = std::make_unique<CachedOnDiskReadBufferFromFile>(
             object_path,
             cache_key,
             settings.remote_fs_cache,
             FileCache::getCommonUser(),
-            std::move(current_read_buffer_creator),
+            [=, this]() { return read_buffer_creator(/* restricted_seek */true, object_path); },
             settings,
             query_id,
             object.bytes_size,
             /* allow_seeks */false,
             /* use_external_buffer */true,
-            read_until_position ? std::optional<size_t>(read_until_position) : std::nullopt,
+            /* read_until_position */std::nullopt,
             cache_log);
     }
 #endif
 
-    return current_read_buffer_creator();
+    /// Can't wrap CachedOnDiskReadBufferFromFile in CachedInMemoryReadBufferFromFile because the
+    /// former doesn't support seeks.
+    if (with_page_cache && !buf)
+    {
+        auto inner = read_buffer_creator(/* restricted_seek */false, object_path);
+        auto cache_key = FileChunkAddress { .path = cache_path_prefix + object_path };
+        buf = std::make_unique<CachedInMemoryReadBufferFromFile>(
+            cache_key, settings.page_cache, std::move(inner), settings);
+    }
+
+    if (!buf)
+        buf = read_buffer_creator(/* restricted_seek */true, object_path);
+
+    if (read_until_position > start_offset && read_until_position < start_offset + object.bytes_size)
+        buf->setReadUntilPosition(read_until_position - start_offset);
+
+    return buf;
 }
 
 void ReadBufferFromRemoteFSGather::appendUncachedReadInfo()
@@ -124,12 +147,12 @@ void ReadBufferFromRemoteFSGather::initialize()
         return;
 
     /// One clickhouse file can be split into multiple files in remote fs.
-    auto current_buf_offset = file_offset_of_buffer_end;
+    size_t start_offset = 0;
     for (size_t i = 0; i < blobs_to_read.size(); ++i)
     {
         const auto & object = blobs_to_read[i];
 
-        if (object.bytes_size > current_buf_offset)
+        if (start_offset + object.bytes_size > file_offset_of_buffer_end)
         {
             LOG_TEST(log, "Reading from file: {} ({})", object.remote_path, object.local_path);
@@ -137,14 +160,14 @@ void ReadBufferFromRemoteFSGather::initialize()
             if (!current_buf || current_buf_idx != i)
             {
                 current_buf_idx = i;
-                current_buf = createImplementationBuffer(object);
+                current_buf = createImplementationBuffer(object, start_offset);
             }
 
-            current_buf->seek(current_buf_offset, SEEK_SET);
+            current_buf->seek(file_offset_of_buffer_end - start_offset, SEEK_SET);
             return;
         }
 
-        current_buf_offset -= object.bytes_size;
+        start_offset += object.bytes_size;
     }
 
     current_buf_idx = blobs_to_read.size();
     current_buf = nullptr;
@@ -171,14 +194,14 @@ bool ReadBufferFromRemoteFSGather::nextImpl()
 bool ReadBufferFromRemoteFSGather::moveToNextBuffer()
 {
     /// If there is no available buffers - nothing to read.
-    if (current_buf_idx + 1 >= blobs_to_read.size())
+    if (current_buf_idx + 1 >= blobs_to_read.size() || (read_until_position && file_offset_of_buffer_end >= read_until_position))
         return false;
 
     ++current_buf_idx;
 
     const auto & object = blobs_to_read[current_buf_idx];
     LOG_TEST(log, "Reading from next file: {} ({})", object.remote_path, object.local_path);
 
-    current_buf = createImplementationBuffer(object);
+    current_buf = createImplementationBuffer(object, file_offset_of_buffer_end);
 
     return true;
 }
@@ -263,7 +286,7 @@ off_t ReadBufferFromRemoteFSGather::seek(off_t offset, int whence)
 ReadBufferFromRemoteFSGather::~ReadBufferFromRemoteFSGather()
 {
-    if (!with_cache)
+    if (!with_file_cache)
         appendUncachedReadInfo();
 }
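With this change the gather buffer can layer up to three readers per blob: the on-disk file cache takes precedence, otherwise the new in-memory page cache wraps the raw reader, otherwise the raw reader is used alone (the file-cache reader can't be wrapped because it doesn't support seeks). A hedged sketch of that precedence, with all type names as stand-ins:

#include <memory>

struct Settings { bool file_cache = false; bool page_cache = false; };
struct Reader { virtual ~Reader() = default; };
struct RawReader : Reader {};
struct FileCacheReader : Reader {};           // reads through the on-disk cache
struct PageCacheReader : Reader               // wraps an inner reader with an in-memory cache
{
    explicit PageCacheReader(std::unique_ptr<Reader> inner_) : inner(std::move(inner_)) {}
    std::unique_ptr<Reader> inner;
};

// Precedence: file cache > page cache > plain reader, mirroring createImplementationBuffer().
std::unique_ptr<Reader> make_reader(const Settings & s)
{
    if (s.file_cache)
        return std::make_unique<FileCacheReader>();   // can't be wrapped: no seek support
    if (s.page_cache)
        return std::make_unique<PageCacheReader>(std::make_unique<RawReader>());
    return std::make_unique<RawReader>();
}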
@@ -21,11 +21,12 @@ class ReadBufferFromRemoteFSGather final : public ReadBufferFromFileBase
 friend class ReadIndirectBufferFromRemoteFS;
 
 public:
-    using ReadBufferCreator = std::function<std::unique_ptr<ReadBufferFromFileBase>(const std::string & path, size_t read_until_position)>;
+    using ReadBufferCreator = std::function<std::unique_ptr<ReadBufferFromFileBase>(bool restricted_seek, const std::string & path)>;
 
     ReadBufferFromRemoteFSGather(
         ReadBufferCreator && read_buffer_creator_,
         const StoredObjects & blobs_to_read_,
+        const std::string & cache_path_prefix_,
         const ReadSettings & settings_,
         std::shared_ptr<FilesystemCacheLog> cache_log_,
         bool use_external_buffer_);
@@ -53,7 +54,7 @@ public:
     bool isContentCached(size_t offset, size_t size) override;
 
 private:
-    SeekableReadBufferPtr createImplementationBuffer(const StoredObject & object);
+    SeekableReadBufferPtr createImplementationBuffer(const StoredObject & object, size_t start_offset);
 
     bool nextImpl() override;
 
@@ -70,10 +71,12 @@ private:
     const ReadSettings settings;
     const StoredObjects blobs_to_read;
     const ReadBufferCreator read_buffer_creator;
+    const std::string cache_path_prefix;
     const std::shared_ptr<FilesystemCacheLog> cache_log;
     const String query_id;
     const bool use_external_buffer;
-    const bool with_cache;
+    const bool with_file_cache;
+    const bool with_page_cache;
 
     size_t read_until_position = 0;
     size_t file_offset_of_buffer_end = 0;
@@ -152,6 +152,8 @@ IAsynchronousReader::Result ThreadPoolRemoteFSReader::execute(Request request, b
     IAsynchronousReader::Result read_result;
     if (result)
     {
+        chassert(reader.buffer().begin() == request.buf);
+        chassert(reader.buffer().end() <= request.buf + request.size);
         read_result.size = reader.buffer().size();
         read_result.offset = reader.offset();
         ProfileEvents::increment(ProfileEvents::ThreadpoolReaderReadBytes, read_result.size);
@@ -29,6 +29,9 @@ private:
 class RemoteFSFileDescriptor : public IAsynchronousReader::IFileDescriptor
 {
 public:
+    /// `reader_` implementation must ensure that next() places data at the start of internal_buffer,
+    /// even if there was previously a seek. I.e. seek() shouldn't leave pending data (no short seek
+    /// optimization), and nextImpl() shouldn't assign nextimpl_working_buffer_offset.
     explicit RemoteFSFileDescriptor(
         SeekableReadBuffer & reader_,
         std::shared_ptr<AsyncReadCounters> async_read_counters_)
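The two chassert checks added above in ThreadPoolRemoteFSReader::execute() enforce exactly this contract: the reader must deposit data at the very start of the request buffer and never past its end. A hedged standalone restatement of the invariant (names are illustrative, not ClickHouse's API):

#include <cassert>
#include <cstddef>

struct Produced { char * data; size_t size; };

// Contract behind the new chasserts: no leftover offset from a short-seek
// optimization at the front, and no overrun at the back.
void check_read_contract(const Produced & p, char * request_buf, size_t request_size)
{
    assert(p.data == request_buf);                       // data starts at buffer begin
    assert(p.data + p.size <= request_buf + request_size);  // data fits in the buffer
}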
@@ -206,7 +206,7 @@ std::unique_ptr<ReadBufferFromFileBase> AzureObjectStorage::readObjects( /// NOL
     auto read_buffer_creator =
         [this, settings_ptr, disk_read_settings]
-        (const std::string & path, size_t read_until_position) -> std::unique_ptr<ReadBufferFromFileBase>
+        (bool restricted_seek, const std::string & path) -> std::unique_ptr<ReadBufferFromFileBase>
     {
         return std::make_unique<ReadBufferFromAzureBlobStorage>(
             client.get(),
@@ -215,8 +215,7 @@ std::unique_ptr<ReadBufferFromFileBase> AzureObjectStorage::readObjects( /// NOL
             settings_ptr->max_single_read_retries,
             settings_ptr->max_single_download_retries,
             /* use_external_buffer */true,
-            /* restricted_seek */true,
-            read_until_position);
+            restricted_seek);
     };
 
     switch (read_settings.remote_fs_method)
@@ -226,16 +225,17 @@ std::unique_ptr<ReadBufferFromFileBase> AzureObjectStorage::readObjects( /// NOL
             return std::make_unique<ReadBufferFromRemoteFSGather>(
                 std::move(read_buffer_creator),
                 objects,
+                "azure:",
                 disk_read_settings,
                 global_context->getFilesystemCacheLog(),
                 /* use_external_buffer */false);
         }
         case RemoteFSReadMethod::threadpool:
         {
             auto impl = std::make_unique<ReadBufferFromRemoteFSGather>(
                 std::move(read_buffer_creator),
                 objects,
+                "azure:",
                 disk_read_settings,
                 global_context->getFilesystemCacheLog(),
                 /* use_external_buffer */true);
@@ -527,10 +527,9 @@ std::unique_ptr<ReadBufferFromFileBase> DiskObjectStorage::readFile(
     std::optional<size_t> read_hint,
     std::optional<size_t> file_size) const
 {
-    auto storage_objects = metadata_storage->getStorageObjects(path);
+    const auto storage_objects = metadata_storage->getStorageObjects(path);
 
     const bool file_can_be_empty = !file_size.has_value() || *file_size == 0;
-
     if (storage_objects.empty() && file_can_be_empty)
         return std::make_unique<ReadBufferFromEmptyFile>();
@@ -60,7 +60,7 @@ std::unique_ptr<ReadBufferFromFileBase> HDFSObjectStorage::readObjects( /// NOLI
     auto disk_read_settings = patchSettings(read_settings);
     auto read_buffer_creator =
         [this, disk_read_settings]
-        (const std::string & path, size_t /* read_until_position */) -> std::unique_ptr<ReadBufferFromFileBase>
+        (bool /* restricted_seek */, const std::string & path) -> std::unique_ptr<ReadBufferFromFileBase>
     {
         size_t begin_of_path = path.find('/', path.find("//") + 2);
         auto hdfs_path = path.substr(begin_of_path);
@@ -71,7 +71,7 @@ std::unique_ptr<ReadBufferFromFileBase> HDFSObjectStorage::readObjects( /// NOLI
     };
 
     return std::make_unique<ReadBufferFromRemoteFSGather>(
-        std::move(read_buffer_creator), objects, disk_read_settings, nullptr, /* use_external_buffer */false);
+        std::move(read_buffer_creator), objects, "hdfs:", disk_read_settings, nullptr, /* use_external_buffer */false);
 }
 
 std::unique_ptr<WriteBufferFromFileBase> HDFSObjectStorage::writeObject( /// NOLINT
@@ -47,7 +47,7 @@ std::unique_ptr<ReadBufferFromFileBase> LocalObjectStorage::readObjects( /// NOL
     auto modified_settings = patchSettings(read_settings);
     auto global_context = Context::getGlobalContextInstance();
     auto read_buffer_creator =
-        [=] (const std::string & file_path, size_t /* read_until_position */)
+        [=] (bool /* restricted_seek */, const std::string & file_path)
         -> std::unique_ptr<ReadBufferFromFileBase>
     {
         return createReadBufferFromFileBase(file_path, modified_settings, read_hint, file_size);
@@ -58,13 +58,13 @@ std::unique_ptr<ReadBufferFromFileBase> LocalObjectStorage::readObjects( /// NOL
         case RemoteFSReadMethod::read:
         {
             return std::make_unique<ReadBufferFromRemoteFSGather>(
-                std::move(read_buffer_creator), objects, modified_settings,
+                std::move(read_buffer_creator), objects, "file:", modified_settings,
                 global_context->getFilesystemCacheLog(), /* use_external_buffer */false);
         }
         case RemoteFSReadMethod::threadpool:
        {
             auto impl = std::make_unique<ReadBufferFromRemoteFSGather>(
-                std::move(read_buffer_creator), objects, modified_settings,
+                std::move(read_buffer_creator), objects, "file:", modified_settings,
                 global_context->getFilesystemCacheLog(), /* use_external_buffer */true);
 
             auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER);
@@ -171,7 +171,7 @@ std::unique_ptr<ReadBufferFromFileBase> S3ObjectStorage::readObjects( /// NOLINT
     auto read_buffer_creator =
         [this, settings_ptr, disk_read_settings]
-        (const std::string & path, size_t read_until_position) -> std::unique_ptr<ReadBufferFromFileBase>
+        (bool restricted_seek, const std::string & path) -> std::unique_ptr<ReadBufferFromFileBase>
     {
         return std::make_unique<ReadBufferFromS3>(
             client.get(),
@@ -182,8 +182,8 @@ std::unique_ptr<ReadBufferFromFileBase> S3ObjectStorage::readObjects( /// NOLINT
             disk_read_settings,
             /* use_external_buffer */true,
             /* offset */0,
-            read_until_position,
-            /* restricted_seek */true);
+            /* read_until_position */0,
+            restricted_seek);
     };
 
     switch (read_settings.remote_fs_method)
@@ -193,16 +193,17 @@ std::unique_ptr<ReadBufferFromFileBase> S3ObjectStorage::readObjects( /// NOLINT
             return std::make_unique<ReadBufferFromRemoteFSGather>(
                 std::move(read_buffer_creator),
                 objects,
+                "s3:" + uri.bucket + "/",
                 disk_read_settings,
                 global_context->getFilesystemCacheLog(),
                 /* use_external_buffer */false);
         }
         case RemoteFSReadMethod::threadpool:
         {
             auto impl = std::make_unique<ReadBufferFromRemoteFSGather>(
                 std::move(read_buffer_creator),
                 objects,
+                "s3:" + uri.bucket + "/",
                 disk_read_settings,
                 global_context->getFilesystemCacheLog(),
                 /* use_external_buffer */true);
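Across all object storages the creator callback now receives restricted_seek instead of a read-until position; the gather buffer applies setReadUntilPosition() itself, relative to each blob's start offset. A hedged sketch of the new callback shape (OpenBlob is a stand-in for ReadBufferFromS3 and friends):

#include <functional>
#include <memory>
#include <string>

struct ReadBuffer
{
    virtual ~ReadBuffer() = default;
    virtual void setReadUntilPosition(size_t) {}
};

// New shape: the backend only decides *how* to open the blob; range limits
// are applied by the caller (the gather buffer) afterwards.
using ReadBufferCreator =
    std::function<std::unique_ptr<ReadBuffer>(bool restricted_seek, const std::string & path)>;

struct OpenBlob : ReadBuffer { OpenBlob(const std::string &, bool) {} };

ReadBufferCreator make_creator()
{
    return [](bool restricted_seek, const std::string & path)
    {
        // restricted_seek == true when the buffer is driven strictly forward
        // (file-cache / plain path); false when the page cache needs free seeking.
        return std::make_unique<OpenBlob>(path, restricted_seek);
    };
}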
@@ -252,14 +252,13 @@ std::unique_ptr<ReadBufferFromFileBase> WebObjectStorage::readObject( /// NOLINT
 {
     auto read_buffer_creator =
         [this, read_settings]
-        (const std::string & path_, size_t read_until_position) -> std::unique_ptr<ReadBufferFromFileBase>
+        (bool /* restricted_seek */, const std::string & path_) -> std::unique_ptr<ReadBufferFromFileBase>
     {
         return std::make_unique<ReadBufferFromWebServer>(
             fs::path(url) / path_,
             getContext(),
             read_settings,
-            /* use_external_buffer */true,
-            read_until_position);
+            /* use_external_buffer */true);
     };
 
     auto global_context = Context::getGlobalContextInstance();
@@ -271,6 +270,7 @@ std::unique_ptr<ReadBufferFromFileBase> WebObjectStorage::readObject( /// NOLINT
             return std::make_unique<ReadBufferFromRemoteFSGather>(
                 std::move(read_buffer_creator),
                 StoredObjects{object},
+                "url:" + url + "/",
                 read_settings,
                 global_context->getFilesystemCacheLog(),
                 /* use_external_buffer */false);
@@ -280,6 +280,7 @@ std::unique_ptr<ReadBufferFromFileBase> WebObjectStorage::readObject( /// NOLINT
             auto impl = std::make_unique<ReadBufferFromRemoteFSGather>(
                 std::move(read_buffer_creator),
                 StoredObjects{object},
+                "url:" + url + "/",
                 read_settings,
                 global_context->getFilesystemCacheLog(),
                 /* use_external_buffer */true);
@@ -450,8 +450,10 @@ String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, Fo
             break;
         case FormatSettings::EscapingRule::JSON:
             result += fmt::format(
-                ", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, read_bools_as_strings={}, read_objects_as_strings={}, read_numbers_as_strings={}, "
-                "read_arrays_as_strings={}, try_infer_objects_as_tuples={}, infer_incomplete_types_as_strings={}, try_infer_objects={}",
+                ", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, read_bools_as_strings={}, read_objects_as_strings={}, "
+                "read_numbers_as_strings={}, "
+                "read_arrays_as_strings={}, try_infer_objects_as_tuples={}, infer_incomplete_types_as_strings={}, try_infer_objects={}, "
+                "use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects={}",
                 settings.json.try_infer_numbers_from_strings,
                 settings.json.read_bools_as_numbers,
                 settings.json.read_bools_as_strings,
@@ -460,7 +462,8 @@ String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, Fo
                 settings.json.read_arrays_as_strings,
                 settings.json.try_infer_objects_as_tuples,
                 settings.json.infer_incomplete_types_as_strings,
-                settings.json.allow_object_type);
+                settings.json.allow_object_type,
+                settings.json.use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects);
             break;
         default:
             break;
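This string feeds the schema-cache key (it is what getAdditionalInfoForSchemaCache returns per format), so any setting that can change the inferred schema has to be serialized into it; otherwise a cached schema would survive a settings flip. A minimal illustration of the idea with a hypothetical helper, using only fmt:

#include <fmt/format.h>
#include <string>

// Hypothetical sketch: fold an inference-affecting setting into the cache key,
// so flipping the setting produces a different key and forces re-inference.
std::string schema_cache_key(const std::string & source, bool use_string_for_ambiguous_paths)
{
    return fmt::format(
        "{}|use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects={}",
        source, use_string_for_ambiguous_paths);
}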
@@ -31,14 +31,31 @@ namespace ErrorCodes
     extern const int BAD_ARGUMENTS;
 }
 
+bool FormatFactory::exists(const String & name) const
+{
+    return dict.find(boost::to_lower_copy(name)) != dict.end();
+}
+
 const FormatFactory::Creators & FormatFactory::getCreators(const String & name) const
 {
-    auto it = dict.find(name);
+    auto it = dict.find(boost::to_lower_copy(name));
     if (dict.end() != it)
         return it->second;
     throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown format {}", name);
 }
 
+FormatFactory::Creators & FormatFactory::getOrCreateCreators(const String & name)
+{
+    String lower_case = boost::to_lower_copy(name);
+    auto it = dict.find(lower_case);
+    if (dict.end() != it)
+        return it->second;
+
+    auto & creators = dict[lower_case];
+    creators.name = name;
+    return creators;
+}
+
 FormatSettings getFormatSettings(const ContextPtr & context)
 {
     const auto & settings = context->getSettingsRef();
@@ -105,6 +122,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
     format_settings.json.write_named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects;
     format_settings.json.skip_null_value_in_named_tuples = settings.output_format_json_skip_null_value_in_named_tuples;
     format_settings.json.read_named_tuples_as_objects = settings.input_format_json_named_tuples_as_objects;
+    format_settings.json.use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = settings.input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects;
     format_settings.json.defaults_for_missing_elements_in_named_tuple = settings.input_format_json_defaults_for_missing_elements_in_named_tuple;
     format_settings.json.ignore_unknown_keys_in_named_tuple = settings.input_format_json_ignore_unknown_keys_in_named_tuple;
     format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers;
@@ -543,7 +561,7 @@ SchemaReaderPtr FormatFactory::getSchemaReader(
     const ContextPtr & context,
     const std::optional<FormatSettings> & _format_settings) const
 {
-    const auto & schema_reader_creator = dict.at(name).schema_reader_creator;
+    const auto & schema_reader_creator = getCreators(name).schema_reader_creator;
     if (!schema_reader_creator)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} doesn't support schema inference.", name);
@@ -559,7 +577,7 @@ ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader(
     const ContextPtr & context,
     const std::optional<FormatSettings> & _format_settings) const
 {
-    const auto & external_schema_reader_creator = dict.at(name).external_schema_reader_creator;
+    const auto & external_schema_reader_creator = getCreators(name).external_schema_reader_creator;
     if (!external_schema_reader_creator)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} doesn't support schema inference.", name);
@@ -570,28 +588,28 @@
 void FormatFactory::registerInputFormat(const String & name, InputCreator input_creator)
 {
     chassert(input_creator);
-    auto & creators = dict[name];
+    auto & creators = getOrCreateCreators(name);
     if (creators.input_creator || creators.random_access_input_creator)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Input format {} is already registered", name);
     creators.input_creator = std::move(input_creator);
     registerFileExtension(name, name);
-    KnownFormatNames::instance().add(name);
+    KnownFormatNames::instance().add(name, /* case_insensitive = */ true);
 }
 
 void FormatFactory::registerRandomAccessInputFormat(const String & name, RandomAccessInputCreator input_creator)
 {
     chassert(input_creator);
-    auto & creators = dict[name];
+    auto & creators = getOrCreateCreators(name);
     if (creators.input_creator || creators.random_access_input_creator)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Input format {} is already registered", name);
     creators.random_access_input_creator = std::move(input_creator);
     registerFileExtension(name, name);
-    KnownFormatNames::instance().add(name);
+    KnownFormatNames::instance().add(name, /* case_insensitive = */ true);
 }
 
 void FormatFactory::registerNonTrivialPrefixAndSuffixChecker(const String & name, NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker)
 {
-    auto & target = dict[name].non_trivial_prefix_and_suffix_checker;
+    auto & target = getOrCreateCreators(name).non_trivial_prefix_and_suffix_checker;
     if (target)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Non trivial prefix and suffix checker {} is already registered", name);
     target = std::move(non_trivial_prefix_and_suffix_checker);
@@ -599,7 +617,7 @@ void FormatFactory::registerNonTrivialPrefixAndSuffixChecker(const String & name
 
 void FormatFactory::registerAppendSupportChecker(const String & name, AppendSupportChecker append_support_checker)
 {
-    auto & target = dict[name].append_support_checker;
+    auto & target = getOrCreateCreators(name).append_support_checker;
     if (target)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Suffix checker {} is already registered", name);
     target = std::move(append_support_checker);
@@ -613,19 +631,19 @@ void FormatFactory::markFormatHasNoAppendSupport(const String & name)
 
 bool FormatFactory::checkIfFormatSupportAppend(const String & name, const ContextPtr & context, const std::optional<FormatSettings> & format_settings_)
 {
     auto format_settings = format_settings_ ? *format_settings_ : getFormatSettings(context);
-    auto & append_support_checker = dict[name].append_support_checker;
+    const auto & append_support_checker = getCreators(name).append_support_checker;
     /// By default we consider that format supports append
     return !append_support_checker || append_support_checker(format_settings);
 }
 
 void FormatFactory::registerOutputFormat(const String & name, OutputCreator output_creator)
 {
-    auto & target = dict[name].output_creator;
+    auto & target = getOrCreateCreators(name).output_creator;
     if (target)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Output format {} is already registered", name);
     target = std::move(output_creator);
     registerFileExtension(name, name);
-    KnownFormatNames::instance().add(name);
+    KnownFormatNames::instance().add(name, /* case_insensitive = */ true);
 }
 
 void FormatFactory::registerFileExtension(const String & extension, const String & format_name)
@@ -697,7 +715,7 @@ String FormatFactory::getFormatFromFileDescriptor(int fd)
 
 void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine)
 {
-    auto & target = dict[name].file_segmentation_engine_creator;
+    auto & target = getOrCreateCreators(name).file_segmentation_engine_creator;
     if (target)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: File segmentation engine {} is already registered", name);
     auto creator = [file_segmentation_engine](const FormatSettings &)
@@ -709,7 +727,7 @@ void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegm
 
 void FormatFactory::registerFileSegmentationEngineCreator(const String & name, FileSegmentationEngineCreator file_segmentation_engine_creator)
 {
-    auto & target = dict[name].file_segmentation_engine_creator;
+    auto & target = getOrCreateCreators(name).file_segmentation_engine_creator;
     if (target)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: File segmentation engine creator {} is already registered", name);
     target = std::move(file_segmentation_engine_creator);
@@ -717,7 +735,7 @@ void FormatFactory::registerFileSegmentationEngineCreator(const String & name, F
 
 void FormatFactory::registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator)
 {
-    auto & target = dict[name].schema_reader_creator;
+    auto & target = getOrCreateCreators(name).schema_reader_creator;
     if (target)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Schema reader {} is already registered", name);
     target = std::move(schema_reader_creator);
@@ -725,7 +743,7 @@ void FormatFactory::registerSchemaReader(const String & name, SchemaReaderCreato
 
 void FormatFactory::registerExternalSchemaReader(const String & name, ExternalSchemaReaderCreator external_schema_reader_creator)
 {
-    auto & target = dict[name].external_schema_reader_creator;
+    auto & target = getOrCreateCreators(name).external_schema_reader_creator;
     if (target)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Schema reader {} is already registered", name);
     target = std::move(external_schema_reader_creator);
@@ -733,7 +751,7 @@ void FormatFactory::registerExternalSchemaReader(const String & name, ExternalSc
 
 void FormatFactory::markOutputFormatSupportsParallelFormatting(const String & name)
 {
-    auto & target = dict[name].supports_parallel_formatting;
+    auto & target = getOrCreateCreators(name).supports_parallel_formatting;
     if (target)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Output format {} is already marked as supporting parallel formatting", name);
     target = true;
@@ -742,7 +760,7 @@ void FormatFactory::markOutputFormatSupportsParallelFormatting(const String & na
 
 void FormatFactory::markFormatSupportsSubsetOfColumns(const String & name)
 {
-    auto & target = dict[name].subset_of_columns_support_checker;
+    auto & target = getOrCreateCreators(name).subset_of_columns_support_checker;
     if (target)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} is already marked as supporting subset of columns", name);
     target = [](const FormatSettings &){ return true; };
@@ -750,7 +768,7 @@ void FormatFactory::markFormatSupportsSubsetOfColumns(const String & name)
 
 void FormatFactory::registerSubsetOfColumnsSupportChecker(const String & name, SubsetOfColumnsSupportChecker subset_of_columns_support_checker)
 {
-    auto & target = dict[name].subset_of_columns_support_checker;
+    auto & target = getOrCreateCreators(name).subset_of_columns_support_checker;
     if (target)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} is already marked as supporting subset of columns", name);
     target = std::move(subset_of_columns_support_checker);
@@ -758,7 +776,7 @@ void FormatFactory::registerSubsetOfColumnsSupportChecker(const String & name, S
 
 void FormatFactory::markOutputFormatPrefersLargeBlocks(const String & name)
 {
-    auto & target = dict[name].prefers_large_blocks;
+    auto & target = getOrCreateCreators(name).prefers_large_blocks;
     if (target)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} is already marked as preferring large blocks", name);
     target = true;
@@ -774,7 +792,7 @@ bool FormatFactory::checkIfFormatSupportsSubsetOfColumns(const String & name, co
 
 void FormatFactory::registerAdditionalInfoForSchemaCacheGetter(
     const String & name, AdditionalInfoForSchemaCacheGetter additional_info_for_schema_cache_getter)
 {
-    auto & target = dict[name].additional_info_for_schema_cache_getter;
+    auto & target = getOrCreateCreators(name).additional_info_for_schema_cache_getter;
     if (target)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: additional info for schema cache getter {} is already registered", name);
     target = std::move(additional_info_for_schema_cache_getter);
@@ -792,13 +810,13 @@ String FormatFactory::getAdditionalInfoForSchemaCache(const String & name, const
 
 bool FormatFactory::isInputFormat(const String & name) const
 {
-    auto it = dict.find(name);
+    auto it = dict.find(boost::to_lower_copy(name));
     return it != dict.end() && (it->second.input_creator || it->second.random_access_input_creator);
 }
 
 bool FormatFactory::isOutputFormat(const String & name) const
 {
-    auto it = dict.find(name);
+    auto it = dict.find(boost::to_lower_copy(name));
     return it != dict.end() && it->second.output_creator;
 }
@@ -827,7 +845,8 @@ bool FormatFactory::checkIfOutputFormatPrefersLargeBlocks(const String & name) c
 
 bool FormatFactory::checkParallelizeOutputAfterReading(const String & name, const ContextPtr & context) const
 {
-    if (name == "Parquet" && context->getSettingsRef().input_format_parquet_preserve_order)
+    auto format_name = boost::to_lower_copy(name);
+    if (format_name == "parquet" && context->getSettingsRef().input_format_parquet_preserve_order)
         return false;
 
     return true;
@@ -835,7 +854,7 @@ bool FormatFactory::checkParallelizeOutputAfterReading(const String & name, cons
 
 void FormatFactory::checkFormatName(const String & name) const
 {
-    auto it = dict.find(name);
+    auto it = dict.find(boost::to_lower_copy(name));
     if (it == dict.end())
         throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown format {}", name);
 }
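Format lookup becomes case-insensitive by keying the dictionary on the lower-cased name, while Creators::name keeps the first-registered spelling for display. A hedged standalone sketch of the same registry pattern:

#include <boost/algorithm/string/case_conv.hpp>
#include <map>
#include <string>

struct Creators { std::string name; /* ... creator callbacks ... */ };

// Case-insensitive registry: lookups lower-case the key; the original
// spelling survives in Creators::name (same idea as getOrCreateCreators).
struct Registry
{
    std::map<std::string, Creators> dict;

    Creators & get_or_create(const std::string & name)
    {
        std::string key = boost::to_lower_copy(name);
        auto it = dict.find(key);
        if (it != dict.end())
            return it->second;
        auto & creators = dict[key];
        creators.name = name;  // remember the first-registered spelling
        return creators;
    }

    bool exists(const std::string & name) const
    {
        return dict.find(boost::to_lower_copy(name)) != dict.end();
    }
};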
@@ -132,6 +132,7 @@ private:
 
     struct Creators
     {
+        String name;
        InputCreator input_creator;
         RandomAccessInputCreator random_access_input_creator;
         OutputCreator output_creator;
@@ -263,12 +264,14 @@ public:
     /// Check that format with specified name exists and throw an exception otherwise.
     void checkFormatName(const String & name) const;
 
+    bool exists(const String & name) const;
+
 private:
     FormatsDictionary dict;
     FileExtensionFormats file_extension_formats;
 
     const Creators & getCreators(const String & name) const;
+    Creators & getOrCreateCreators(const String & name);
 
     // Creates a ReadBuffer to give to an input format. Returns nullptr if we should use `buf` directly.
     std::unique_ptr<ReadBuffer> wrapReadBufferIfNeeded(
@@ -202,6 +202,7 @@ struct FormatSettings
         bool quote_decimals = false;
         bool escape_forward_slashes = true;
         bool read_named_tuples_as_objects = false;
+        bool use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = false;
         bool write_named_tuples_as_objects = false;
         bool skip_null_value_in_named_tuples = false;
         bool defaults_for_missing_elements_in_named_tuple = false;
@@ -136,7 +136,7 @@ namespace
         bool empty() const { return paths.empty(); }
 
-        DataTypePtr finalize() const
+        DataTypePtr finalize(bool use_string_type_for_ambiguous_paths = false) const
         {
             if (paths.empty())
                 throw Exception(ErrorCodes::ONLY_NULLS_WHILE_READING_SCHEMA, "Cannot infer named Tuple from JSON object because object is empty");
@@ -167,7 +167,7 @@ namespace
                 current_node->leaf_type = type;
             }
 
-            return root_node.getType();
+            return root_node.getType(use_string_type_for_ambiguous_paths);
         }
 
     private:
@@ -180,19 +180,8 @@ namespace
             /// Store path to this node for better exception message in case of ambiguous paths.
             String path;
 
-            DataTypePtr getType() const
+            DataTypePtr getType(bool use_string_type_for_ambiguous_paths) const
             {
-                /// Check if we have ambiguous paths.
-                /// For example:
-                /// 'a.b.c' : Int32 and 'a.b' : String
-                /// Also check if leaf type is Nothing, because the next situation is possible:
-                /// {"a" : {"b" : null}} -> 'a.b' : Nullable(Nothing)
-                /// {"a" : {"b" : {"c" : 42}}} -> 'a.b.c' : Int32
-                /// And after merge we will have ambiguous paths 'a.b.c' : Int32 and 'a.b' : Nullable(Nothing),
-                /// but it's a valid case and we should ignore path 'a.b'.
-                if (leaf_type && !isNothing(removeNullable(leaf_type)) && !nodes.empty())
-                    throw Exception(ErrorCodes::INCORRECT_DATA, "JSON objects have ambiguous paths: '{}' with type {} and '{}'", path, leaf_type->getName(), nodes.begin()->second.path);
-
                 if (nodes.empty())
                     return leaf_type;
@@ -203,10 +192,33 @@ namespace
                 for (const auto & [name, node] : nodes)
                 {
                     node_names.push_back(name);
-                    node_types.push_back(node.getType());
+                    node_types.push_back(node.getType(use_string_type_for_ambiguous_paths));
                 }
 
-                return std::make_shared<DataTypeTuple>(std::move(node_types), std::move(node_names));
+                auto tuple_type = std::make_shared<DataTypeTuple>(std::move(node_types), std::move(node_names));
+
+                /// Check if we have ambiguous paths.
+                /// For example:
+                /// 'a.b.c' : Int32 and 'a.b' : String
+                /// Also check if leaf type is Nothing, because the next situation is possible:
+                /// {"a" : {"b" : null}} -> 'a.b' : Nullable(Nothing)
+                /// {"a" : {"b" : {"c" : 42}}} -> 'a.b.c' : Int32
+                /// And after merge we will have ambiguous paths 'a.b.c' : Int32 and 'a.b' : Nullable(Nothing),
+                /// but it's a valid case and we should ignore path 'a.b'.
+                if (leaf_type && !isNothing(removeNullable(leaf_type)) && !nodes.empty())
+                {
+                    if (use_string_type_for_ambiguous_paths)
+                        return std::make_shared<DataTypeString>();
+
+                    throw Exception(
+                        ErrorCodes::INCORRECT_DATA,
+                        "JSON objects have ambiguous data: in some objects path '{}' has type '{}' and in some - '{}'. You can enable setting "
+                        "input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type "
+                        "for path '{}'",
+                        path, leaf_type->getName(), tuple_type->getName(), path);
+                }
+
+                return tuple_type;
             }
         };
@@ -866,13 +878,15 @@ namespace
             return std::make_shared<DataTypeTuple>(nested_types);
     }
 
+    template <bool is_json>
     bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings)
     {
-        if (settings.try_infer_exponent_floats)
+        if (is_json || settings.try_infer_exponent_floats)
             return tryReadFloatText(value, buf);
         return tryReadFloatTextNoExponent(value, buf);
     }
 
+    template <bool is_json>
     DataTypePtr tryInferNumber(ReadBuffer & buf, const FormatSettings & settings)
     {
         if (buf.eof())
@@ -911,7 +925,7 @@ namespace
                 buf.position() = number_start;
             }
 
-            if (tryReadFloat(tmp_float, buf, settings))
+            if (tryReadFloat<is_json>(tmp_float, buf, settings))
             {
                 if (read_int && buf.position() == int_end)
                     return std::make_shared<DataTypeInt64>();
@@ -945,7 +959,7 @@ namespace
                 peekable_buf.rollbackToCheckpoint(true);
             }
 
-            if (tryReadFloat(tmp_float, peekable_buf, settings))
+            if (tryReadFloat<is_json>(tmp_float, peekable_buf, settings))
             {
                 /// Float parsing reads no fewer bytes than integer parsing,
                 /// so position of the buffer is either the same, or further.
@@ -957,7 +971,7 @@ namespace
                 return std::make_shared<DataTypeFloat64>();
             }
         }
-        else if (tryReadFloat(tmp_float, buf, settings))
+        else if (tryReadFloat<is_json>(tmp_float, buf, settings))
         {
             return std::make_shared<DataTypeFloat64>();
         }
@@ -966,6 +980,36 @@ namespace
         return nullptr;
     }
 
+    template <bool is_json>
+    DataTypePtr tryInferNumberFromStringImpl(std::string_view field, const FormatSettings & settings)
+    {
+        ReadBufferFromString buf(field);
+
+        if (settings.try_infer_integers)
+        {
+            Int64 tmp_int;
+            if (tryReadIntText(tmp_int, buf) && buf.eof())
+                return std::make_shared<DataTypeInt64>();
+
+            /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof.
+            buf.position() = buf.buffer().begin();
+
+            /// In case of Int64 overflow, try to infer UInt64
+            UInt64 tmp_uint;
+            if (tryReadIntText(tmp_uint, buf) && buf.eof())
+                return std::make_shared<DataTypeUInt64>();
+        }
+
+        /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof.
+        buf.position() = buf.buffer().begin();
+
+        Float64 tmp;
+        if (tryReadFloat<is_json>(tmp, buf, settings) && buf.eof())
+            return std::make_shared<DataTypeFloat64>();
+
+        return nullptr;
+    }
+
     template <bool is_json>
     DataTypePtr tryInferString(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info)
     {
@@ -995,7 +1039,7 @@ namespace
         {
             if (settings.json.try_infer_numbers_from_strings)
             {
-                if (auto number_type = tryInferNumberFromString(field, settings))
+                if (auto number_type = tryInferNumberFromStringImpl<true>(field, settings))
                 {
                     json_info->numbers_parsed_from_json_strings.insert(number_type.get());
                     return number_type;
@@ -1238,7 +1282,7 @@ namespace
         }
 
         /// Number
-        return tryInferNumber(buf, settings);
+        return tryInferNumber<is_json>(buf, settings);
     }
 }
@@ -1294,7 +1338,7 @@ void transformFinalInferredJSONTypeIfNeededImpl(DataTypePtr & data_type, const F
             return;
         }
 
-        data_type = json_paths->finalize();
+        data_type = json_paths->finalize(settings.json.use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects);
         transformFinalInferredJSONTypeIfNeededImpl(data_type, settings, json_info, remain_nothing_types);
         return;
     }
@@ -1377,31 +1421,7 @@ void transformFinalInferredJSONTypeIfNeeded(DataTypePtr & data_type, const Forma
 
 DataTypePtr tryInferNumberFromString(std::string_view field, const FormatSettings & settings)
 {
-    ReadBufferFromString buf(field);
-
-    if (settings.try_infer_integers)
-    {
-        Int64 tmp_int;
-        if (tryReadIntText(tmp_int, buf) && buf.eof())
-            return std::make_shared<DataTypeInt64>();
-
-        /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof.
-        buf.position() = buf.buffer().begin();
-
-        /// In case of Int64 overflow, try to infer UInt64
-        UInt64 tmp_uint;
-        if (tryReadIntText(tmp_uint, buf) && buf.eof())
-            return std::make_shared<DataTypeUInt64>();
-    }
-
-    /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof.
-    buf.position() = buf.buffer().begin();
-
-    Float64 tmp;
-    if (tryReadFloat(tmp, buf, settings) && buf.eof())
-        return std::make_shared<DataTypeFloat64>();
-
-    return nullptr;
+    return tryInferNumberFromStringImpl<false>(field, settings);
 }
 
 DataTypePtr tryInferDateOrDateTimeFromString(std::string_view field, const FormatSettings & settings)

View File

@@ -170,7 +170,8 @@ public:
    /// DateTime, but if both operands are Dates, their type must be the same (e.g. Date - DateTime is invalid).
    using ResultDataType = Switch<
        /// Result must be Integer
-       Case<IsOperation<Operation>::div_int || IsOperation<Operation>::div_int_or_zero, DataTypeFromFieldType<typename Op::ResultType>>,
+       Case<IsOperation<Operation>::int_div || IsOperation<Operation>::int_div_or_zero,
+           std::conditional_t<IsDataTypeDecimalOrNumber<LeftDataType> && IsDataTypeDecimalOrNumber<RightDataType>, DataTypeFromFieldType<typename Op::ResultType>, InvalidType>>,
        /// Decimal cases
        Case<IsDataTypeDecimal<LeftDataType> || IsDataTypeDecimal<RightDataType>, DecimalResultDataType>,
        Case<
@@ -672,8 +673,8 @@ private:
        IsOperation<Operation>::minus;
    static constexpr bool is_multiply = IsOperation<Operation>::multiply;
    static constexpr bool is_float_division = IsOperation<Operation>::div_floating;
-   static constexpr bool is_int_division = IsOperation<Operation>::div_int ||
-       IsOperation<Operation>::div_int_or_zero;
+   static constexpr bool is_int_division = IsOperation<Operation>::int_div ||
+       IsOperation<Operation>::int_div_or_zero;
    static constexpr bool is_division = is_float_division || is_int_division;
    static constexpr bool is_compare = IsOperation<Operation>::least ||
        IsOperation<Operation>::greatest;
@@ -781,8 +782,8 @@ class FunctionBinaryArithmetic : public IFunction
    static constexpr bool is_division = IsOperation<Op>::division;
    static constexpr bool is_bit_hamming_distance = IsOperation<Op>::bit_hamming_distance;
    static constexpr bool is_modulo = IsOperation<Op>::modulo;
-   static constexpr bool is_div_int = IsOperation<Op>::div_int;
-   static constexpr bool is_div_int_or_zero = IsOperation<Op>::div_int_or_zero;
+   static constexpr bool is_int_div = IsOperation<Op>::int_div;
+   static constexpr bool is_int_div_or_zero = IsOperation<Op>::int_div_or_zero;

    ContextPtr context;
    bool check_decimal_overflow = true;
@@ -1007,11 +1008,11 @@ class FunctionBinaryArithmetic : public IFunction
        {
            function_name = "tupleModuloByNumber";
        }
-       else if constexpr (is_div_int)
+       else if constexpr (is_int_div)
        {
            function_name = "tupleIntDivByNumber";
        }
-       else if constexpr (is_div_int_or_zero)
+       else if constexpr (is_int_div_or_zero)
        {
            function_name = "tupleIntDivOrZeroByNumber";
        }
@@ -1466,7 +1467,7 @@ public:
    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & arguments) const override
    {
-       return ((IsOperation<Op>::div_int || IsOperation<Op>::modulo || IsOperation<Op>::positive_modulo) && !arguments[1].is_const)
+       return ((IsOperation<Op>::int_div || IsOperation<Op>::modulo || IsOperation<Op>::positive_modulo) && !arguments[1].is_const)
            || (IsOperation<Op>::div_floating
                && (isDecimalOrNullableDecimal(arguments[0].type) || isDecimalOrNullableDecimal(arguments[1].type)));
    }
@@ -1690,7 +1691,7 @@ public:
        if constexpr (!std::is_same_v<ResultDataType, InvalidType>)
        {
-           if constexpr (is_div_int || is_div_int_or_zero)
+           if constexpr (is_int_div || is_int_div_or_zero)
                type_res = std::make_shared<ResultDataType>();
            else if constexpr (IsDataTypeDecimal<LeftDataType> && IsDataTypeDecimal<RightDataType>)
            {
@@ -2086,7 +2087,7 @@ ColumnPtr executeStringInteger(const ColumnsWithTypeAndName & arguments, const A
                right_nullmap);
        }
        /// Here we check if we have `intDiv` or `intDivOrZero` and at least one of the arguments is decimal, because in this case originally we had result as decimal, so we need to convert result into integer after calculations
-       else if constexpr (!decimal_with_float && (is_div_int || is_div_int_or_zero) && (IsDataTypeDecimal<LeftDataType> || IsDataTypeDecimal<RightDataType>))
+       else if constexpr (!decimal_with_float && (is_int_div || is_int_div_or_zero) && (IsDataTypeDecimal<LeftDataType> || IsDataTypeDecimal<RightDataType>))
        {
            if constexpr (!std::is_same_v<DecimalResultType, InvalidType>)
@@ -2624,7 +2625,7 @@ public:
    /// Check the case when operation is divide, intDiv or modulo and denominator is Nullable(Something).
    /// For divide operation we should check only Nullable(Decimal), because only this case can throw division by zero error.
    bool division_by_nullable = !arguments[0].type->onlyNull() && !arguments[1].type->onlyNull() && arguments[1].type->isNullable()
-       && (IsOperation<Op>::div_int || IsOperation<Op>::modulo || IsOperation<Op>::positive_modulo
+       && (IsOperation<Op>::int_div || IsOperation<Op>::modulo || IsOperation<Op>::positive_modulo
            || (IsOperation<Op>::div_floating
                && (isDecimalOrNullableDecimal(arguments[0].type) || isDecimalOrNullableDecimal(arguments[1].type))));

View File

@@ -26,6 +26,7 @@
#include "config.h"

namespace DB
{

namespace ErrorCodes
@@ -114,8 +115,6 @@ private:
};

-class EmptyJSONStringSerializer{};

class FunctionSQLJSONHelpers
{
@@ -156,25 +155,11 @@ public:
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument (JSONPath) must be constant string");
        }

-       const ColumnPtr & arg_jsonpath = json_path_column.column;
-       const auto * arg_jsonpath_const = typeid_cast<const ColumnConst *>(arg_jsonpath.get());
-       const auto * arg_jsonpath_string = typeid_cast<const ColumnString *>(arg_jsonpath_const->getDataColumnPtr().get());
-
-       const ColumnPtr & arg_json = json_column.column;
-       const auto * col_json_const = typeid_cast<const ColumnConst *>(arg_json.get());
-       const auto * col_json_string
-           = typeid_cast<const ColumnString *>(col_json_const ? col_json_const->getDataColumnPtr().get() : arg_json.get());
-
-       /// Get data and offsets for 1 argument (JSONPath)
-       const ColumnString::Chars & chars_path = arg_jsonpath_string->getChars();
-       const ColumnString::Offsets & offsets_path = arg_jsonpath_string->getOffsets();
-
        /// Prepare to parse 1 argument (JSONPath)
-       const char * query_begin = reinterpret_cast<const char *>(&chars_path[0]);
-       const char * query_end = query_begin + offsets_path[0] - 1;
+       String query = typeid_cast<const ColumnConst &>(*json_path_column.column).getValue<String>();

-       /// Tokenize query
-       Tokens tokens(query_begin, query_end);
+       /// Tokenize the query
+       Tokens tokens(query.data(), query.data() + query.size());
        /// Max depth 0 indicates that depth is not limited
        IParser::Pos token_iterator(tokens, parse_depth);
@@ -188,10 +173,6 @@ public:
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unable to parse JSONPath");
        }

-       /// Get data and offsets for 2 argument (JSON)
-       const ColumnString::Chars & chars_json = col_json_string->getChars();
-       const ColumnString::Offsets & offsets_json = col_json_string->getOffsets();

        JSONParser json_parser;
        using Element = typename JSONParser::Element;
        Element document;
@@ -200,10 +181,9 @@ public:
        /// Parse JSON for every row
        Impl impl;
        GeneratorJSONPath<JSONParser> generator_json_path(res);
-       for (const auto i : collections::range(0, input_rows_count))
+       for (size_t i = 0; i < input_rows_count; ++i)
        {
-           std::string_view json{
-               reinterpret_cast<const char *>(&chars_json[offsets_json[i - 1]]), offsets_json[i] - offsets_json[i - 1] - 1};
+           std::string_view json = json_column.column->getDataAt(i).toView();
            document_ok = json_parser.parse(json, document);

            bool added_to_column = false;
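The deleted code recomputed each row's bounds by hand from the shared char buffer and the offsets array (including an offsets[i - 1] read that for row 0 relies on the padded array making index -1 read as 0), while the new code lets the column do it via getDataAt. A toy model of such a string column, as a sketch of what that accessor hides:

#include <cstddef>
#include <iostream>
#include <string>
#include <string_view>
#include <vector>

/// Toy ClickHouse-style string column: all values in one char buffer,
/// zero-terminated, with offsets[i] pointing one past row i's terminator.
struct ToyStringColumn
{
    std::vector<char> chars;
    std::vector<size_t> offsets;

    void insert(std::string_view s)
    {
        chars.insert(chars.end(), s.begin(), s.end());
        chars.push_back('\0');
        offsets.push_back(chars.size());
    }

    std::string_view getDataAt(size_t i) const
    {
        size_t begin = i == 0 ? 0 : offsets[i - 1];
        return {chars.data() + begin, offsets[i] - begin - 1};  /// drop the terminator
    }
};

int main()
{
    ToyStringColumn col;
    col.insert(R"({"a": 1})");
    col.insert(R"({"b": [2, 3]})");
    for (size_t i = 0; i < col.offsets.size(); ++i)
        std::cout << col.getDataAt(i) << '\n';
}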

View File

@@ -51,8 +51,8 @@ struct IsOperation
    static constexpr bool minus = IsSameOperation<Op, MinusImpl>::value;
    static constexpr bool multiply = IsSameOperation<Op, MultiplyImpl>::value;
    static constexpr bool div_floating = IsSameOperation<Op, DivideFloatingImpl>::value;
-   static constexpr bool div_int = IsSameOperation<Op, DivideIntegralImpl>::value;
-   static constexpr bool div_int_or_zero = IsSameOperation<Op, DivideIntegralOrZeroImpl>::value;
+   static constexpr bool int_div = IsSameOperation<Op, DivideIntegralImpl>::value;
+   static constexpr bool int_div_or_zero = IsSameOperation<Op, DivideIntegralOrZeroImpl>::value;
    static constexpr bool modulo = IsSameOperation<Op, ModuloImpl>::value;
    static constexpr bool positive_modulo = IsSameOperation<Op, PositiveModuloImpl>::value;
    static constexpr bool least = IsSameOperation<Op, LeastBaseImpl>::value;
@@ -60,7 +60,7 @@ struct IsOperation
    static constexpr bool bit_hamming_distance = IsSameOperation<Op, BitHammingDistanceImpl>::value;

-   static constexpr bool division = div_floating || div_int || div_int_or_zero || modulo;
+   static constexpr bool division = div_floating || int_div || int_div_or_zero || modulo;
    // NOTE: allow_decimal should not fully contain `division` because of divInt
    static constexpr bool allow_decimal = plus || minus || multiply || division || least || greatest;
};

View File

@@ -1,8 +1,8 @@
-#include <algorithm>
#include <Columns/ColumnConst.h>
+#include <Functions/array/arrayEnumerateRanked.h>
#include <Common/assert_cast.h>
-#include "arrayEnumerateRanked.h"
+#include <algorithm>

namespace DB
{
@@ -12,88 +12,105 @@ namespace ErrorCodes
    extern const int BAD_ARGUMENTS;
}

-ArraysDepths getArraysDepths(const ColumnsWithTypeAndName & arguments)
+ArraysDepths getArraysDepths(const ColumnsWithTypeAndName & arguments, const char * function_name)
{
    const size_t num_arguments = arguments.size();
+   if (!num_arguments)
+       throw Exception(ErrorCodes::BAD_ARGUMENTS, "Missing arguments for function arrayEnumerateUniqRanked");

    DepthType clear_depth = 1;
-   DepthTypes depths;
-
-   /// function signature is the following:
-   /// f(c0, arr1, c1, arr2, c2, ...)
-   ///
-   /// c0 is something called "clear_depth" here.
+   size_t i = 0;
+   if (const DataTypeArray * type_array = typeid_cast<const DataTypeArray *>(arguments[0].type.get()); !type_array)
+   {
+       /// If the first argument is not an array, it must be a const positive and non zero number
+       const auto & depth_column = arguments[i].column;
+       if (!depth_column || !isColumnConst(*depth_column))
+           throw Exception(ErrorCodes::BAD_ARGUMENTS, "First argument of {} must be Const(UInt64)", function_name);
+       Field f = assert_cast<const ColumnConst &>(*depth_column).getField();
+       if (f.getType() != Field::Types::UInt64 || f.safeGet<UInt64>() == 0)
+           throw Exception(ErrorCodes::BAD_ARGUMENTS, "First argument of {} must be a positive integer", function_name);
+       clear_depth = static_cast<DepthType>(f.safeGet<UInt64>());
+       i++;
+   }
+
+   /// The rest of the arguments must be in the shape: arr1, c1, arr2, c2, ...
    /// cN... - how deep to look into the corresponding arrN, (called "depths" here)
-   /// may be omitted - then it means "look at the full depth".
-
-   size_t array_num = 0;
-   DepthType prev_array_depth = 0;
-   for (size_t i = 0; i < num_arguments; ++i)
+   /// may be omitted - then it means "look at the full depth"
+   DepthTypes depths;
+   for (; i < num_arguments; i++)
    {
        const DataTypePtr & type = arguments[i].type;
-       const DataTypeArray * type_array = typeid_cast<const DataTypeArray *>(type.get());
+       const DataTypeArray * current_type_array = typeid_cast<const DataTypeArray *>(type.get());
+       if (!current_type_array)
+           throw Exception(
+               ErrorCodes::BAD_ARGUMENTS,
+               "Incorrect argument {} type of function {}. Expected an Array, got {}",
+               i + 1,
+               function_name,
+               type->getName());

-       if (type_array)
+       if (i == num_arguments - 1)
        {
-           if (depths.size() < array_num && prev_array_depth)
-               depths.emplace_back(prev_array_depth);
-
-           prev_array_depth = static_cast<DepthType>(type_array->getNumberOfDimensions());
-           ++array_num;
+           depths.emplace_back(current_type_array->getNumberOfDimensions());
        }
        else
        {
-           const auto & depth_column = arguments[i].column;
-
-           if (depth_column && isColumnConst(*depth_column))
+           const DataTypeArray * next_argument_array = typeid_cast<const DataTypeArray *>(arguments[i + 1].type.get());
+           if (next_argument_array)
            {
-               UInt64 value = assert_cast<const ColumnConst &>(*depth_column).getValue<UInt64>();
-               if (!value)
-                   throw Exception(ErrorCodes::BAD_ARGUMENTS,
-                       "Incorrect arguments for function arrayEnumerateUniqRanked "
-                       "or arrayEnumerateDenseRanked: depth ({}) cannot be less or equal 0.",
-                       std::to_string(value));
-
-               if (i == 0)
-               {
-                   clear_depth = static_cast<DepthType>(value);
-               }
-               else
-               {
-                   if (depths.size() >= array_num)
-                       throw Exception(ErrorCodes::BAD_ARGUMENTS,
-                           "Incorrect arguments for function arrayEnumerateUniqRanked "
-                           "or arrayEnumerateDenseRanked: depth ({}) for missing array.",
-                           std::to_string(value));
-                   if (value > prev_array_depth)
-                       throw Exception(ErrorCodes::BAD_ARGUMENTS,
-                           "Arguments for function arrayEnumerateUniqRanked/arrayEnumerateDenseRanked incorrect: depth={}"
-                           " for array with depth={}.",
-                           std::to_string(value), std::to_string(prev_array_depth));
-
-                   depths.emplace_back(value);
-               }
+               depths.emplace_back(current_type_array->getNumberOfDimensions());
+           }
+           else
+           {
+               i++;
+               /// The following argument is not array, so it must be a const positive integer with the depth
+               const auto & depth_column = arguments[i].column;
+               if (!depth_column || !isColumnConst(*depth_column))
+                   throw Exception(
+                       ErrorCodes::BAD_ARGUMENTS,
+                       "Incorrect argument {} type of function {}. Expected an Array or Const(UInt64), got {}",
+                       i + 1,
+                       function_name,
+                       arguments[i].type->getName());
+               Field f = assert_cast<const ColumnConst &>(*depth_column).getField();
+               if (f.getType() != Field::Types::UInt64 || f.safeGet<UInt64>() == 0)
+                   throw Exception(
+                       ErrorCodes::BAD_ARGUMENTS,
+                       "Incorrect argument {} of function {}. Expected a positive integer",
+                       i + 1,
+                       function_name);
+               UInt64 value = f.safeGet<UInt64>();
+               UInt64 prev_array_depth = current_type_array->getNumberOfDimensions();
+               if (value > prev_array_depth)
+                   throw Exception(
+                       ErrorCodes::BAD_ARGUMENTS,
+                       "Incorrect argument {} of function {}. Required depth '{}' is larger than the array depth ({})",
+                       i + 1,
+                       function_name,
+                       value,
+                       prev_array_depth);
+               depths.emplace_back(value);
           }
        }
    }

-   if (depths.size() < array_num)
-       depths.emplace_back(prev_array_depth);
-
    if (depths.empty())
-       throw Exception(ErrorCodes::BAD_ARGUMENTS,
-           "Incorrect arguments for function arrayEnumerateUniqRanked or arrayEnumerateDenseRanked: "
-           "at least one array should be passed.");
+       throw Exception(
+           ErrorCodes::BAD_ARGUMENTS, "Incorrect arguments for function {}: At least one array should be passed", function_name);

    DepthType max_array_depth = 0;
    for (auto depth : depths)
        max_array_depth = std::max(depth, max_array_depth);

    if (clear_depth > max_array_depth)
-       throw Exception(ErrorCodes::BAD_ARGUMENTS,
-           "Incorrect arguments for function arrayEnumerateUniqRanked or arrayEnumerateDenseRanked: "
-           "clear_depth ({}) can't be larger than max_array_depth ({}).",
-           std::to_string(clear_depth), std::to_string(max_array_depth));
+       throw Exception(
+           ErrorCodes::BAD_ARGUMENTS,
+           "Incorrect arguments for function {}: clear_depth ({}) can't be larger than max_array_depth ({})",
+           function_name,
+           clear_depth,
+           max_array_depth);

    return {clear_depth, depths, max_array_depth};
}
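The rewritten loop accepts an optional leading clear_depth followed by arr [, depth] pairs, where a missing depth defaults to the array's full dimensionality. A runnable sketch of that grammar under a simplified argument model (an argument is either an array of known dimensionality or a constant; all names here are illustrative):

#include <cstdint>
#include <optional>
#include <stdexcept>
#include <vector>

struct Arg
{
    std::optional<uint32_t> array_dims;  /// set if the argument is an array
    std::optional<uint64_t> const_depth; /// set if the argument is a const integer
};

struct Depths { uint32_t clear_depth; std::vector<uint32_t> depths; };

Depths parseDepths(const std::vector<Arg> & args)
{
    if (args.empty())
        throw std::invalid_argument("missing arguments");

    uint32_t clear_depth = 1;
    size_t i = 0;
    if (!args[0].array_dims)  /// a leading non-array must be the clear_depth
    {
        if (!args[0].const_depth || *args[0].const_depth == 0)
            throw std::invalid_argument("first argument must be a positive integer");
        clear_depth = static_cast<uint32_t>(*args[0].const_depth);
        ++i;
    }

    std::vector<uint32_t> depths;
    for (; i < args.size(); ++i)
    {
        if (!args[i].array_dims)
            throw std::invalid_argument("expected an array");
        uint32_t dims = *args[i].array_dims;
        if (i + 1 < args.size() && !args[i + 1].array_dims)  /// explicit depth follows
        {
            ++i;
            uint64_t d = args[i].const_depth.value_or(0);
            if (d == 0 || d > dims)
                throw std::invalid_argument("depth must be in [1, array depth]");
            depths.push_back(static_cast<uint32_t>(d));
        }
        else
            depths.push_back(dims);  /// omitted depth: look at the full depth
    }
    if (depths.empty())
        throw std::invalid_argument("at least one array should be passed");
    return {clear_depth, depths};
}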

View File

@@ -84,7 +84,7 @@ struct ArraysDepths
};

/// Return depth info about passed arrays
-ArraysDepths getArraysDepths(const ColumnsWithTypeAndName & arguments);
+ArraysDepths getArraysDepths(const ColumnsWithTypeAndName & arguments, const char * function_name);

template <typename Derived>
class FunctionArrayEnumerateRankedExtended : public IFunction
@@ -105,7 +105,7 @@ public:
                "Number of arguments for function {} doesn't match: passed {}, should be at least 1.",
                getName(), arguments.size());

-       const ArraysDepths arrays_depths = getArraysDepths(arguments);
+       const ArraysDepths arrays_depths = getArraysDepths(arguments, Derived::name);

        /// Return type is the array of the depth as the maximum effective depth of arguments, containing UInt32.
@@ -154,7 +154,7 @@ ColumnPtr FunctionArrayEnumerateRankedExtended<Derived>::executeImpl(
    Columns array_holders;
    ColumnPtr offsets_column;

-   const ArraysDepths arrays_depths = getArraysDepths(arguments);
+   const ArraysDepths arrays_depths = getArraysDepths(arguments, Derived::name);

    /// If the column is Array - return it. If the const Array - materialize it, keep ownership and return.
    auto get_array_column = [&](const auto & column) -> const DB::ColumnArray *
@@ -213,17 +213,23 @@ ColumnPtr FunctionArrayEnumerateRankedExtended<Derived>::executeImpl(
                {
                    if (*offsets_by_depth[col_depth] != array->getOffsets())
                    {
-                       throw Exception(ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH,
-                           "Lengths and effective depths of all arrays passed to {} must be equal.", getName());
+                       throw Exception(
+                           ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH,
+                           "Lengths and effective depths of all arrays passed to {} must be equal",
+                           getName());
                    }
                }
            }

            if (col_depth < arrays_depths.depths[array_num])
            {
-               throw Exception(ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH,
-                   "{}: Passed array number {} depth ({}) is more than the actual array depth ({}).",
-                   getName(), array_num, std::to_string(arrays_depths.depths[array_num]), col_depth);
+               throw Exception(
+                   ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH,
+                   "{}: Passed array number {} depth ({}) is more than the actual array depth ({})",
+                   getName(),
+                   array_num,
+                   std::to_string(arrays_depths.depths[array_num]),
+                   col_depth);
            }

            auto * array_data = &array->getData();

View File

@@ -18,7 +18,6 @@ namespace ErrorCodes
{
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
-   extern const int UNKNOWN_FORMAT;
    extern const int BAD_ARGUMENTS;
}
@@ -40,8 +39,7 @@ public:
        , arguments_column_names(std::move(arguments_column_names_))
        , context(std::move(context_))
    {
-       if (!FormatFactory::instance().getAllFormats().contains(format_name))
-           throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown format {}", format_name);
+       FormatFactory::instance().checkFormatName(format_name);
    }

    String getName() const override { return name; }

View File

@@ -54,6 +54,9 @@ public:
    struct Result
    {
+       /// The read data is at [buf + offset, buf + size), where `buf` is from Request struct.
+       /// (Notice that `offset` is included in `size`.)
+
        /// size
        /// Less than requested amount of data can be returned.
        /// If size is zero - the file has ended.
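With the documented convention that `offset` is counted inside `size`, the usable byte count is size - offset, starting at buf + offset. A tiny sketch of consuming such a result (simplified struct, not the real interface):

#include <cstddef>

struct SketchResult
{
    size_t size = 0;    /// end of valid data, relative to buf
    size_t offset = 0;  /// start of valid data, relative to buf
};

/// How many bytes the caller may actually read:
inline size_t usableBytes(const SketchResult & r)
{
    return r.size - r.offset;  /// e.g. size = 100, offset = 10 -> 90 bytes at buf + 10
}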

View File

@@ -60,6 +60,9 @@ public:
    BufferBase(Position ptr, size_t size, size_t offset)
        : pos(ptr + offset), working_buffer(ptr, ptr + size), internal_buffer(ptr, ptr + size) {}

+   /// Assign the buffers and pos.
+   /// Be careful when calling this from ReadBuffer::nextImpl() implementations: `offset` is
+   /// effectively ignored because ReadBuffer::next() reassigns `pos`.
    void set(Position ptr, size_t size, size_t offset)
    {
        internal_buffer = Buffer(ptr, ptr + size);
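A minimal sketch of the caveat in the new comment, with simplified names: next() calls nextImpl() and then rewinds pos itself, so an `offset` passed to set() from inside nextImpl() is lost; implementations that need to skip a prefix use a dedicated member instead (ClickHouse calls it nextimpl_working_buffer_offset).

#include <cstddef>

struct SketchReadBuffer
{
    char * begin = nullptr;
    char * end = nullptr;
    char * pos = nullptr;
    size_t nextimpl_working_buffer_offset = 0;

    void set(char * ptr, size_t size, size_t offset)
    {
        begin = ptr;
        end = ptr + size;
        pos = ptr + offset;  /// effective only when called outside of nextImpl()
    }

    bool nextImpl() { return false; }  /// concrete buffers refill [begin, end) here

    bool next()
    {
        if (!nextImpl())
            return false;
        pos = begin + nextimpl_working_buffer_offset;  /// overrides whatever nextImpl() set
        nextimpl_working_buffer_offset = 0;
        return true;
    }
};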

View File

@@ -0,0 +1,188 @@
#include "CachedInMemoryReadBufferFromFile.h"
#include <IO/SwapHelper.h>
#include <base/scope_guard.h>
#include <Common/logger_useful.h>
namespace DB
{
namespace ErrorCodes
{
extern const int UNEXPECTED_END_OF_FILE;
extern const int CANNOT_SEEK_THROUGH_FILE;
extern const int SEEK_POSITION_OUT_OF_BOUND;
}
CachedInMemoryReadBufferFromFile::CachedInMemoryReadBufferFromFile(
FileChunkAddress cache_key_, PageCachePtr cache_, std::unique_ptr<ReadBufferFromFileBase> in_, const ReadSettings & settings_)
: ReadBufferFromFileBase(0, nullptr, 0, in_->getFileSize()), cache_key(cache_key_), cache(cache_), settings(settings_), in(std::move(in_))
, read_until_position(file_size.value())
{
cache_key.offset = 0;
}
String CachedInMemoryReadBufferFromFile::getFileName() const
{
return in->getFileName();
}
off_t CachedInMemoryReadBufferFromFile::seek(off_t off, int whence)
{
if (whence != SEEK_SET)
throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Only SEEK_SET mode is allowed.");
size_t offset = static_cast<size_t>(off);
if (offset > file_size.value())
throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bounds. Offset: {}", off);
if (offset >= file_offset_of_buffer_end - working_buffer.size() && offset <= file_offset_of_buffer_end)
{
pos = working_buffer.end() - (file_offset_of_buffer_end - offset);
chassert(getPosition() == off);
return off;
}
resetWorkingBuffer();
file_offset_of_buffer_end = offset;
chunk.reset();
chassert(getPosition() == off);
return off;
}
off_t CachedInMemoryReadBufferFromFile::getPosition()
{
return file_offset_of_buffer_end - available();
}
size_t CachedInMemoryReadBufferFromFile::getFileOffsetOfBufferEnd() const
{
return file_offset_of_buffer_end;
}
void CachedInMemoryReadBufferFromFile::setReadUntilPosition(size_t position)
{
read_until_position = position;
if (position < static_cast<size_t>(getPosition()))
{
resetWorkingBuffer();
chunk.reset();
}
else if (position < file_offset_of_buffer_end)
{
size_t diff = file_offset_of_buffer_end - position;
working_buffer.resize(working_buffer.size() - diff);
file_offset_of_buffer_end -= diff;
}
}
void CachedInMemoryReadBufferFromFile::setReadUntilEnd()
{
setReadUntilPosition(file_size.value());
}
bool CachedInMemoryReadBufferFromFile::nextImpl()
{
chassert(read_until_position <= file_size.value());
if (file_offset_of_buffer_end >= read_until_position)
return false;
if (chunk.has_value() && file_offset_of_buffer_end >= cache_key.offset + cache->chunkSize())
{
chassert(file_offset_of_buffer_end == cache_key.offset + cache->chunkSize());
chunk.reset();
}
if (!chunk.has_value())
{
cache_key.offset = file_offset_of_buffer_end / cache->chunkSize() * cache->chunkSize();
chunk = cache->getOrSet(cache_key.hash(), settings.read_from_page_cache_if_exists_otherwise_bypass_cache, settings.page_cache_inject_eviction);
size_t chunk_size = std::min(cache->chunkSize(), file_size.value() - cache_key.offset);
std::unique_lock download_lock(chunk->getChunk()->state.download_mutex);
if (!chunk->isPrefixPopulated(chunk_size))
{
/// A few things could be improved here, which may or may not be worth the added complexity:
/// * If the next file chunk is in cache, use in->setReadUntilPosition() to limit the read to
/// just one chunk. More generally, look ahead in the cache to count how many next chunks
/// need to be downloaded. (Up to some limit? And avoid changing `in`'s until-position if
/// it's already reasonable; otherwise we'd increase it by one chunk every chunk, discarding
/// a half-completed HTTP request every time.)
/// * If only a subset of pages are missing from this chunk, download only them,
/// with some threshold for avoiding short seeks.
/// In particular, if a previous download failed in the middle of the chunk, we could
/// resume from that position instead of from the beginning of the chunk.
/// (It's also possible in principle that a proper subset of chunk's pages was reclaimed
/// by the OS. But, for performance purposes, we should completely ignore that, because
/// (a) PageCache normally uses 2 MiB transparent huge pages and has just one such page
/// per chunk, and (b) even with 4 KiB pages partial chunk eviction is extremely rare.)
/// * If our [position, read_until_position) covers only part of the chunk, we could download
/// just that part. (Which would be bad if someone else needs the rest of the chunk and has
/// to do a whole new HTTP request to get it. Unclear what the policy should be.)
/// * Instead of doing in->next() in a loop until we get the whole chunk, we could return the
/// results as soon as in->next() produces them.
/// (But this would make the download_mutex situation much more complex, similar to the
/// FileSegment::State::PARTIALLY_DOWNLOADED and FileSegment::setRemoteFileReader() stuff.)
Buffer prev_in_buffer = in->internalBuffer();
SCOPE_EXIT({ in->set(prev_in_buffer.begin(), prev_in_buffer.size()); });
size_t pos = 0;
while (pos < chunk_size)
{
char * piece_start = chunk->getChunk()->data + pos;
size_t piece_size = chunk_size - pos;
in->set(piece_start, piece_size);
LOG_INFO(&Poco::Logger::get("asdqwe"), "this {:x}, in {:x}, path {}, size {}, offset {:x}, pos {:x}", reinterpret_cast<uint64_t>(this), reinterpret_cast<uint64_t>(in.get()), cache_key.path, file_size.value(), cache_key.offset, pos);
if (pos == 0)
in->seek(cache_key.offset, SEEK_SET);
else
chassert(!in->available());
if (in->eof())
throw Exception(ErrorCodes::UNEXPECTED_END_OF_FILE, "File {} ended after {} bytes, but we expected {}",
getFileName(), cache_key.offset + pos, file_size.value());
chassert(in->position() >= piece_start && in->buffer().end() <= piece_start + piece_size);
chassert(in->getPosition() == static_cast<off_t>(cache_key.offset + pos));
size_t n = in->available();
chassert(n);
if (in->position() != piece_start)
memmove(piece_start, in->position(), n);
in->position() += n;
pos += n;
LOG_INFO(&Poco::Logger::get("asdqwe"), "this {:x}, got {:x} bytes", reinterpret_cast<uint64_t>(this), n);
}
chunk->markPrefixPopulated(chunk_size);
}
}
nextimpl_working_buffer_offset = file_offset_of_buffer_end - cache_key.offset;
working_buffer = Buffer(
chunk->getChunk()->data,
chunk->getChunk()->data + std::min(chunk->getChunk()->size, read_until_position - cache_key.offset));
pos = working_buffer.begin() + nextimpl_working_buffer_offset;
if (!internal_buffer.empty())
{
/// We were given an external buffer to read into. Copy the data into it.
/// Would be nice to avoid this copy, somehow, maybe by making ReadBufferFromRemoteFSGather
/// and AsynchronousBoundedReadBuffer explicitly aware of the page cache.
size_t n = std::min(available(), internal_buffer.size());
memcpy(internal_buffer.begin(), pos, n);
working_buffer = Buffer(internal_buffer.begin(), internal_buffer.begin() + n);
pos = working_buffer.begin();
nextimpl_working_buffer_offset = 0;
}
file_offset_of_buffer_end += available();
return true;
}
}
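The chunk bookkeeping in nextImpl() boils down to aligning the current file offset down to a chunk boundary and clipping the last chunk to the file size. A small self-contained illustration (generic sizes; the real chunk size comes from the PageCache configuration):

#include <algorithm>
#include <cassert>
#include <cstddef>

struct ChunkSpan { size_t offset; size_t size; };

ChunkSpan chunkFor(size_t file_offset, size_t file_size, size_t chunk_size)
{
    size_t chunk_offset = file_offset / chunk_size * chunk_size;  /// align down
    size_t size = std::min(chunk_size, file_size - chunk_offset); /// clip the tail chunk
    return {chunk_offset, size};
}

int main()
{
    /// 4 MiB chunks over a 10 MiB file: offset 5 MiB lands in chunk [4 MiB, 8 MiB).
    constexpr size_t MiB = 1 << 20;
    ChunkSpan c = chunkFor(5 * MiB, 10 * MiB, 4 * MiB);
    assert(c.offset == 4 * MiB && c.size == 4 * MiB);
    /// The tail chunk is short: offset 9 MiB -> chunk [8 MiB, 10 MiB), 2 MiB long.
    c = chunkFor(9 * MiB, 10 * MiB, 4 * MiB);
    assert(c.offset == 8 * MiB && c.size == 2 * MiB);
}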

View File

@@ -0,0 +1,41 @@
#pragma once
#include <Common/PageCache.h>
#include <IO/ReadBufferFromFileBase.h>
namespace DB
{
class CachedInMemoryReadBufferFromFile : public ReadBufferFromFileBase
{
public:
/// `in_` must support using external buffer. I.e. we assign its internal_buffer before each next()
/// call and expect the read data to be put into that buffer.
/// `in_` should be seekable and should be able to read the whole file from 0 to in_->getFileSize();
/// if you set `in_`'s read-until-position bypassing CachedInMemoryReadBufferFromFile then
/// CachedInMemoryReadBufferFromFile will break.
CachedInMemoryReadBufferFromFile(FileChunkAddress cache_key_, PageCachePtr cache_, std::unique_ptr<ReadBufferFromFileBase> in_, const ReadSettings & settings_);
String getFileName() const override;
off_t seek(off_t off, int whence) override;
off_t getPosition() override;
size_t getFileOffsetOfBufferEnd() const override;
bool supportsRightBoundedReads() const override { return true; }
void setReadUntilPosition(size_t position) override;
void setReadUntilEnd() override;
private:
FileChunkAddress cache_key; // .offset is offset of `chunk` start
PageCachePtr cache;
ReadSettings settings;
std::unique_ptr<ReadBufferFromFileBase> in;
size_t file_offset_of_buffer_end = 0;
size_t read_until_position;
std::optional<PinnedPageChunk> chunk;
bool nextImpl() override;
};
}

View File

@@ -225,11 +225,22 @@ public:
     * - seek() to a position above the until position (even if you setReadUntilPosition() to a
     *   higher value right after the seek!),
     *
-    * Typical implementations discard any current buffers and connections, even if the position is
-    * adjusted only a little.
+    * Implementations are recommended to:
+    *  - Allow the read-until-position to go below current position, e.g.:
+    *      // Read block [300, 400)
+    *      setReadUntilPosition(400);
+    *      seek(300);
+    *      next();
+    *      // Read block [100, 200)
+    *      setReadUntilPosition(200); // oh oh, this is below the current position, but should be allowed
+    *      seek(100); // but now everything's fine again
+    *      next();
+    *      // (Swapping the order of seek and setReadUntilPosition doesn't help: then it breaks if the order of blocks is reversed.)
+    *  - Check if new read-until-position value is equal to the current value and do nothing in this case,
+    *    so that the caller doesn't have to.
     *
-    * Typical usage is to call it right after creating the ReadBuffer, before it started doing any
-    * work.
+    * Typical implementations discard any current buffers and connections when the
+    * read-until-position changes even by a small (nonzero) amount.
     */
    virtual void setReadUntilPosition(size_t /* position */) {}

View File

@@ -61,6 +61,7 @@ enum class RemoteFSReadMethod
};

class MMappedFileCache;
+class PageCache;

struct ReadSettings
{
@@ -102,6 +103,12 @@ struct ReadSettings
    bool avoid_readthrough_cache_outside_query_context = true;
    size_t filesystem_cache_segments_batch_size = 20;

+   bool use_page_cache_for_disks_without_file_cache = false;
+   bool read_from_page_cache_if_exists_otherwise_bypass_cache = false;
+   bool page_cache_inject_eviction = false;
+   std::shared_ptr<PageCache> page_cache;
+
    size_t filesystem_cache_max_download_size = (128UL * 1024 * 1024 * 1024);
    bool skip_download_if_exceeds_query_cache = true;

View File

@@ -52,6 +52,20 @@ Aws::Http::HeaderValueCollection CopyObjectRequest::GetRequestSpecificHeaders()
    return headers;
}

+void CompleteMultipartUploadRequest::SetAdditionalCustomHeaderValue(const Aws::String& headerName, const Aws::String& headerValue)
+{
+    // S3's CompleteMultipartUpload doesn't support metadata headers so we skip adding them
+    if (!headerName.starts_with("x-amz-meta-"))
+        Model::CompleteMultipartUploadRequest::SetAdditionalCustomHeaderValue(headerName, headerValue);
+}
+
+void UploadPartRequest::SetAdditionalCustomHeaderValue(const Aws::String& headerName, const Aws::String& headerValue)
+{
+    // S3's UploadPart doesn't support metadata headers so we skip adding them
+    if (!headerName.starts_with("x-amz-meta-"))
+        Model::UploadPartRequest::SetAdditionalCustomHeaderValue(headerName, headerValue);
+}
+
Aws::String ComposeObjectRequest::SerializePayload() const
{
    if (component_names.empty())
@@ -70,6 +84,7 @@ Aws::String ComposeObjectRequest::SerializePayload() const
    return payload_doc.ConvertToString();
}

void ComposeObjectRequest::AddQueryStringParameters(Aws::Http::URI & /*uri*/) const
{
}

View File

@ -107,10 +107,20 @@ using ListObjectsV2Request = ExtendedRequest<Model::ListObjectsV2Request>;
using ListObjectsRequest = ExtendedRequest<Model::ListObjectsRequest>; using ListObjectsRequest = ExtendedRequest<Model::ListObjectsRequest>;
using GetObjectRequest = ExtendedRequest<Model::GetObjectRequest>; using GetObjectRequest = ExtendedRequest<Model::GetObjectRequest>;
class UploadPartRequest : public ExtendedRequest<Model::UploadPartRequest>
{
public:
void SetAdditionalCustomHeaderValue(const Aws::String& headerName, const Aws::String& headerValue) override;
};
class CompleteMultipartUploadRequest : public ExtendedRequest<Model::CompleteMultipartUploadRequest>
{
public:
void SetAdditionalCustomHeaderValue(const Aws::String& headerName, const Aws::String& headerValue) override;
};
using CreateMultipartUploadRequest = ExtendedRequest<Model::CreateMultipartUploadRequest>; using CreateMultipartUploadRequest = ExtendedRequest<Model::CreateMultipartUploadRequest>;
using CompleteMultipartUploadRequest = ExtendedRequest<Model::CompleteMultipartUploadRequest>;
using AbortMultipartUploadRequest = ExtendedRequest<Model::AbortMultipartUploadRequest>; using AbortMultipartUploadRequest = ExtendedRequest<Model::AbortMultipartUploadRequest>;
using UploadPartRequest = ExtendedRequest<Model::UploadPartRequest>;
using UploadPartCopyRequest = ExtendedRequest<Model::UploadPartCopyRequest>; using UploadPartCopyRequest = ExtendedRequest<Model::UploadPartCopyRequest>;
using PutObjectRequest = ExtendedRequest<Model::PutObjectRequest>; using PutObjectRequest = ExtendedRequest<Model::PutObjectRequest>;
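The two requests now subclass the extended request type so they can intercept the virtual header setter and silently drop headers the operation cannot carry. A self-contained sketch of that override pattern with a toy base class (the real one is the AWS SDK request type):

#include <iostream>
#include <map>
#include <string>

struct ToyRequest
{
    std::map<std::string, std::string> headers;
    virtual ~ToyRequest() = default;

    virtual void SetAdditionalCustomHeaderValue(const std::string & name, const std::string & value)
    {
        headers[name] = value;
    }
};

struct ToyUploadPartRequest : ToyRequest
{
    void SetAdditionalCustomHeaderValue(const std::string & name, const std::string & value) override
    {
        /// UploadPart cannot carry user metadata, so x-amz-meta-* is dropped.
        if (!name.starts_with("x-amz-meta-"))
            ToyRequest::SetAdditionalCustomHeaderValue(name, value);
    }
};

int main()
{
    ToyUploadPartRequest req;
    req.SetAdditionalCustomHeaderValue("x-amz-meta-user", "alice");  /// ignored
    req.SetAdditionalCustomHeaderValue("content-md5", "abc123");     /// kept
    std::cout << req.headers.size() << '\n';  /// prints 1
}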

View File

@@ -17,6 +17,7 @@
#include <Common/getMultipleKeysFromConfig.h>
#include <Common/callOnce.h>
#include <Common/SharedLockGuard.h>
+#include <Common/PageCache.h>
#include <Coordination/KeeperDispatcher.h>
#include <Core/BackgroundSchedulePool.h>
#include <Formats/FormatFactory.h>
@@ -294,6 +295,7 @@ struct ContextSharedPart : boost::noncopyable
    mutable MarkCachePtr index_mark_cache TSA_GUARDED_BY(mutex); /// Cache of marks in compressed files of MergeTree indices.
    mutable MMappedFileCachePtr mmap_cache TSA_GUARDED_BY(mutex); /// Cache of mmapped files to avoid frequent open/map/unmap/close and to reuse from several threads.
    AsynchronousMetrics * asynchronous_metrics TSA_GUARDED_BY(mutex) = nullptr; /// Points to asynchronous metrics
+   mutable PageCachePtr page_cache TSA_GUARDED_BY(mutex); /// Userspace page cache.
    ProcessList process_list; /// Executing queries at the moment.
    SessionTracker session_tracker;
    GlobalOvercommitTracker global_overcommit_tracker;
@@ -1251,7 +1253,7 @@ void Context::setUser(const UUID & user_id_, const std::optional<const std::vect
{
    /// Prepare lists of user's profiles, constraints, settings, roles.
    /// NOTE: AccessControl::read<User>() and other AccessControl's functions may require some IO work,
-   /// so Context::getLock() must be unlocked while we're doing this.
+   /// so Context::getLocalLock() and Context::getGlobalLock() must be unlocked while we're doing this.

    auto & access_control = getAccessControl();
    auto user = access_control.read<User>(user_id_);
@@ -1381,7 +1383,7 @@ void Context::checkAccess(const AccessRightsElements & elements) const { return

std::shared_ptr<const ContextAccess> Context::getAccess() const
{
-   /// A helper function to collect parameters for calculating access rights, called with Context::getLock() acquired.
+   /// A helper function to collect parameters for calculating access rights, called with Context::getLocalSharedLock() acquired.
    auto get_params = [this]()
    {
        /// If setUserID() was never called then this must be the global context with the full access.
@@ -1408,7 +1410,8 @@ std::shared_ptr<const ContextAccess> Context::getAccess() const
    }

    /// Calculate new access rights according to the collected parameters.
-   /// NOTE: AccessControl::getContextAccess() may require some IO work, so Context::getLock() must be unlocked while we're doing this.
+   /// NOTE: AccessControl::getContextAccess() may require some IO work, so Context::getLocalLock()
+   /// and Context::getGlobalLock() must be unlocked while we're doing this.
    auto res = getAccessControl().getContextAccess(*params);

    {
@@ -1813,7 +1816,7 @@ StoragePtr Context::executeTableFunction(const ASTPtr & table_expression, const
    }

    uint64_t use_structure_from_insertion_table_in_table_functions = getSettingsRef().use_structure_from_insertion_table_in_table_functions;
-   if (use_structure_from_insertion_table_in_table_functions && table_function_ptr->needStructureHint() && hasInsertionTable())
+   if (select_query_hint && use_structure_from_insertion_table_in_table_functions && table_function_ptr->needStructureHint() && hasInsertionTable())
    {
        const auto & insert_columns = DatabaseCatalog::instance()
            .getTable(getInsertionTable(), shared_from_this())
@@ -2737,6 +2740,33 @@ void Context::clearUncompressedCache() const
    shared->uncompressed_cache->clear();
}

+void Context::setPageCache(size_t bytes_per_chunk, size_t bytes_per_mmap, size_t bytes_total, bool use_madv_free, bool use_huge_pages)
+{
+    std::lock_guard lock(shared->mutex);
+
+    if (shared->page_cache)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Page cache has been already created.");
+
+    shared->page_cache = std::make_shared<PageCache>(bytes_per_chunk, bytes_per_mmap, bytes_total, use_madv_free, use_huge_pages);
+}
+
+PageCachePtr Context::getPageCache() const
+{
+    SharedLockGuard lock(shared->mutex);
+    return shared->page_cache;
+}
+
+void Context::dropPageCache() const
+{
+    PageCachePtr cache;
+    {
+        SharedLockGuard lock(shared->mutex);
+        cache = shared->page_cache;
+    }
+    if (cache)
+        cache->dropCache();
+}
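dropPageCache() shows a pattern worth noting: copy the shared_ptr inside a short shared-lock section, then call into the cache with the lock released, so a slow dropCache() never blocks other readers of the Context. A stripped-down sketch of the same idea (names are illustrative):

#include <memory>
#include <shared_mutex>

struct Cache { void dropCache() { /* potentially slow */ } };

struct SketchContext
{
    mutable std::shared_mutex mutex;
    std::shared_ptr<Cache> cache;

    void dropCacheOutsideLock() const
    {
        std::shared_ptr<Cache> local;
        {
            std::shared_lock lock(mutex);  /// short critical section: pointer copy only
            local = cache;
        }
        if (local)
            local->dropCache();  /// runs without holding the Context lock
    }
};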
void Context::setMarkCache(const String & cache_policy, size_t max_cache_size_in_bytes, double size_ratio) void Context::setMarkCache(const String & cache_policy, size_t max_cache_size_in_bytes, double size_ratio)
{ {
std::lock_guard lock(shared->mutex); std::lock_guard lock(shared->mutex);
@ -5153,6 +5183,11 @@ ReadSettings Context::getReadSettings() const
res.filesystem_cache_max_download_size = settings.filesystem_cache_max_download_size; res.filesystem_cache_max_download_size = settings.filesystem_cache_max_download_size;
res.skip_download_if_exceeds_query_cache = settings.skip_download_if_exceeds_query_cache; res.skip_download_if_exceeds_query_cache = settings.skip_download_if_exceeds_query_cache;
res.page_cache = getPageCache();
res.use_page_cache_for_disks_without_file_cache = settings.use_page_cache_for_disks_without_file_cache;
res.read_from_page_cache_if_exists_otherwise_bypass_cache = settings.read_from_page_cache_if_exists_otherwise_bypass_cache;
res.page_cache_inject_eviction = settings.page_cache_inject_eviction;
res.remote_read_min_bytes_for_seek = settings.remote_read_min_bytes_for_seek; res.remote_read_min_bytes_for_seek = settings.remote_read_min_bytes_for_seek;
/// Zero read buffer will not make progress. /// Zero read buffer will not make progress.

View File

@@ -79,6 +79,7 @@ class RefreshSet;
class Cluster;
class Compiler;
class MarkCache;
+class PageCache;
class MMappedFileCache;
class UncompressedCache;
class ProcessList;
@@ -969,6 +970,10 @@ public:
    std::shared_ptr<UncompressedCache> getUncompressedCache() const;
    void clearUncompressedCache() const;

+   void setPageCache(size_t bytes_per_chunk, size_t bytes_per_mmap, size_t bytes_total, bool use_madv_free, bool use_huge_pages);
+   std::shared_ptr<PageCache> getPageCache() const;
+   void dropPageCache() const;
+
    void setMarkCache(const String & cache_policy, size_t max_cache_size_in_bytes, double size_ratio);
    void updateMarkCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
    std::shared_ptr<MarkCache> getMarkCache() const;

View File

@@ -1019,6 +1019,7 @@ struct JoinOnKeyColumns
    bool isRowFiltered(size_t i) const { return join_mask_column.isRowFiltered(i); }
};

+template <bool lazy>
class AddedColumns
{
public:
@@ -1034,6 +1035,12 @@ public:
        }
    };
struct LazyOutput
{
PaddedPODArray<UInt64> blocks;
PaddedPODArray<UInt32> row_nums;
};
    AddedColumns(
        const Block & left_block,
        const Block & block_with_columns_to_add,
@@ -1050,6 +1057,13 @@ public:
        if (is_asof_join)
            ++num_columns_to_add;
if constexpr (lazy)
{
has_columns_to_add = num_columns_to_add > 0;
lazy_output.blocks.reserve(rows_to_add);
lazy_output.row_nums.reserve(rows_to_add);
}
        columns.reserve(num_columns_to_add);
        type_name.reserve(num_columns_to_add);
        right_indexes.reserve(num_columns_to_add);
@@ -1089,81 +1103,18 @@ public:

    size_t size() const { return columns.size(); }

+   void buildOutput();
+
    ColumnWithTypeAndName moveColumn(size_t i)
    {
        return ColumnWithTypeAndName(std::move(columns[i]), type_name[i].type, type_name[i].qualified_name);
    }

-   template <bool has_defaults>
-   void appendFromBlock(const Block & block, size_t row_num)
-   {
-       if constexpr (has_defaults)
-           applyLazyDefaults();
-
-#ifndef NDEBUG
-       for (size_t j = 0; j < right_indexes.size(); ++j)
-       {
-           const auto * column_from_block = block.getByPosition(right_indexes[j]).column.get();
-           const auto * dest_column = columns[j].get();
-           if (auto * nullable_col = nullable_column_ptrs[j])
-           {
-               if (!is_join_get)
-                   throw Exception(ErrorCodes::LOGICAL_ERROR,
-                       "Columns {} and {} can have different nullability only in joinGetOrNull",
-                       dest_column->getName(), column_from_block->getName());
-               dest_column = nullable_col->getNestedColumnPtr().get();
-           }
-           /** Using dest_column->structureEquals(*column_from_block) will not work for low cardinality columns,
-             * because dictionaries can be different, while calling insertFrom on them is safe, for example:
-             * ColumnLowCardinality(size = 0, UInt8(size = 0), ColumnUnique(size = 1, String(size = 1)))
-             * and
-             * ColumnLowCardinality(size = 0, UInt16(size = 0), ColumnUnique(size = 1, String(size = 1)))
-             */
-           if (typeid(*dest_column) != typeid(*column_from_block))
-               throw Exception(ErrorCodes::LOGICAL_ERROR, "Columns {} and {} have different types {} and {}",
-                   dest_column->getName(), column_from_block->getName(),
-                   demangle(typeid(*dest_column).name()), demangle(typeid(*column_from_block).name()));
-       }
-#endif
-       if (is_join_get)
-       {
-           size_t right_indexes_size = right_indexes.size();
-           for (size_t j = 0; j < right_indexes_size; ++j)
-           {
-               const auto & column_from_block = block.getByPosition(right_indexes[j]);
-               if (auto * nullable_col = nullable_column_ptrs[j])
-                   nullable_col->insertFromNotNullable(*column_from_block.column, row_num);
-               else
-                   columns[j]->insertFrom(*column_from_block.column, row_num);
-           }
-       }
-       else
-       {
-           size_t right_indexes_size = right_indexes.size();
-           for (size_t j = 0; j < right_indexes_size; ++j)
-           {
-               const auto & column_from_block = block.getByPosition(right_indexes[j]);
-               columns[j]->insertFrom(*column_from_block.column, row_num);
-           }
-       }
-   }
+   void appendFromBlock(const Block & block, size_t row_num, bool has_default);

-   void appendDefaultRow()
-   {
-       ++lazy_defaults_count;
-   }
+   void appendDefaultRow();

-   void applyLazyDefaults()
-   {
-       if (lazy_defaults_count)
-       {
-           for (size_t j = 0, size = right_indexes.size(); j < size; ++j)
-               JoinCommon::addDefaultValues(*columns[j], type_name[j].type, lazy_defaults_count);
-           lazy_defaults_count = 0;
-       }
-   }
+   void applyLazyDefaults();

    const IColumn & leftAsofKey() const { return *left_asof_key; }
@@ -1192,16 +1143,50 @@ public:
    }

private:
-   std::vector<TypeAndName> type_name;
-   MutableColumns columns;
-   std::vector<ColumnNullable *> nullable_column_ptrs;
+   void checkBlock(const Block & block)
+   {
+       for (size_t j = 0; j < right_indexes.size(); ++j)
+       {
+           const auto * column_from_block = block.getByPosition(right_indexes[j]).column.get();
+           const auto * dest_column = columns[j].get();
+           if (auto * nullable_col = nullable_column_ptrs[j])
+           {
+               if (!is_join_get)
+                   throw Exception(ErrorCodes::LOGICAL_ERROR,
+                       "Columns {} and {} can have different nullability only in joinGetOrNull",
+                       dest_column->getName(), column_from_block->getName());
+               dest_column = nullable_col->getNestedColumnPtr().get();
+           }
+           /** Using dest_column->structureEquals(*column_from_block) will not work for low cardinality columns,
+             * because dictionaries can be different, while calling insertFrom on them is safe, for example:
+             * ColumnLowCardinality(size = 0, UInt8(size = 0), ColumnUnique(size = 1, String(size = 1)))
+             * and
+             * ColumnLowCardinality(size = 0, UInt16(size = 0), ColumnUnique(size = 1, String(size = 1)))
+             */
+           if (typeid(*dest_column) != typeid(*column_from_block))
+               throw Exception(ErrorCodes::LOGICAL_ERROR, "Columns {} and {} have different types {} and {}",
+                   dest_column->getName(), column_from_block->getName(),
+                   demangle(typeid(*dest_column).name()), demangle(typeid(*column_from_block).name()));
+       }
+   }
+
+   MutableColumns columns;
+   bool is_join_get;
    std::vector<size_t> right_indexes;
+   std::vector<TypeAndName> type_name;
+   std::vector<ColumnNullable *> nullable_column_ptrs;
    size_t lazy_defaults_count = 0;
+
+   /// for lazy
+   // The default row is represented by an empty RowRef, so that fixed-size blocks can be generated sequentially,
+   // default_count cannot represent the position of the row
+   LazyOutput lazy_output;
+   bool has_columns_to_add;
+
    /// for ASOF
    const IColumn * left_asof_key = nullptr;
-   bool is_join_get;

    void addColumn(const ColumnWithTypeAndName & src_column, const std::string & qualified_name)
    {
@@ -1210,6 +1195,126 @@ private:
        type_name.emplace_back(src_column.type, src_column.name, qualified_name);
    }
};
template<> void AddedColumns<false>::buildOutput()
{
}
template<>
void AddedColumns<true>::buildOutput()
{
for (size_t i = 0; i < this->size(); ++i)
{
auto& col = columns[i];
size_t default_count = 0;
auto apply_default = [&]()
{
if (default_count > 0)
{
JoinCommon::addDefaultValues(*col, type_name[i].type, default_count);
default_count = 0;
}
};
for (size_t j = 0; j < lazy_output.blocks.size(); ++j)
{
if (!lazy_output.blocks[j])
{
++default_count;
continue;
}
apply_default();
const auto & column_from_block = reinterpret_cast<const Block *>(lazy_output.blocks[j])->getByPosition(right_indexes[i]);
/// If it's joinGetOrNull, we need to wrap not-nullable columns in StorageJoin.
if (is_join_get)
{
if (auto * nullable_col = typeid_cast<ColumnNullable *>(col.get());
nullable_col && !column_from_block.column->isNullable())
{
nullable_col->insertFromNotNullable(*column_from_block.column, lazy_output.row_nums[j]);
continue;
}
}
col->insertFrom(*column_from_block.column, lazy_output.row_nums[j]);
}
apply_default();
}
}
template<>
void AddedColumns<false>::applyLazyDefaults()
{
if (lazy_defaults_count)
{
for (size_t j = 0, size = right_indexes.size(); j < size; ++j)
JoinCommon::addDefaultValues(*columns[j], type_name[j].type, lazy_defaults_count);
lazy_defaults_count = 0;
}
}
template<>
void AddedColumns<true>::applyLazyDefaults()
{
}
template <>
void AddedColumns<false>::appendFromBlock(const Block & block, size_t row_num,const bool has_defaults)
{
if (has_defaults)
applyLazyDefaults();
#ifndef NDEBUG
checkBlock(block);
#endif
if (is_join_get)
{
size_t right_indexes_size = right_indexes.size();
for (size_t j = 0; j < right_indexes_size; ++j)
{
const auto & column_from_block = block.getByPosition(right_indexes[j]);
if (auto * nullable_col = nullable_column_ptrs[j])
nullable_col->insertFromNotNullable(*column_from_block.column, row_num);
else
columns[j]->insertFrom(*column_from_block.column, row_num);
}
}
else
{
size_t right_indexes_size = right_indexes.size();
for (size_t j = 0; j < right_indexes_size; ++j)
{
const auto & column_from_block = block.getByPosition(right_indexes[j]);
columns[j]->insertFrom(*column_from_block.column, row_num);
}
}
}
template <>
void AddedColumns<true>::appendFromBlock(const Block & block, size_t row_num, bool)
{
#ifndef NDEBUG
checkBlock(block);
#endif
if (has_columns_to_add)
{
lazy_output.blocks.emplace_back(reinterpret_cast<UInt64>(&block));
lazy_output.row_nums.emplace_back(static_cast<uint32_t>(row_num));
}
}
template<>
void AddedColumns<false>::appendDefaultRow()
{
++lazy_defaults_count;
}
template<>
void AddedColumns<true>::appendDefaultRow()
{
if (has_columns_to_add)
{
lazy_output.blocks.emplace_back(0);
lazy_output.row_nums.emplace_back(0);
}
}
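The lazy specialization defers all column materialization: the probe loop only records (block pointer, row number) pairs, with (0, 0) standing for a default row, and buildOutput() later walks that list once, batching runs of consecutive defaults. A sketch of the same strategy on plain vectors (illustrative types, not the real column interfaces):

#include <cstddef>
#include <vector>

struct SketchLazyOutput
{
    std::vector<const std::vector<int> *> blocks;  /// nullptr marks a default row
    std::vector<size_t> row_nums;

    void appendRow(const std::vector<int> & block, size_t row) { blocks.push_back(&block); row_nums.push_back(row); }
    void appendDefaultRow() { blocks.push_back(nullptr); row_nums.push_back(0); }

    std::vector<int> buildOutput(int default_value) const
    {
        std::vector<int> out;
        size_t default_count = 0;
        for (size_t j = 0; j < blocks.size(); ++j)
        {
            if (!blocks[j])
            {
                ++default_count;  /// batch consecutive defaults
                continue;
            }
            out.insert(out.end(), default_count, default_value);
            default_count = 0;
            out.push_back((*blocks[j])[row_nums[j]]);
        }
        out.insert(out.end(), default_count, default_value);  /// flush a trailing run
        return out;
    }
};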
template <JoinKind KIND, JoinStrictness STRICTNESS>
struct JoinFeatures
@@ -1308,7 +1413,7 @@ public:
    }
};

-template <typename Map, bool add_missing, bool multiple_disjuncts>
+template <typename Map, bool add_missing, bool multiple_disjuncts, typename AddedColumns>
void addFoundRowAll(
    const typename Map::mapped_type & mapped,
    AddedColumns & added,
@@ -1327,7 +1432,7 @@ void addFoundRowAll(
        {
            if (!known_rows.isKnown(std::make_pair(it->block, it->row_num)))
            {
-               added.appendFromBlock<false>(*it->block, it->row_num);
+               added.appendFromBlock(*it->block, it->row_num, false);
                ++current_offset;
                if (!new_known_rows_ptr)
                {
@@ -1351,13 +1456,13 @@ void addFoundRowAll(
    {
        for (auto it = mapped.begin(); it.ok(); ++it)
        {
-           added.appendFromBlock<false>(*it->block, it->row_num);
+           added.appendFromBlock(*it->block, it->row_num, false);
            ++current_offset;
        }
    }
}

-template <bool add_missing, bool need_offset>
+template <bool add_missing, bool need_offset, typename AddedColumns>
void addNotFoundRow(AddedColumns & added [[maybe_unused]], IColumn::Offset & current_offset [[maybe_unused]])
{
    if constexpr (add_missing)
@@ -1377,7 +1482,7 @@ void setUsed(IColumn::Filter & filter [[maybe_unused]], size_t pos [[maybe_unuse
/// Joins right table columns which indexes are present in right_indexes using specified map.
/// Makes filter (1 if row presented in right table) and returns offsets to replicate (for ALL JOINS).
-template <JoinKind KIND, JoinStrictness STRICTNESS, typename KeyGetter, typename Map, bool need_filter, bool multiple_disjuncts>
+template <JoinKind KIND, JoinStrictness STRICTNESS, typename KeyGetter, typename Map, bool need_filter, bool multiple_disjuncts, typename AddedColumns>
NO_INLINE size_t joinRightColumns(
    std::vector<KeyGetter> && key_getter_vector,
    const std::vector<const Map *> & mapv,
@@ -1440,7 +1545,7 @@ NO_INLINE size_t joinRightColumns(
            else
                used_flags.template setUsed<join_features.need_flags, multiple_disjuncts>(find_result);
-           added_columns.appendFromBlock<join_features.add_missing>(*row_ref.block, row_ref.row_num);
+           added_columns.appendFromBlock(*row_ref.block, row_ref.row_num, join_features.add_missing);
        }
        else
            addNotFoundRow<join_features.add_missing, join_features.need_replication>(added_columns, current_offset);
@@ -1471,7 +1576,7 @@ NO_INLINE size_t joinRightColumns(
                if (used_once)
                {
                    setUsed<need_filter>(added_columns.filter, i);
-                   added_columns.appendFromBlock<join_features.add_missing>(*mapped.block, mapped.row_num);
+                   added_columns.appendFromBlock(*mapped.block, mapped.row_num, join_features.add_missing);
                }
                break;
@@ -1489,7 +1594,7 @@ NO_INLINE size_t joinRightColumns(
            {
                setUsed<need_filter>(added_columns.filter, i);
                used_flags.template setUsed<join_features.need_flags, multiple_disjuncts>(find_result);
-               added_columns.appendFromBlock<join_features.add_missing>(*mapped.block, mapped.row_num);
+               added_columns.appendFromBlock(*mapped.block, mapped.row_num, join_features.add_missing);
                if (join_features.is_any_or_semi_join)
                {
@@ -1516,7 +1621,7 @@ NO_INLINE size_t joinRightColumns(
    return i;
}

-template <JoinKind KIND, JoinStrictness STRICTNESS, typename KeyGetter, typename Map, bool need_filter>
+template <JoinKind KIND, JoinStrictness STRICTNESS, typename KeyGetter, typename Map, bool need_filter, typename AddedColumns>
size_t joinRightColumnsSwitchMultipleDisjuncts(
    std::vector<KeyGetter> && key_getter_vector,
    const std::vector<const Map *> & mapv,
@@ -1528,7 +1633,7 @@ size_t joinRightColumnsSwitchMultipleDisjuncts(
        : joinRightColumns<KIND, STRICTNESS, KeyGetter, Map, need_filter, false>(std::forward<std::vector<KeyGetter>>(key_getter_vector), mapv, added_columns, used_flags);
}

-template <JoinKind KIND, JoinStrictness STRICTNESS, typename KeyGetter, typename Map>
+template <JoinKind KIND, JoinStrictness STRICTNESS, typename KeyGetter, typename Map, typename AddedColumns>
size_t joinRightColumnsSwitchNullability(
    std::vector<KeyGetter> && key_getter_vector,
    const std::vector<const Map *> & mapv,
@@ -1545,7 +1650,7 @@ size_t joinRightColumnsSwitchNullability(
    }
}

-template <JoinKind KIND, JoinStrictness STRICTNESS, typename Maps>
+template <JoinKind KIND, JoinStrictness STRICTNESS, typename Maps, typename AddedColumns>
size_t switchJoinRightColumns(
    const std::vector<const Maps *> & mapv,
    AddedColumns & added_columns,
@@ -1680,14 +1785,9 @@ Block HashJoin::joinBlockImpl(
     * but they will not be used at this stage of joining (and will be in `AdderNonJoined`), and they need to be skipped.
     * For ASOF, the last column is used as the ASOF column
     */
-   AddedColumns added_columns(
-       block,
-       block_with_columns_to_add,
-       savedBlockSample(),
-       *this,
-       std::move(join_on_keys),
-       join_features.is_asof_join,
-       is_join_get);
+   AddedColumns<!join_features.is_any_join> added_columns(
+       block, block_with_columns_to_add, savedBlockSample(), *this, std::move(join_on_keys), join_features.is_asof_join, is_join_get);

    bool has_required_right_keys = (required_right_keys.columns() != 0);
    added_columns.need_filter = join_features.need_filter || has_required_right_keys;
@@ -1702,6 +1802,7 @@ Block HashJoin::joinBlockImpl(
    added_columns.join_on_keys.clear();
    Block remaining_block = sliceBlock(block, num_joined);

+   added_columns.buildOutput();
    for (size_t i = 0; i < added_columns.size(); ++i)
        block.insert(added_columns.moveColumn(i));


@@ -780,13 +780,30 @@ InterpreterSelectQuery::InterpreterSelectQuery(
        result_header = getSampleBlockImpl();
    };

+   /// This is a hack to make sure we reanalyze if GlobalSubqueriesVisitor changed allow_experimental_parallel_reading_from_replicas
+   /// inside the query context (because it doesn't have write access to the main context)
+   UInt64 parallel_replicas_before_analysis
+       = context->hasQueryContext() ? context->getQueryContext()->getSettingsRef().allow_experimental_parallel_reading_from_replicas : 0;
+
    /// Conditionally support AST-based PREWHERE optimization.
    analyze(shouldMoveToPrewhere() && (!settings.query_plan_optimize_prewhere || !settings.query_plan_enable_optimizations));

    bool need_analyze_again = false;
    bool can_analyze_again = false;

    if (context->hasQueryContext())
    {
+       /// As this query can't be executed with parallel replicas, we must reanalyze it
+       if (context->getQueryContext()->getSettingsRef().allow_experimental_parallel_reading_from_replicas
+           != parallel_replicas_before_analysis)
+       {
+           context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0));
+           context->setSetting("max_parallel_replicas", UInt64{0});
+           need_analyze_again = true;
+       }
+
        /// Check number of calls of 'analyze' function.
        /// If it is too big, we will not analyze the query again not to have exponential blowup.
        std::atomic<size_t> & current_query_analyze_count = context->getQueryContext()->kitchen_sink.analyze_counter;
@@ -875,7 +892,7 @@ bool InterpreterSelectQuery::adjustParallelReplicasAfterAnalysis()
    {
        /// The query could use trivial count if it didn't use parallel replicas, so let's disable it and reanalyze
        context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0));
-       context->setSetting("max_parallel_replicas", UInt64{0});
+       context->setSetting("max_parallel_replicas", UInt64{1});
        LOG_DEBUG(log, "Disabling parallel replicas to be able to use a trivial count optimization");
        return true;
    }
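The first hunk above snapshots `allow_experimental_parallel_reading_from_replicas` before `analyze()` and, if analysis itself downgraded the setting inside the query context, analyzes the query again with parallel replicas disabled. A stripped-down sketch of that snapshot/compare/reanalyze pattern, with hypothetical names rather than the real interpreter API:

```cpp
#include <cstdint>
#include <iostream>

/// Hypothetical stand-ins for the query context and for analyze().
struct QueryContext
{
    uint64_t parallel_replicas = 2;
};

void analyze(QueryContext & ctx, bool has_global_subquery)
{
    /// A visitor may discover mid-analysis that parallel replicas can't be used
    /// and downgrade the setting inside the query context.
    if (has_global_subquery)
        ctx.parallel_replicas = 0;
}

int main()
{
    QueryContext ctx;
    const uint64_t before_analysis = ctx.parallel_replicas; /// snapshot
    analyze(ctx, /*has_global_subquery=*/ true);
    if (ctx.parallel_replicas != before_analysis)
    {
        std::cout << "setting changed during analysis; reanalyzing without parallel replicas\n";
        analyze(ctx, true); /// second pass under the downgraded setting
    }
}
```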


@@ -10,6 +10,7 @@
#include <Common/ShellCommand.h>
#include <Common/CurrentMetrics.h>
#include <Common/FailPoint.h>
+#include <Common/PageCache.h>
#include <Interpreters/Cache/FileCacheFactory.h>
#include <Interpreters/Cache/FileCache.h>
#include <Interpreters/Context.h>
@@ -460,6 +461,13 @@ BlockIO InterpreterSystemQuery::execute()
        {
            throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Not implemented");
        }
+       case Type::DROP_PAGE_CACHE:
+       {
+           getContext()->checkAccess(AccessType::SYSTEM_DROP_PAGE_CACHE);
+           getContext()->dropPageCache();
+           break;
+       }
        case Type::DROP_SCHEMA_CACHE:
        {
            getContext()->checkAccess(AccessType::SYSTEM_DROP_SCHEMA_CACHE);
@@ -1201,6 +1209,7 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster()
        case Type::DROP_INDEX_UNCOMPRESSED_CACHE:
        case Type::DROP_FILESYSTEM_CACHE:
        case Type::SYNC_FILESYSTEM_CACHE:
+       case Type::DROP_PAGE_CACHE:
        case Type::DROP_SCHEMA_CACHE:
        case Type::DROP_FORMAT_SCHEMA_CACHE:
        case Type::DROP_S3_CLIENT_CACHE:
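The new `DROP_PAGE_CACHE` value also has to be handled in every other switch over `ASTSystemQuery::Type` (the formatter and the enum itself appear further below in this diff). A toy illustration of why that pattern is hard to get wrong: the switches have no `default:` branch, so a compiler warning such as `-Wswitch` flags any enumerator a switch forgets to handle. Names below are illustrative, not ClickHouse's API:

```cpp
#include <iostream>

/// Illustrative subset of ASTSystemQuery::Type.
enum class Type
{
    DROP_MARK_CACHE,
    DROP_PAGE_CACHE,
};

void execute(Type type)
{
    switch (type) /// no default: a forgotten enumerator triggers -Wswitch
    {
        case Type::DROP_MARK_CACHE:
            std::cout << "dropping mark cache\n";
            break;
        case Type::DROP_PAGE_CACHE:
            std::cout << "dropping userspace page cache\n"; /// Context::dropPageCache() in the real code
            break;
    }
}

int main()
{
    execute(Type::DROP_PAGE_CACHE);
}
```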


@@ -9,6 +9,8 @@
#include <Interpreters/Cache/QueryCache.h>
#include <Interpreters/JIT/CompiledExpressionCache.h>
+#include <Common/PageCache.h>
+
#include <Databases/IDatabase.h>
#include <IO/UncompressedCache.h>
@@ -77,6 +79,16 @@ void ServerAsynchronousMetrics::updateImpl(TimePoint update_time, TimePoint curr
        new_values["MarkCacheFiles"] = { mark_cache->count(), "Total number of mark files cached in the mark cache" };
    }

+   if (auto page_cache = getContext()->getPageCache())
+   {
+       auto rss = page_cache->getResidentSetSize();
+       new_values["PageCacheBytes"] = { rss.page_cache_rss, "Userspace page cache memory usage in bytes" };
+       new_values["PageCachePinnedBytes"] = { page_cache->getPinnedSize(), "Userspace page cache memory that's currently in use and can't be evicted" };
+       if (rss.unreclaimable_rss.has_value())
+           new_values["UnreclaimableRSS"] = { *rss.unreclaimable_rss, "The amount of physical memory used by the server process, in bytes, excluding memory reclaimable by the OS (MADV_FREE)" };
+   }
+
    if (auto uncompressed_cache = getContext()->getUncompressedCache())
    {
        new_values["UncompressedCacheBytes"] = { uncompressed_cache->sizeInBytes(),


@@ -6,7 +6,6 @@
#include <IO/WriteBufferFromFileDescriptor.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/TraceLog.h>
-#include <Poco/Logger.h>
#include <Common/ProfileEvents.h>
#include <Common/setThreadName.h>
#include <Common/logger_useful.h>


@@ -0,0 +1,267 @@
#include <Common/PageCache.h>
#include <gtest/gtest.h>
#include <thread>
#ifdef OS_LINUX
#include <sys/sysinfo.h>
#endif
using namespace DB;
namespace ProfileEvents
{
extern const Event PageCacheChunkMisses;
extern const Event PageCacheChunkShared;
extern const Event PageCacheChunkDataHits;
extern const Event PageCacheChunkDataPartialHits;
extern const Event PageCacheChunkDataMisses;
}
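/// A plain abort()-based check is used here instead of gtest's ASSERT_* macros, presumably
/// because these checks run on worker threads, where a failed fatal assertion cannot reliably
/// stop the test.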
#define CHECK(x) \
do { \
if (!(x)) \
{ \
std::cerr << "check on line " << __LINE__ << " failed: " << #x << std::endl; \
std::abort(); \
} \
} while (false)
size_t estimateRAMSize()
{
#ifdef OS_LINUX
struct sysinfo info;
int r = sysinfo(&info);
CHECK(r == 0);
return static_cast<size_t>(info.totalram * info.mem_unit);
#else
return 128ul << 30;
#endif
}
/// Do random reads and writes in PageCache from multiple threads, check that the data read matches the data written.
TEST(PageCache, DISABLED_Stress)
{
/// There doesn't seem to be a reasonable way to simulate memory pressure or force the eviction of MADV_FREE-d pages.
/// So we actually map more virtual memory than we have RAM and fill it all up a few times.
/// This takes an eternity (a few minutes), but I don't know how else to trigger MADV_FREE eviction.
/// Expect ~1 GB/s, bottlenecked by page faults.
size_t ram_size = estimateRAMSize();
PageCache cache(2 << 20, 1 << 30, ram_size + ram_size / 10, /* use_madv_free */ true, /* use_huge_pages */ true);
CHECK(cache.getResidentSetSize().page_cache_rss);
const size_t num_keys = static_cast<size_t>(cache.maxChunks() * 1.5);
const size_t pages_per_chunk = cache.chunkSize() / cache.pageSize();
const size_t items_per_page = cache.pageSize() / 8;
const size_t passes = 2;
const size_t step = 20;
const size_t num_threads = 20;
const size_t chunks_touched = num_keys * passes * num_threads / step;
std::atomic<size_t> progress {0};
std::atomic<size_t> threads_finished {0};
std::atomic<size_t> total_racing_writes {0};
auto thread_func = [&]
{
pcg64 rng(randomSeed());
std::vector<PinnedPageChunk> pinned;
/// Stats.
size_t racing_writes = 0;
for (size_t i = 0; i < num_keys * passes; i += step)
{
progress += 1;
/// Touch the chunks sequentially + noise (to increase interference across threads), or at random 10% of the time.
size_t key_idx;
if (rng() % 10 == 0)
key_idx = std::uniform_int_distribution<size_t>(0, num_keys - 1)(rng);
else
key_idx = (i + std::uniform_int_distribution<size_t>(0, num_keys / 1000)(rng)) % num_keys;
/// For some keys, always use detached_if_missing = true and check that cache always misses.
bool key_detached_if_missing = key_idx % 100 == 42;
bool detached_if_missing = key_detached_if_missing || i % 101 == 42;
PageCacheKey key = key_idx * 0xcafebabeb0bad00dul; // a simple reversible hash (the constant can be any odd number)
PinnedPageChunk chunk = cache.getOrSet(key, detached_if_missing, /* inject_eviction */ false);
if (key_detached_if_missing)
CHECK(!chunk.getChunk()->pages_populated.any());
for (size_t page_idx = 0; page_idx < pages_per_chunk; ++page_idx)
{
bool populated = chunk.getChunk()->pages_populated.get(page_idx);
/// Generate page contents deterministically from key and page index.
size_t start = key_idx * page_idx;
if (start % 37 == 13)
{
/// Leave ~1/37 of the pages unpopulated.
CHECK(!populated);
}
else
{
/// We may write/read the same memory from multiple threads in parallel here.
std::atomic<size_t> * items = reinterpret_cast<std::atomic<size_t> *>(chunk.getChunk()->data + cache.pageSize() * page_idx);
if (populated)
{
for (size_t j = 0; j < items_per_page; ++j)
CHECK(items[j].load(std::memory_order_relaxed) == start + j);
}
else
{
for (size_t j = 0; j < items_per_page; ++j)
items[j].store(start + j, std::memory_order_relaxed);
if (!chunk.markPagePopulated(page_idx))
racing_writes += 1;
}
}
}
pinned.push_back(std::move(chunk));
CHECK(cache.getPinnedSize() >= cache.chunkSize());
/// Unpin 2 chunks on average.
while (rng() % 3 != 0 && !pinned.empty())
{
size_t idx = rng() % pinned.size();
if (idx != pinned.size() - 1)
pinned[idx] = std::move(pinned.back());
pinned.pop_back();
}
}
total_racing_writes += racing_writes;
threads_finished += 1;
};
std::cout << fmt::format("doing {:.1f} passes over {:.1f} GiB of virtual memory\nthis will take a few minutes, progress printed every 10 seconds",
chunks_touched * 1. / cache.maxChunks(), cache.maxChunks() * cache.chunkSize() * 1. / (1ul << 30)) << std::endl;
auto start_time = std::chrono::steady_clock::now();
std::vector<std::thread> threads;
for (size_t i = 0; i < num_threads; ++i)
threads.emplace_back(thread_func);
for (size_t poll = 0;; ++poll)
{
if (threads_finished == num_threads)
break;
if (poll % 100 == 0)
std::cout << fmt::format("{:.3f}%", progress.load() * 100. / num_keys / passes / num_threads * step) << std::endl;
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
for (std::thread & t : threads)
t.join();
auto end_time = std::chrono::steady_clock::now();
double elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time).count();
double touched_gib = chunks_touched * cache.chunkSize() * 1. / (1ul << 30);
std::cout << fmt::format("touched {:.1f} GiB in {:.1f} seconds, that's {:.3f} GiB/s",
touched_gib, elapsed_seconds, touched_gib / elapsed_seconds) << std::endl;
auto & counters = CurrentThread::getProfileEvents();
std::cout << "stats:"
<< "\nchunk misses: " << counters[ProfileEvents::PageCacheChunkMisses].load()
<< "\nchunk shared: " << counters[ProfileEvents::PageCacheChunkShared].load()
<< "\nchunk data misses: " << counters[ProfileEvents::PageCacheChunkDataMisses].load()
<< "\nchunk data partial hits: " << counters[ProfileEvents::PageCacheChunkDataPartialHits].load()
<< "\nchunk data hits: " << counters[ProfileEvents::PageCacheChunkDataHits].load()
<< "\nracing page writes: " << total_racing_writes << std::endl;
/// Check that we at least hit all the cases.
CHECK(counters[ProfileEvents::PageCacheChunkMisses].load() > 0);
CHECK(counters[ProfileEvents::PageCacheChunkShared].load() > 0);
CHECK(counters[ProfileEvents::PageCacheChunkDataMisses].load() > 0);
/// Partial hits are rare enough that sometimes this is zero, so don't check it.
/// That's good news because we don't need to implement downloading parts of a chunk.
/// CHECK(counters[ProfileEvents::PageCacheChunkDataPartialHits].load() > 0);
CHECK(counters[ProfileEvents::PageCacheChunkDataHits].load() > 0);
CHECK(total_racing_writes > 0);
CHECK(cache.getPinnedSize() == 0);
size_t rss = cache.getResidentSetSize().page_cache_rss;
std::cout << "RSS: " << rss * 1. / (1ul << 30) << " GiB" << std::endl;
/// This can be flaky if the system has < 10% free memory. If this turns out to be a problem, feel free to remove or reduce.
CHECK(rss > ram_size / 10);
cache.dropCache();
#ifdef OS_LINUX
/// MADV_DONTNEED is not synchronous, and we're freeing lots of pages. Let's give Linux a lot of time.
std::this_thread::sleep_for(std::chrono::seconds(10));
size_t new_rss = cache.getResidentSetSize().page_cache_rss;
std::cout << "RSS after dropping cache: " << new_rss * 1. / (1ul << 30) << " GiB" << std::endl;
CHECK(new_rss < rss / 2);
#endif
}
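The key generation above, `key_idx * 0xcafebabeb0bad00dul`, is a reversible hash because multiplication by any odd constant is a bijection on 64-bit integers: odd numbers are invertible modulo 2^64. A small standalone demonstration (not part of the test) that recovers the index by computing the modular inverse with Newton's iteration:

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>

/// Inverse of an odd 64-bit constant modulo 2^64 via Newton's iteration:
/// x_{n+1} = x_n * (2 - k * x_n). Starting from x = k (correct to 3 bits,
/// since k*k == 1 mod 8 for any odd k), each step doubles the correct bits.
uint64_t modularInverse(uint64_t k)
{
    uint64_t x = k;
    for (int i = 0; i < 5; ++i) /// 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits
        x *= 2 - k * x;
    return x;
}

int main()
{
    const uint64_t k = 0xcafebabeb0bad00dull;
    const uint64_t inv = modularInverse(k);
    assert(k * inv == 1); /// inv really is the inverse mod 2^64

    const uint64_t key = 12345 * k; /// the test's "hash"
    assert(key * inv == 12345);     /// and it is losslessly reversible
    std::cout << "recovered index: " << key * inv << '\n';
}
```

Five iterations suffice because the number of correct low bits doubles each step (3, 6, 12, 24, 48, 96), comfortably covering all 64 bits.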
/// Benchmark that measures the PageCache overhead for cache hits. Doesn't touch the actual data, so
/// memory bandwidth mostly doesn't factor into this.
/// This measures the overhead of things like madvise(MADV_FREE) and probing the pages (restoreChunkFromLimbo()).
/// Disabled in CI, run manually with --gtest_also_run_disabled_tests --gtest_filter=PageCache.DISABLED_HitsBench
TEST(PageCache, DISABLED_HitsBench)
{
/// Do a few runs, with and without MADV_FREE.
for (size_t num_threads = 1; num_threads <= 16; num_threads *= 2)
{
for (size_t run = 0; run < 8; ++run)
{
bool use_madv_free = run % 2 == 1;
bool use_huge_pages = run % 4 / 2 == 1;
PageCache cache(2 << 20, 1ul << 30, 20ul << 30, use_madv_free, use_huge_pages);
size_t passes = 3;
std::atomic<size_t> total_misses {0};
/// Prepopulate all chunks.
for (size_t i = 0; i < cache.maxChunks(); ++i)
{
PageCacheKey key = i * 0xcafebabeb0bad00dul;
PinnedPageChunk chunk = cache.getOrSet(key, /* detached_if_missing */ false, /* inject_eviction */ false);
memset(chunk.getChunk()->data, 42, chunk.getChunk()->size);
chunk.markPrefixPopulated(cache.chunkSize());
}
auto thread_func = [&]
{
pcg64 rng(randomSeed());
size_t misses = 0;
for (size_t i = 0; i < cache.maxChunks() * passes; ++i)
{
PageCacheKey key = rng() % cache.maxChunks() * 0xcafebabeb0bad00dul;
PinnedPageChunk chunk = cache.getOrSet(key, /* detached_if_missing */ false, /* inject_eviction */ false);
if (!chunk.isPrefixPopulated(cache.chunkSize()))
misses += 1;
}
total_misses += misses;
};
auto start_time = std::chrono::steady_clock::now();
std::vector<std::thread> threads;
for (size_t i = 0; i < num_threads; ++i)
threads.emplace_back(thread_func);
for (std::thread & t : threads)
t.join();
auto end_time = std::chrono::steady_clock::now();
double elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time).count();
double fetched_gib = cache.chunkSize() * cache.maxChunks() * passes * 1. / (1ul << 30);
std::cout << fmt::format(
"threads {}, run {}, use_madv_free = {}, use_huge_pages = {}\nrequested {:.1f} GiB in {:.1f} seconds\n"
"that's {:.1f} GiB/s, or overhead of {:.3}us/{:.1}MiB\n",
num_threads, run, use_madv_free, use_huge_pages, fetched_gib, elapsed_seconds, fetched_gib / elapsed_seconds,
elapsed_seconds * 1e6 / cache.maxChunks() / passes, cache.chunkSize() * 1. / (1 << 20)) << std::endl;
if (total_misses != 0)
std::cout << "!got " << total_misses.load() << " misses! perhaps your system doesn't have enough free memory, consider decreasing cache size in the benchmark code" << std::endl;
}
}
}


@@ -13,18 +13,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

-Common/ErrorCodes.cpp
-Common/UInt128.h
-Core/Block.h
-Core/Defines.h
-Core/Settings.h
-Databases/DatabasesCommon.cpp
-IO/WriteBufferValidUTF8.cpp
-Interpreters/InterpreterAlterQuery.cpp
Interpreters/InterpreterCreateQuery.cpp
Interpreters/InterpreterFactory.cpp
Parsers/ASTAlterQuery.cpp
-Parsers/ASTAlterQuery.h
Parsers/ASTCreateQuery.cpp
Parsers/ASTCreateQuery.h
Parsers/ParserAlterQuery.cpp


@@ -60,8 +60,6 @@ ASTPtr ASTAlterCommand::clone() const
        res->settings_resets = res->children.emplace_back(settings_resets->clone()).get();
    if (select)
        res->select = res->children.emplace_back(select->clone()).get();
-   if (values)
-       res->values = res->children.emplace_back(values->clone()).get();
    if (rename_to)
        res->rename_to = res->children.emplace_back(rename_to->clone()).get();
@@ -518,7 +516,6 @@ void ASTAlterCommand::forEachPointerToChild(std::function<void(void**)> f)
    f(reinterpret_cast<void **>(&settings_changes));
    f(reinterpret_cast<void **>(&settings_resets));
    f(reinterpret_cast<void **>(&select));
-   f(reinterpret_cast<void **>(&values));
    f(reinterpret_cast<void **>(&rename_to));
}


@@ -166,9 +166,6 @@ public:
    /// For MODIFY_SQL_SECURITY
    IAST * sql_security = nullptr;

-   /// In ALTER CHANNEL, ADD, DROP, SUSPEND, RESUME, REFRESH, MODIFY queries, the list of live views is stored here
-   IAST * values = nullptr;

    /// Target column name
    IAST * rename_to = nullptr;


@@ -348,13 +348,6 @@ void ASTCreateQuery::formatQueryImpl(const FormatSettings & settings, FormatStat
        settings.ostr << (settings.hilite ? hilite_keyword : "") << " FROM " << (settings.hilite ? hilite_none : "")
                      << quoteString(*attach_from_path);

-       if (live_view_periodic_refresh)
-       {
-           settings.ostr << (settings.hilite ? hilite_keyword : "") << " WITH" << (settings.hilite ? hilite_none : "")
-                         << (settings.hilite ? hilite_keyword : "") << " PERIODIC REFRESH " << (settings.hilite ? hilite_none : "")
-                         << *live_view_periodic_refresh;
-       }

        formatOnCluster(settings);
    }
    else


@@ -122,7 +122,6 @@ public:
    ASTDictionary * dictionary = nullptr; /// dictionary definition (layout, primary key, etc.)
    ASTRefreshStrategy * refresh_strategy = nullptr; // For CREATE MATERIALIZED VIEW ... REFRESH ...
-   std::optional<UInt64> live_view_periodic_refresh; /// For CREATE LIVE VIEW ... WITH [PERIODIC] REFRESH ...

    bool is_watermark_strictly_ascending{false}; /// STRICTLY ASCENDING WATERMARK STRATEGY FOR WINDOW VIEW
    bool is_watermark_ascending{false}; /// ASCENDING WATERMARK STRATEGY FOR WINDOW VIEW


@@ -415,6 +415,7 @@ void ASTSystemQuery::formatImpl(const FormatSettings & settings, FormatState & s
        case Type::STOP_THREAD_FUZZER:
        case Type::START_VIEWS:
        case Type::STOP_VIEWS:
+       case Type::DROP_PAGE_CACHE:
            break;
        case Type::UNKNOWN:
        case Type::END:


@@ -31,6 +31,7 @@ public:
        DROP_COMPILED_EXPRESSION_CACHE,
        DROP_FILESYSTEM_CACHE,
        DROP_DISK_METADATA_CACHE,
+       DROP_PAGE_CACHE,
        DROP_SCHEMA_CACHE,
        DROP_FORMAT_SCHEMA_CACHE,
        DROP_S3_CLIENT_CACHE,


@@ -138,7 +138,6 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
    ParserList parser_reset_setting(
        std::make_unique<ParserIdentifier>(), std::make_unique<ParserToken>(TokenType::Comma),
        /* allow_empty = */ false);
-   ParserNameList values_p;
    ParserSelectWithUnionQuery select_p;
    ParserSQLSecurity sql_security_p;
    ParserRefreshStrategy refresh_p;
@@ -163,7 +162,6 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
    ASTPtr command_settings_changes;
    ASTPtr command_settings_resets;
    ASTPtr command_select;
-   ASTPtr command_values;
    ASTPtr command_rename_to;
    ASTPtr command_sql_security;
@@ -944,8 +942,6 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
        command->settings_resets = command->children.emplace_back(std::move(command_settings_resets)).get();
    if (command_select)
        command->select = command->children.emplace_back(std::move(command_select)).get();
-   if (command_values)
-       command->values = command->children.emplace_back(std::move(command_values)).get();
    if (command_sql_security)
        command->sql_security = command->children.emplace_back(std::move(command_sql_security)).get();
    if (command_rename_to)


@@ -917,15 +917,11 @@ bool ParserCreateLiveViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & e
    ASTPtr as_database;
    ASTPtr as_table;
    ASTPtr select;
-   ASTPtr live_view_periodic_refresh;
    ASTPtr sql_security;

    String cluster_str;
    bool attach = false;
    bool if_not_exists = false;
-   bool with_and = false;
-   bool with_timeout = false;
-   bool with_periodic_refresh = false;

    if (!s_create.ignore(pos, expected))
    {
@@ -949,23 +945,6 @@ bool ParserCreateLiveViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & e
    if (!table_name_p.parse(pos, table, expected))
        return false;

-   if (ParserKeyword{"WITH"}.ignore(pos, expected))
-   {
-       if (ParserKeyword{"REFRESH"}.ignore(pos, expected) || ParserKeyword{"PERIODIC REFRESH"}.ignore(pos, expected))
-       {
-           if (!ParserNumber{}.parse(pos, live_view_periodic_refresh, expected))
-               live_view_periodic_refresh = std::make_shared<ASTLiteral>(static_cast<UInt64>(60));
-           with_periodic_refresh = true;
-       }
-       else if (with_and)
-           return false;
-       if (!with_timeout && !with_periodic_refresh)
-           return false;
-   }

    if (ParserKeyword{"ON"}.ignore(pos, expected))
    {
        if (!ASTQueryWithOnCluster::parse(pos, cluster_str, expected))
@@ -1028,9 +1007,6 @@ bool ParserCreateLiveViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & e
    tryGetIdentifierNameInto(as_table, query->as_table);
    query->set(query->select, select);

-   if (live_view_periodic_refresh)
-       query->live_view_periodic_refresh.emplace(live_view_periodic_refresh->as<ASTLiteral &>().value.safeGet<UInt64>());

    if (comment)
        query->set(query->comment, comment);


@@ -391,7 +391,9 @@ void PipelineExecutor::executeImpl(size_t num_threads, bool concurrency_control)
    SCOPE_EXIT_SAFE(
        if (!finished_flag)
        {
-           finish();
+           /// If finished_flag is not set, there was an exception.
+           /// Cancel execution in this case.
+           cancel();
            if (pool)
                pool->wait();
        }
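The hunk above changes the executor's scope guard to cancel() instead of finish() when the function unwinds without `finished_flag` set, i.e. when an exception is in flight. A stripped-down sketch of the guard pattern, with a plain RAII type standing in for ClickHouse's `SCOPE_EXIT_SAFE` macro (which additionally guards against exceptions thrown from the guard body):

```cpp
#include <iostream>
#include <stdexcept>

/// Minimal RAII scope guard; runs its callback on any scope exit.
template <typename F>
struct ScopeExit
{
    F fn;
    ~ScopeExit() { fn(); }
};
template <typename F> ScopeExit(F) -> ScopeExit<F>;

void executeImpl(bool throws)
{
    bool finished_flag = false;
    ScopeExit guard{[&]
    {
        if (!finished_flag) /// only reachable with the flag unset via stack unwinding
            std::cout << "cancel(): exception in flight, stopping all processors\n";
    }};

    if (throws)
        throw std::runtime_error("simulated query failure");

    finished_flag = true; /// normal completion: the guard becomes a no-op
}

int main()
{
    try { executeImpl(true); }
    catch (const std::exception & e) { std::cout << "caught: " << e.what() << '\n'; }
}
```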


@@ -23,7 +23,6 @@ struct PullingAsyncPipelineExecutor::Data
    std::atomic_bool is_finished = false;
    std::atomic_bool has_exception = false;
    ThreadFromGlobalPool thread;
-   Poco::Event finish_event;

    ~Data()
    {
@@ -89,12 +88,10 @@ static void threadFunction(
        data.has_exception = true;

        /// Finish lazy format in case of exception. Otherwise thread.join() may hang.
-       if (data.lazy_format)
-           data.lazy_format->finalize();
+       data.lazy_format->finalize();
    }

    data.is_finished = true;
-   data.finish_event.set();
}
@@ -129,20 +126,8 @@ bool PullingAsyncPipelineExecutor::pull(Chunk & chunk, uint64_t milliseconds)
        return false;
    }

-   if (lazy_format)
-   {
-       chunk = lazy_format->getChunk(milliseconds);
-       data->rethrowExceptionIfHas();
-       return true;
-   }
-
-   chunk.clear();
-
-   if (milliseconds)
-       data->finish_event.tryWait(milliseconds);
-   else
-       data->finish_event.wait();
-
+   chunk = lazy_format->getChunk(milliseconds);
+   data->rethrowExceptionIfHas();
    return true;
}
@@ -230,14 +215,12 @@ void PullingAsyncPipelineExecutor::cancelWithExceptionHandling(CancelFunc && can

Chunk PullingAsyncPipelineExecutor::getTotals()
{
-   return lazy_format ? lazy_format->getTotals()
-                      : Chunk();
+   return lazy_format->getTotals();
}

Chunk PullingAsyncPipelineExecutor::getExtremes()
{
-   return lazy_format ? lazy_format->getExtremes()
-                      : Chunk();
+   return lazy_format->getExtremes();
}

Block PullingAsyncPipelineExecutor::getTotalsBlock()
@@ -264,15 +247,7 @@ Block PullingAsyncPipelineExecutor::getExtremesBlock()

ProfileInfo & PullingAsyncPipelineExecutor::getProfileInfo()
{
-   if (lazy_format)
-       return lazy_format->getProfileInfo();
-
-   static ProfileInfo profile_info;
-   static std::once_flag flag;
-   /// Calculate rows before limit here to avoid race.
-   std::call_once(flag, []() { profile_info.getRowsBeforeLimit(); });
-
-   return profile_info;
+   return lazy_format->getProfileInfo();
}

}


@@ -1,6 +1,5 @@
#pragma once

#include <memory>
-#include <atomic>
#include <vector>

namespace DB
