Mirror of https://github.com/ClickHouse/ClickHouse.git (synced 2024-11-27 01:51:59 +00:00)

Merge branch 'master' into changelog-23.1

This commit is contained in:
commit 2f670a772e
.github/workflows/release.yml (vendored): 34 lines changed

@@ -12,38 +12,10 @@ jobs:
   ReleasePublish:
     runs-on: [self-hosted, style-checker]
     steps:
-      - name: Set envs
+      - name: Deploy packages and assets
         run: |
-          cat >> "$GITHUB_ENV" << 'EOF'
-          JFROG_API_KEY=${{ secrets.JFROG_ARTIFACTORY_API_KEY }}
-          TEMP_PATH=${{runner.temp}}/release_packages
-          REPO_COPY=${{runner.temp}}/release_packages/ClickHouse
-          EOF
-      - name: Check out repository code
-        uses: ClickHouse/checkout@v1
-        with:
-          # Always use the most recent script version
-          ref: master
-      - name: Download packages and push to Artifactory
-        run: |
-          rm -rf "$TEMP_PATH" && mkdir -p "$TEMP_PATH"
-          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
-          cd "$REPO_COPY"
-          # Download and push packages to artifactory
-          python3 ./tests/ci/push_to_artifactory.py --release '${{ github.ref }}' \
-            --commit '${{ github.sha }}' --artifactory-url '${{ secrets.JFROG_ARTIFACTORY_URL }}' --all
-          # Download macos binaries to ${{runner.temp}}/download_binary
-          python3 ./tests/ci/download_binary.py --version '${{ github.ref }}' \
-            --commit '${{ github.sha }}' binary_darwin binary_darwin_aarch64
-          mv '${{runner.temp}}/download_binary/'clickhouse-* '${{runner.temp}}/push_to_artifactory'
-      - name: Upload packages to release assets
-        uses: svenstaro/upload-release-action@v2
-        with:
-          repo_token: ${{ secrets.GITHUB_TOKEN }}
-          file: ${{runner.temp}}/push_to_artifactory/*
-          overwrite: true
-          tag: ${{ github.ref }}
-          file_glob: true
+          GITHUB_TAG="${GITHUB_REF#refs/tags/}"
+          curl '${{ secrets.PACKAGES_RELEASE_URL }}/release/'"${GITHUB_TAG}"'?binary=binary_darwin&binary=binary_darwin_aarch64&sync=true' -d ''
   ############################################################################################
   ##################################### Docker images #######################################
   ############################################################################################
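The replacement step derives the tag from `GITHUB_REF` and issues a single empty-bodied request to the packages-release endpoint, asking it to also sync the two macOS binaries. A minimal Python sketch of the same request shape, with a placeholder standing in for the secret endpoint:

```python
# Sketch of the request made by the new "Deploy packages and assets" step above.
# The endpoint is a placeholder, not a real value (it comes from the
# PACKAGES_RELEASE_URL secret in CI), and the tag ref is an example.
import urllib.parse
import urllib.request

release_url = "https://packages.example.invalid"    # placeholder for the secret
github_ref = "refs/tags/v23.1.1.3077-stable"         # example value of GITHUB_REF
github_tag = github_ref.removeprefix("refs/tags/")   # same as ${GITHUB_REF#refs/tags/}

query = urllib.parse.urlencode(
    [("binary", "binary_darwin"), ("binary", "binary_darwin_aarch64"), ("sync", "true")]
)
request = urllib.request.Request(
    f"{release_url}/release/{github_tag}?{query}",
    data=b"",      # curl ... -d '' sends an empty POST body
    method="POST",
)
# urllib.request.urlopen(request)  # not executed here: the endpoint is a placeholder
```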
SECURITY.md: 16 lines changed

@@ -13,9 +13,10 @@ The following versions of ClickHouse server are currently being supported with s
 
 | Version | Supported |
 |:-|:-|
+| 23.1 | ✔️ |
 | 22.12 | ✔️ |
 | 22.11 | ✔️ |
-| 22.10 | ✔️ |
+| 22.10 | ❌ |
 | 22.9 | ❌ |
 | 22.8 | ✔️ |
 | 22.7 | ❌ |
@@ -25,18 +26,7 @@ The following versions of ClickHouse server are currently being supported with s
 | 22.3 | ✔️ |
 | 22.2 | ❌ |
 | 22.1 | ❌ |
-| 21.12 | ❌ |
-| 21.11 | ❌ |
-| 21.10 | ❌ |
-| 21.9 | ❌ |
-| 21.8 | ❌ |
-| 21.7 | ❌ |
-| 21.6 | ❌ |
-| 21.5 | ❌ |
-| 21.4 | ❌ |
-| 21.3 | ❌ |
-| 21.2 | ❌ |
-| 21.1 | ❌ |
+| 21.* | ❌ |
 | 20.* | ❌ |
 | 19.* | ❌ |
 | 18.* | ❌ |
@@ -2,11 +2,11 @@
 
 # NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION,
 # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes.
-SET(VERSION_REVISION 54470)
-SET(VERSION_MAJOR 22)
-SET(VERSION_MINOR 13)
+SET(VERSION_REVISION 54471)
+SET(VERSION_MAJOR 23)
+SET(VERSION_MINOR 2)
 SET(VERSION_PATCH 1)
-SET(VERSION_GITHASH 688e488e930c83eefeac4f87c4cc029cc5b231e3)
-SET(VERSION_DESCRIBE v22.13.1.1-testing)
-SET(VERSION_STRING 22.13.1.1)
+SET(VERSION_GITHASH dcaac47702510cc87ddf266bc524f6b7ce0a8e6e)
+SET(VERSION_DESCRIBE v23.2.1.1-testing)
+SET(VERSION_STRING 23.2.1.1)
 # end of autochange
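The bumped fields compose in a fixed way, as can be read off the old and new values; a small sketch of that relationship (the real rules live in the version-bump tooling, not here):

```python
# Sketch: how the bumped version fields relate, as observed in the hunk above.
major, minor, patch, tweak = 23, 2, 1, 1
version_string = f"{major}.{minor}.{patch}.{tweak}"
version_describe = f"v{version_string}-testing"

assert version_string == "23.2.1.1"
assert version_describe == "v23.2.1.1-testing"
```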
contrib/NuRaft (vendored): 2 lines changed

@@ -1 +1 @@
-Subproject commit afc36dfa9b0beb45bc4cd935060631cc80ba04a5
+Subproject commit 545b8c810a956b2efdc116e86be219af7e83d68a

contrib/arrow (vendored): 2 lines changed

@@ -1 +1 @@
-Subproject commit 450a5638704386356f8e520080468fc9bc8bcaf8
+Subproject commit d03245f801f798c63ee9a7d2b8914a9e5c5cd666

contrib/poco (vendored): 2 lines changed

@@ -1 +1 @@
-Subproject commit 4b1c8dd9913d2a16db62df0e509fa598da5c8219
+Subproject commit 7fefdf30244a9bf8eb58562a9b2a51cc59a8877a
@@ -33,7 +33,7 @@ RUN arch=${TARGETARCH:-amd64} \
 # lts / testing / prestable / etc
 ARG REPO_CHANNEL="stable"
 ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}"
-ARG VERSION="22.12.3.5"
+ARG VERSION="23.1.1.3077"
 ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
 
 # user/group precreated explicitly with fixed uid/gid on purpose.

@@ -21,7 +21,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list
 
 ARG REPO_CHANNEL="stable"
 ARG REPOSITORY="deb https://packages.clickhouse.com/deb ${REPO_CHANNEL} main"
-ARG VERSION="22.12.3.5"
+ARG VERSION="23.1.1.3077"
 ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
 
 # set non-empty deb_location_url url to create a docker image
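`VERSION` here is an ordinary Docker build argument, so the new default can still be overridden per build; a minimal sketch (image tag and build context are illustrative, not taken from this commit):

```python
# Sketch: overriding the Dockerfile's ARG VERSION at image build time.
import subprocess

subprocess.run(
    [
        "docker", "build",
        "--build-arg", "VERSION=23.1.1.3077",  # matches the new default above
        "--tag", "clickhouse-server:23.1",     # illustrative tag
        ".",                                   # illustrative build context
    ],
    check=True,
)
```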
@@ -146,6 +146,12 @@ def prepare_for_hung_check(drop_databases):
             "KILL QUERY WHERE query LIKE 'SELECT URL, uniq(SearchPhrase) AS u FROM test.hits GROUP BY URL ORDER BY u %'"
         )
     )
+    # Long query from 02136_kill_scalar_queries
+    call_with_retry(
+        make_query_command(
+            "KILL QUERY WHERE query LIKE 'SELECT (SELECT number FROM system.numbers WHERE number = 1000000000000)%'"
+        )
+    )
 
     if drop_databases:
         for i in range(5):
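`call_with_retry` and `make_query_command` are helpers defined elsewhere in the stress script; as an assumption about their shape, a rough sketch of the pattern the added lines rely on (retrying a `clickhouse client` invocation a few times):

```python
# Rough sketch only: stand-ins for the stress script's helpers, written as an
# assumption about their behaviour, not as the script's actual implementation.
import subprocess
import time


def make_query_command(query: str) -> str:
    # Assumed shape: wrap a query into a clickhouse-client shell command.
    return f'clickhouse client -q "{query}"'


def call_with_retry(command: str, retries: int = 5, sleep_seconds: float = 1.0) -> None:
    # Retry the command until it exits successfully or attempts run out.
    for _ in range(retries):
        if subprocess.call(command, shell=True) == 0:
            return
        time.sleep(sleep_seconds)


call_with_retry(
    make_query_command(
        "KILL QUERY WHERE query LIKE 'SELECT (SELECT number FROM system.numbers WHERE number = 1000000000000)%'"
    )
)
```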
docs/changelogs/v22.10.7.13-stable.md (new file, 21 lines)

---
sidebar_position: 1
sidebar_label: 2023
---

# 2023 Changelog

### ClickHouse release v22.10.7.13-stable (d261d9036cc) FIXME as compared to v22.10.6.3-stable (645a66d221f)

#### Bug Fix (user-visible misbehavior in official stable or prestable release)

* Backported in [#44998](https://github.com/ClickHouse/ClickHouse/issues/44998): Another fix for `Cannot read all data` error which could happen while reading `LowCardinality` dictionary from remote fs. Fixes [#44709](https://github.com/ClickHouse/ClickHouse/issues/44709). [#44875](https://github.com/ClickHouse/ClickHouse/pull/44875) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Backported in [#45551](https://github.com/ClickHouse/ClickHouse/issues/45551): Fix `SELECT ... FROM system.dictionaries` exception when there is a dictionary with a bad structure (e.g. incorrect type in xml config). [#45399](https://github.com/ClickHouse/ClickHouse/pull/45399) ([Aleksei Filatov](https://github.com/aalexfvk)).

#### NOT FOR CHANGELOG / INSIGNIFICANT

* Automatically merge green backport PRs and green approved PRs [#41110](https://github.com/ClickHouse/ClickHouse/pull/41110) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Improve release scripts [#45074](https://github.com/ClickHouse/ClickHouse/pull/45074) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Fix wrong approved_at, simplify conditions [#45302](https://github.com/ClickHouse/ClickHouse/pull/45302) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Get rid of artifactory in favor of r2 + ch-repos-manager [#45421](https://github.com/ClickHouse/ClickHouse/pull/45421) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
docs/changelogs/v23.1.1.3077-stable.md (new file, 592 lines)

---
sidebar_position: 1
sidebar_label: 2023
---

# 2023 Changelog

### ClickHouse release v23.1.1.3077-stable (dcaac477025) FIXME as compared to v22.12.1.1752-stable (688e488e930)

#### Backward Incompatible Change
* Remove query `SYSTEM RESTART DISK`. [#44647](https://github.com/ClickHouse/ClickHouse/pull/44647) ([alesapin](https://github.com/alesapin)).
* Disallow Gorilla compression on columns of non-Float32 or non-Float64 type. [#45252](https://github.com/ClickHouse/ClickHouse/pull/45252) ([Robert Schulze](https://github.com/rschu1ze)).
* Remove PREALLOCATE for HASHED/SPARSE_HASHED dictionaries. [#45388](https://github.com/ClickHouse/ClickHouse/pull/45388) ([Azat Khuzhin](https://github.com/azat)).
* Parallel quorum inserts might work incorrectly with `*MergeTree` tables created with deprecated syntax. Therefore, parallel quorum inserts support is completely disabled for such tables. It does not affect tables created with a new syntax. [#45430](https://github.com/ClickHouse/ClickHouse/pull/45430) ([Alexander Tokmakov](https://github.com/tavplubix)).

#### New Feature
* Add `quantileInterpolatedWeighted`/`quantilesInterpolatedWeighted` functions. [#38252](https://github.com/ClickHouse/ClickHouse/pull/38252) ([Bharat Nallan](https://github.com/bharatnc)).
* Add an experimental inverted index as a new secondary index type for efficient text search. [#38667](https://github.com/ClickHouse/ClickHouse/pull/38667) ([larryluogit](https://github.com/larryluogit)).
* Add column `ptr` to `system.trace_log` for `trace_type = 'MemorySample'`. This column contains an address of allocation. Added function `flameGraph` which can build flamegraph containing allocated and not released memory. Reworking of [#38391](https://github.com/ClickHouse/ClickHouse/issues/38391). [#38953](https://github.com/ClickHouse/ClickHouse/pull/38953) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Dictionary source for extracting keys by traversing regular expressions tree. [#40878](https://github.com/ClickHouse/ClickHouse/pull/40878) ([Vage Ogannisian](https://github.com/nooblose)).
* Added parametrized view functionality: it is now possible to specify query parameters for the View table engine. Resolves [#40907](https://github.com/ClickHouse/ClickHouse/issues/40907). [#41687](https://github.com/ClickHouse/ClickHouse/pull/41687) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
* Added an extendable and configurable scheduling subsystem for IO requests (not yet integrated with the IO code itself). [#41840](https://github.com/ClickHouse/ClickHouse/pull/41840) ([Sergei Trifonov](https://github.com/serxa)).
* Added `SYSTEM DROP DATABASE REPLICA` that removes metadata of dead replica of `Replicated` database. Resolves [#41794](https://github.com/ClickHouse/ClickHouse/issues/41794). [#42807](https://github.com/ClickHouse/ClickHouse/pull/42807) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Array join support map type, like function explode in spark. [#43239](https://github.com/ClickHouse/ClickHouse/pull/43239) ([李扬](https://github.com/taiyang-li)).
* Support SQL standard binary and hex string literals. [#43785](https://github.com/ClickHouse/ClickHouse/pull/43785) ([Mo Xuan](https://github.com/mo-avatar)).
* Add experimental query result cache. [#43797](https://github.com/ClickHouse/ClickHouse/pull/43797) ([Robert Schulze](https://github.com/rschu1ze)).
* format datetime in joda datetime style. Refer to https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html. [#43818](https://github.com/ClickHouse/ClickHouse/pull/43818) ([李扬](https://github.com/taiyang-li)).
* Follow-up to [#40878](https://github.com/ClickHouse/ClickHouse/issues/40878), supporting the regexp dictionary. [#43858](https://github.com/ClickHouse/ClickHouse/pull/43858) ([Han Fei](https://github.com/hanfei1991)).
* Implemented a fractional second formatter (`%f`) for formatDateTime. [#44060](https://github.com/ClickHouse/ClickHouse/pull/44060) ([ltrk2](https://github.com/ltrk2)).
* Added the `age` function to calculate the difference between two dates, or dates with time, expressed as a number of full units (see the sketch after this list). Closes [#41115](https://github.com/ClickHouse/ClickHouse/issues/41115). [#44421](https://github.com/ClickHouse/ClickHouse/pull/44421) ([Robert Schulze](https://github.com/rschu1ze)).
* Implemented a fractional second formatter (%f) for formatDateTime. This is slightly modified PR [#44060](https://github.com/ClickHouse/ClickHouse/issues/44060) by @ltrk2. [#44497](https://github.com/ClickHouse/ClickHouse/pull/44497) ([Alexander Gololobov](https://github.com/davenger)).
* Add null source for dictionaries. Closes [#44240](https://github.com/ClickHouse/ClickHouse/issues/44240). [#44502](https://github.com/ClickHouse/ClickHouse/pull/44502) ([mayamika](https://github.com/mayamika)).
* We can use `s3_storage_class` to set different tier. Such as ``` <disks> <s3> <type>s3</type> <endpoint>xxx</endpoint> <access_key_id>xxx</access_key_id> <secret_access_key>xxx</secret_access_key> <s3_storage_class>STANDARD/INTELLIGENT_TIERING</s3_storage_class> </s3> </disks> ``` Closes [#44443](https://github.com/ClickHouse/ClickHouse/issues/44443). [#44707](https://github.com/ClickHouse/ClickHouse/pull/44707) ([chen](https://github.com/xiedeyantu)).
* Try to detect header with column names (and maybe types) for CSV/TSV/CustomSeparated input formats. Add settings `input_format_tsv/csv/custom_detect_header` that enables this behaviour (enabled by default). Closes [#44640](https://github.com/ClickHouse/ClickHouse/issues/44640). [#44953](https://github.com/ClickHouse/ClickHouse/pull/44953) ([Kruglov Pavel](https://github.com/Avogar)).
* Insert default values in case of missing elements in JSON object while parsing named tuple. Add setting `input_format_json_defaults_for_missing_elements_in_named_tuple` that controls this behaviour. Closes [#45142](https://github.com/ClickHouse/ClickHouse/issues/45142)#issuecomment-1380153217. [#45231](https://github.com/ClickHouse/ClickHouse/pull/45231) ([Kruglov Pavel](https://github.com/Avogar)).
* Add total memory and used memory metrics with respect to cgroup in AsyncMetrics (https://github.com/ClickHouse/ClickHouse/issues/37983). [#45301](https://github.com/ClickHouse/ClickHouse/pull/45301) ([sichenzhao](https://github.com/sichenzhao)).
* Introduce non-throwing variants of hasToken and hasTokenCaseInsensitive. [#45341](https://github.com/ClickHouse/ClickHouse/pull/45341) ([ltrk2](https://github.com/ltrk2)).
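A couple of the new features above, written out as queries; this is a sketch whose argument forms are assumptions based on the entries (dateDiff-style arguments for `age`, a `%f` placeholder for `formatDateTime`), not documentation:

```python
# Sketch: example queries for two of the new features listed above.
# The exact argument forms are assumptions based on the changelog entries.
age_query = "SELECT age('day', toDate('2022-01-01'), toDate('2023-01-15'))"
fractional_seconds_query = "SELECT formatDateTime(now64(3), '%S.%f')"

for query in (age_query, fractional_seconds_query):
    # Run each against a server with e.g.: clickhouse-client --query "<query>"
    print(query)
```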

#### Performance Improvement
* Added sharding support in HashedDictionary to allow parallel load (almost linear scaling based on number of shards). [#40003](https://github.com/ClickHouse/ClickHouse/pull/40003) ([Azat Khuzhin](https://github.com/azat)).
* Do not load inactive parts at startup of `MergeTree` tables. [#42181](https://github.com/ClickHouse/ClickHouse/pull/42181) ([Anton Popov](https://github.com/CurtizJ)).
* Speed up query parsing. [#42284](https://github.com/ClickHouse/ClickHouse/pull/42284) ([Raúl Marín](https://github.com/Algunenano)).
* Always replace the OR chain `expr = x1 OR ... OR expr = xN` with `expr IN (x1, ..., xN)` if `expr` is a `LowCardinality` column (see the sketch after this list). The setting `optimize_min_equality_disjunction_chain_length` is ignored in this case. [#42889](https://github.com/ClickHouse/ClickHouse/pull/42889) ([Guo Wangyang](https://github.com/guowangy)).
* In the original implementation, the memory of `ThreadGroupStatus::finished_threads_counters_memory` was released by moving it into a temporary `std::vector`, which soon expired and got destructed. This method is viable, but not straightforward enough. To improve code readability, this commit releases the memory in the vector by first resizing it to 0 and then shrinking the capacity accordingly. [#43586](https://github.com/ClickHouse/ClickHouse/pull/43586) ([Zhiguo Zhou](https://github.com/ZhiguoZh)).
* As a follow-up of [#42214](https://github.com/ClickHouse/ClickHouse/issues/42214), this PR tries to optimize the column-wise ternary logic evaluation by achieving auto-vectorization. In the performance test of this [microbenchmark](https://github.com/ZhiguoZh/ClickHouse/blob/20221123-ternary-logic-opt-example/src/Functions/examples/associative_applier_perf.cpp), we've observed a peak **performance gain** of **21x** on the ICX device (Intel Xeon Platinum 8380 CPU). [#43669](https://github.com/ClickHouse/ClickHouse/pull/43669) ([Zhiguo Zhou](https://github.com/ZhiguoZh)).
* Improved latency of reading from storage `S3` and table function `s3` with large number of small files. Now settings `remote_filesystem_read_method` and `remote_filesystem_read_prefetch` take effect while reading from storage `S3`. [#43726](https://github.com/ClickHouse/ClickHouse/pull/43726) ([Anton Popov](https://github.com/CurtizJ)).
* Avoid acquiring read locks in system.tables if possible. [#43840](https://github.com/ClickHouse/ClickHouse/pull/43840) ([Raúl Marín](https://github.com/Algunenano)).
* The performance experiments of SSB (Star Schema Benchmark) on the ICX device (Intel Xeon Platinum 8380 CPU, 80 cores, 160 threads) shows that this change could effectively decrease the lock contention for ThreadPoolImpl::mutex by **75%**, increasing the CPU utilization and improving the overall performance by **2.4%**. [#44308](https://github.com/ClickHouse/ClickHouse/pull/44308) ([Zhiguo Zhou](https://github.com/ZhiguoZh)).
* Now optimisation is applied only if the cached HT size is sufficiently large (thresholds were determined empirically and hardcoded). [#44455](https://github.com/ClickHouse/ClickHouse/pull/44455) ([Nikita Taranov](https://github.com/nickitat)).
* Avoid loading the whole struct field when only one field of the struct needs to be read. [#44484](https://github.com/ClickHouse/ClickHouse/pull/44484) ([lgbo](https://github.com/lgbo-ustc)).
* Small performance improvement for asynchronous reading from remote fs. [#44868](https://github.com/ClickHouse/ClickHouse/pull/44868) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Switched to faster shared (RW) mutex implementation. Performance may be improved in queries with a lot of thread synchronization or for data structures experiencing heavy contention. [#45007](https://github.com/ClickHouse/ClickHouse/pull/45007) ([Sergei Trifonov](https://github.com/serxa)).
* Add fast path for: - col like '%%' - col like '%' - col not like '%' - col not like '%' - match(col, '.*'). [#45244](https://github.com/ClickHouse/ClickHouse/pull/45244) ([李扬](https://github.com/taiyang-li)).
* todo. [#45289](https://github.com/ClickHouse/ClickHouse/pull/45289) ([Nikita Taranov](https://github.com/nickitat)).
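One entry above rewrites equality OR-chains over a `LowCardinality` column into a single `IN` list; a small sketch of the before/after query shapes (table and column names are illustrative):

```python
# Sketch: the query shapes involved in the OR-chain -> IN rewrite described above.
# Table and column names are illustrative; the rewrite applies when the column is
# LowCardinality, regardless of optimize_min_equality_disjunction_chain_length.
before = "SELECT count() FROM hits WHERE UserAgent = 'curl' OR UserAgent = 'wget' OR UserAgent = 'httpie'"
after = "SELECT count() FROM hits WHERE UserAgent IN ('curl', 'wget', 'httpie')"
print(before)
print(after)
```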

#### Improvement
* Refactor and improve streaming engines Kafka/RabbitMQ/NATS and add support for all formats, also refactor formats a bit: - Fix producing messages in row-based formats with suffixes/prefixes. Now every message is formatted completely with all delimiters and can be parsed back using input format. - Support block-based formats like Native, Parquet, ORC, etc. Every block is formatted as a separate message. The number of rows in one message depends on block size, so you can control it via setting `max_block_size`. - Add new engine settings `kafka_max_rows_per_message/rabbitmq_max_rows_per_message/nats_max_rows_per_message`. They control the number of rows formatted in one message in row-based formats. Default value: 1. - Fix high memory consumption in NATS table engine. - Support arbitrary binary data in NATS producer (previously it worked only with strings containing \0 at the end). - Add missing Kafka/RabbitMQ/NATS engine settings in documentation. - Refactor producing and consuming in Kafka/RabbitMQ/NATS, separate it from WriteBuffers/ReadBuffers semantic. - Refactor output formats: remove callbacks on each row used in Kafka/RabbitMQ/NATS (now we don't use callbacks there), allow to use IRowOutputFormat directly, clarify row end and row between delimiters, make it possible to reset output format to start formatting again. - Add proper implementation in formatRow function (bonus after formats refactoring). [#42777](https://github.com/ClickHouse/ClickHouse/pull/42777) ([Kruglov Pavel](https://github.com/Avogar)).
* Support `optimize_or_like_chain` in the new infrastructure. Part of [#42648](https://github.com/ClickHouse/ClickHouse/issues/42648). [#42797](https://github.com/ClickHouse/ClickHouse/pull/42797) ([Dmitry Novik](https://github.com/novikd)).
* Improve the Asterisk and ColumnMatcher parsers. Part of [#42648](https://github.com/ClickHouse/ClickHouse/issues/42648). [#42884](https://github.com/ClickHouse/ClickHouse/pull/42884) ([Nikolay Degterinsky](https://github.com/evillique)).
* Implement `optimize_redundant_functions_in_order_by` on top of QueryTree. Part of [#42648](https://github.com/ClickHouse/ClickHouse/issues/42648). [#42970](https://github.com/ClickHouse/ClickHouse/pull/42970) ([Dmitry Novik](https://github.com/novikd)).
* Support `optimize_group_by_function_keys` in the new analyzer architecture. Also, add support for optimizing GROUPING SETS keys. Part of [#42648](https://github.com/ClickHouse/ClickHouse/issues/42648). [#43261](https://github.com/ClickHouse/ClickHouse/pull/43261) ([Dmitry Novik](https://github.com/novikd)).
* Improve reading CSV field in CustomSeparated/Template format. Closes [#42352](https://github.com/ClickHouse/ClickHouse/issues/42352) Closes [#39620](https://github.com/ClickHouse/ClickHouse/issues/39620). [#43332](https://github.com/ClickHouse/ClickHouse/pull/43332) ([Kruglov Pavel](https://github.com/Avogar)).
* Support reading/writing `Nested` tables as `List` of `Struct` in CapnProto format. Read/write `Decimal32/64` as `Int32/64`. Closes [#43319](https://github.com/ClickHouse/ClickHouse/issues/43319). [#43379](https://github.com/ClickHouse/ClickHouse/pull/43379) ([Kruglov Pavel](https://github.com/Avogar)).
* Unify query elapsed time measurements. [#43455](https://github.com/ClickHouse/ClickHouse/pull/43455) ([Raúl Marín](https://github.com/Algunenano)).
* Support scalar subqueries cache Implementation: * Added a map with hash of the node (without alias) and the evaluated value to Context. Testing: * Added a test-case with new analyser in 02174_cte_scalar_cache.sql. [#43640](https://github.com/ClickHouse/ClickHouse/pull/43640) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
* Improve automatic usage of structure from insertion table in table functions file/hdfs/s3 when virtual columns present in select query, it fixes possible error `Block structure mismatch` or `number of columns mismatch`. [#43695](https://github.com/ClickHouse/ClickHouse/pull/43695) ([Kruglov Pavel](https://github.com/Avogar)).
* Add support for signed arguments in range(). Fixes [#43333](https://github.com/ClickHouse/ClickHouse/issues/43333). [#43733](https://github.com/ClickHouse/ClickHouse/pull/43733) ([sanyu](https://github.com/wineternity)).
* Remove redundant sorting, for example, sorting related ORDER BY clauses in subqueries. Implemented on top of query plan. It does similar optimization as `optimize_duplicate_order_by_and_distinct` regarding `ORDER BY` clauses, but more generic, since it's applied to any redundant sorting steps (not only caused by ORDER BY clause) and applied to subqueries of any depth. Related to [#42648](https://github.com/ClickHouse/ClickHouse/issues/42648). [#43905](https://github.com/ClickHouse/ClickHouse/pull/43905) ([Igor Nikonov](https://github.com/devcrafter)).
* Added mmap support for StorageFile, which should improve the performance of clickhouse-local. [#43927](https://github.com/ClickHouse/ClickHouse/pull/43927) ([pufit](https://github.com/pufit)).
* Add ability to disable deduplication for BACKUP (for backups without deduplication, ATTACH can be used instead of full RESTORE), example `BACKUP foo TO S3(...) SETTINGS deduplicate_files=0` (default `deduplicate_files=1`). [#43947](https://github.com/ClickHouse/ClickHouse/pull/43947) ([Azat Khuzhin](https://github.com/azat)).
* Make `system.replicas` table do parallel fetches of replicas statuses. Closes [#43918](https://github.com/ClickHouse/ClickHouse/issues/43918). [#43998](https://github.com/ClickHouse/ClickHouse/pull/43998) ([Nikolay Degterinsky](https://github.com/evillique)).
* Refactor and improve schema inference for text formats. Add new setting `schema_inference_make_columns_nullable` that controls making result types `Nullable` (enabled by default);. [#44019](https://github.com/ClickHouse/ClickHouse/pull/44019) ([Kruglov Pavel](https://github.com/Avogar)).
* Better support for PROXYv1. [#44135](https://github.com/ClickHouse/ClickHouse/pull/44135) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Add information about the latest part check by cleanup thread into `system.parts` table. [#44244](https://github.com/ClickHouse/ClickHouse/pull/44244) ([Dmitry Novik](https://github.com/novikd)).
* Disable functions in readonly for inserts. [#44290](https://github.com/ClickHouse/ClickHouse/pull/44290) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
* Add a setting `simultaneous_parts_removal_limit` to allow to limit the number of parts being processed by one iteration of CleanupThread. [#44461](https://github.com/ClickHouse/ClickHouse/pull/44461) ([Dmitry Novik](https://github.com/novikd)).
* If user only need virtual columns, we don't need to initialize ReadBufferFromS3. May be helpful to [#44246](https://github.com/ClickHouse/ClickHouse/issues/44246). [#44493](https://github.com/ClickHouse/ClickHouse/pull/44493) ([chen](https://github.com/xiedeyantu)).
* Prevent duplicate column names hints. Closes [#44130](https://github.com/ClickHouse/ClickHouse/issues/44130). [#44519](https://github.com/ClickHouse/ClickHouse/pull/44519) ([Joanna Hulboj](https://github.com/jh0x)).
* Allow macro substitution in endpoint of disks resolve [#40951](https://github.com/ClickHouse/ClickHouse/issues/40951). [#44533](https://github.com/ClickHouse/ClickHouse/pull/44533) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
* Added a `message_format_string` column to `system.text_log`. The column contains a pattern that was used to format the message. [#44543](https://github.com/ClickHouse/ClickHouse/pull/44543) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Improve schema inference when `input_format_json_read_object_as_string` is enabled. [#44546](https://github.com/ClickHouse/ClickHouse/pull/44546) ([Kruglov Pavel](https://github.com/Avogar)).
* Add user-level setting `database_replicated_allow_replicated_engine_arguments` which allow to ban creation of `ReplicatedMergeTree` tables with arguments in `DatabaseReplicated`. [#44566](https://github.com/ClickHouse/ClickHouse/pull/44566) ([alesapin](https://github.com/alesapin)).
* Prevent users from mistakenly specifying zero (invalid) value for `index_granularity`. This closes [#44536](https://github.com/ClickHouse/ClickHouse/issues/44536). [#44578](https://github.com/ClickHouse/ClickHouse/pull/44578) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Added possibility to set path to service keytab file in `keytab` parameter in `kerberos` section of config.xml. [#44594](https://github.com/ClickHouse/ClickHouse/pull/44594) ([Roman Vasin](https://github.com/rvasin)).
* Use already written part of the query for fuzzy search (pass to skim). [#44600](https://github.com/ClickHouse/ClickHouse/pull/44600) ([Azat Khuzhin](https://github.com/azat)).
* Enable input_format_json_read_objects_as_strings by default to be able to read nested JSON objects while JSON Object type is experimental. [#44657](https://github.com/ClickHouse/ClickHouse/pull/44657) ([Kruglov Pavel](https://github.com/Avogar)).
* When users do duplicate async inserts, we should dedup inside the memory before we query keeper. [#44682](https://github.com/ClickHouse/ClickHouse/pull/44682) ([Han Fei](https://github.com/hanfei1991)).
* Input/output Avro bool type as ClickHouse bool type. [#44684](https://github.com/ClickHouse/ClickHouse/pull/44684) ([Kruglov Pavel](https://github.com/Avogar)).
* Don't parse beyond the quotes when reading UUIDs. [#44686](https://github.com/ClickHouse/ClickHouse/pull/44686) ([Raúl Marín](https://github.com/Algunenano)).
* Infer UInt64 in case of Int64 overflow and fix some transforms in schema inference. [#44696](https://github.com/ClickHouse/ClickHouse/pull/44696) ([Kruglov Pavel](https://github.com/Avogar)).
* Previously, dependency resolving inside DatabaseReplicated was done in a hacky way; now it is done right, using an explicit graph. [#44697](https://github.com/ClickHouse/ClickHouse/pull/44697) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
* Support Bool type in Arrow/Parquet/ORC. Closes [#43970](https://github.com/ClickHouse/ClickHouse/issues/43970). [#44698](https://github.com/ClickHouse/ClickHouse/pull/44698) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix `output_format_pretty_row_numbers` does not preserve the counter across the blocks. Closes [#44815](https://github.com/ClickHouse/ClickHouse/issues/44815). [#44832](https://github.com/ClickHouse/ClickHouse/pull/44832) ([flynn](https://github.com/ucasfl)).
* Extend function "toDayOfWeek" with a mode argument describing if a) the week starts on Monday or Sunday and b) if counting starts at 0 or 1. [#44860](https://github.com/ClickHouse/ClickHouse/pull/44860) ([李扬](https://github.com/taiyang-li)).
* Don't report errors in system.errors due to parts being merged concurrently with the background cleanup process. [#44874](https://github.com/ClickHouse/ClickHouse/pull/44874) ([Raúl Marín](https://github.com/Algunenano)).
* Optimize and fix metrics for Distributed async INSERT. [#44922](https://github.com/ClickHouse/ClickHouse/pull/44922) ([Azat Khuzhin](https://github.com/azat)).
* Added settings to disallow concurrent backups and restores resolves [#43891](https://github.com/ClickHouse/ClickHouse/issues/43891) Implementation: * Added server level settings to disallow concurrent backups and restores, which are read and set when BackupWorker is created in Context. * Settings are set to true by default. * Before starting backup or restores, added a check to see if any other backups/restores are running. For internal request it checks if its from the self node using backup_uuid. [#45072](https://github.com/ClickHouse/ClickHouse/pull/45072) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
* Add a cache for async block IDs. This will reduce the number of ZooKeeper requests when async insert deduplication is enabled. [#45106](https://github.com/ClickHouse/ClickHouse/pull/45106) ([Han Fei](https://github.com/hanfei1991)).
* CRC32 changes to address the WeakHash collision issue in PowerPC. [#45144](https://github.com/ClickHouse/ClickHouse/pull/45144) ([MeenaRenganathan22](https://github.com/MeenaRenganathan22)).
* Optimize memory consumption during backup to S3: files to S3 now will be copied directly without using `WriteBufferFromS3` (which could use a lot of memory). [#45188](https://github.com/ClickHouse/ClickHouse/pull/45188) ([Vitaly Baranov](https://github.com/vitlibar)).
* Use structure from insertion table in generateRandom without arguments. [#45239](https://github.com/ClickHouse/ClickHouse/pull/45239) ([Kruglov Pavel](https://github.com/Avogar)).
* Use `GetObjectAttributes` request instead of `HeadObject` request to get the size of an object in AWS S3. This change fixes handling endpoints without explicit region, for example. [#45288](https://github.com/ClickHouse/ClickHouse/pull/45288) ([Vitaly Baranov](https://github.com/vitlibar)).
* Add `<storage_policy>` config parameter for system logs. [#45320](https://github.com/ClickHouse/ClickHouse/pull/45320) ([Stig Bakken](https://github.com/stigsb)).
* Remove redundant sorting, for example, sorting related ORDER BY clauses in subqueries. Implemented on top of query plan. It does similar optimization as `optimize_duplicate_order_by_and_distinct` regarding `ORDER BY` clauses, but more generic, since it's applied to any redundant sorting steps (not only caused by ORDER BY clause) and applied to subqueries of any depth. Related to [#42648](https://github.com/ClickHouse/ClickHouse/issues/42648). [#45420](https://github.com/ClickHouse/ClickHouse/pull/45420) ([Igor Nikonov](https://github.com/devcrafter)).
* Allow implicit conversion of floats stored in string fields of JSON to integers in `JSONExtract` functions. E.g. `JSONExtract('{"a": "1000.111"}', 'a', 'UInt64')` -> `1000`; previously it returned 0 (see the sketch after this list). [#45432](https://github.com/ClickHouse/ClickHouse/pull/45432) ([Anton Popov](https://github.com/CurtizJ)).
* Added fields `supports_parallel_parsing` and `supports_parallel_formatting` to table `system.formats` for better introspection. [#45499](https://github.com/ClickHouse/ClickHouse/pull/45499) ([Anton Popov](https://github.com/CurtizJ)).
* Attempt to improve fsync latency (by syncing all files at once during fetches and small files after mutations) and one tiny fix for fsync_part_directory. [#45537](https://github.com/ClickHouse/ClickHouse/pull/45537) ([Azat Khuzhin](https://github.com/azat)).
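The `JSONExtract` improvement above already states the exact input and output; as a small sketch, the same example packaged as a query string:

```python
# Sketch: the JSONExtract example quoted in the entry above, as a query string.
query = """SELECT JSONExtract('{"a": "1000.111"}', 'a', 'UInt64')"""
# Per the changelog entry this now returns 1000; older releases returned 0.
print(query)
```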

#### Bug Fix
* Fix HTTP requests without path for AWS. After updating AWS SDK the sdk no longer adds a slash to requesting paths so we need to do it in our PocoHTTPClient to keep HTTP requests correct. [#45238](https://github.com/ClickHouse/ClickHouse/pull/45238) ([Vitaly Baranov](https://github.com/vitlibar)).
* Fix backup if mutations get killed during the backup process. [#45351](https://github.com/ClickHouse/ClickHouse/pull/45351) ([Vitaly Baranov](https://github.com/vitlibar)).

#### Build/Testing/Packaging Improvement
* Builtin skim for fuzzy search in clickhouse client/local history. [#44239](https://github.com/ClickHouse/ClickHouse/pull/44239) ([Azat Khuzhin](https://github.com/azat)).
* Memory limit for server is set now in AST fuzz tests to avoid OOMs. [#44282](https://github.com/ClickHouse/ClickHouse/pull/44282) ([Nikita Taranov](https://github.com/nickitat)).
* In rare cases, we don't rebuild binaries, because another task with a similar prefix succeeded. E.g. `binary_darwin` didn't restart because `binary_darwin_aarch64`. [#44311](https://github.com/ClickHouse/ClickHouse/pull/44311) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* The "universal.sh" now fetches a SSE2 build on systems which don't have SSE4.2. [#44366](https://github.com/ClickHouse/ClickHouse/pull/44366) ([Robert Schulze](https://github.com/rschu1ze)).
* Retry the integration tests on compressing errors. [#44529](https://github.com/ClickHouse/ClickHouse/pull/44529) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* ... 1. Added pytest-random by default in integration tests runner 2. Disable TSAN checks for tests with GPRC ( like https://s3.amazonaws.com/clickhouse-test-reports/42807/e9d7407a58f6e3f7d88c0c534685704f23560704/integration_tests__tsan__[4/6].html ) 3. Cleanup tables after tests in odbc. [#44711](https://github.com/ClickHouse/ClickHouse/pull/44711) ([Ilya Yatsishin](https://github.com/qoega)).
* We removed support for shared linking because of Rust. Actually, Rust is only an excuse for this removal, and we wanted to remove it nevertheless. [#44828](https://github.com/ClickHouse/ClickHouse/pull/44828) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Checks will try to download images before running integration tests. If image, proxy or whatever is broken in infrastructure it will not make tests flaky. Images will be cached locally and download time will not be added to random tests. Compose images are now changed to be used without correct environment from helpers/cluster.py. [#44848](https://github.com/ClickHouse/ClickHouse/pull/44848) ([Ilya Yatsishin](https://github.com/qoega)).
* Fix zookeeper downloading, update the version, and optimize the image size. [#44853](https://github.com/ClickHouse/ClickHouse/pull/44853) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* The performance tests were silently broken because `Errors` wasn't detected in the status message. [#44867](https://github.com/ClickHouse/ClickHouse/pull/44867) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Remove the dependency on the `adduser` tool from the packages, because we don't use it. This fixes [#44934](https://github.com/ClickHouse/ClickHouse/issues/44934). [#45011](https://github.com/ClickHouse/ClickHouse/pull/45011) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* SQLite library is updated to the latest. It is used for the SQLite database and table integration engines. Also, fixed a false-positive TSan report. This closes [#45027](https://github.com/ClickHouse/ClickHouse/issues/45027). [#45031](https://github.com/ClickHouse/ClickHouse/pull/45031) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix report sending in the case when FastTest failed. [#45588](https://github.com/ClickHouse/ClickHouse/pull/45588) ([Dmitry Novik](https://github.com/novikd)).

#### Bug Fix (user-visible misbehavior in official stable or prestable release)

* #40651 [#41404](https://github.com/ClickHouse/ClickHouse/issues/41404). [#42126](https://github.com/ClickHouse/ClickHouse/pull/42126) ([Alexander Gololobov](https://github.com/davenger)).
* Fix possible use-of-uninitialized value after executing expressions after sorting. Closes [#43386](https://github.com/ClickHouse/ClickHouse/issues/43386) CC: @nickitat. [#43635](https://github.com/ClickHouse/ClickHouse/pull/43635) ([Kruglov Pavel](https://github.com/Avogar)).
* Better handling of NULL in aggregate combinators, fix possible segfault/logical error while using optimization `optimize_rewrite_sum_if_to_count_if`. Closes [#43758](https://github.com/ClickHouse/ClickHouse/issues/43758). [#43813](https://github.com/ClickHouse/ClickHouse/pull/43813) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix CREATE USER/ROLE query settings constraints. [#43993](https://github.com/ClickHouse/ClickHouse/pull/43993) ([Nikolay Degterinsky](https://github.com/evillique)).
* Fix wrong behavior of `JOIN ON t1.x = t2.x AND 1 = 1`, forbid such queries. [#44016](https://github.com/ClickHouse/ClickHouse/pull/44016) ([Vladimir C](https://github.com/vdimir)).
* Fixed bug with non-parsable default value for EPHEMERAL column in table metadata. [#44026](https://github.com/ClickHouse/ClickHouse/pull/44026) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Fix parsing of bad version from compatibility setting. [#44224](https://github.com/ClickHouse/ClickHouse/pull/44224) ([Kruglov Pavel](https://github.com/Avogar)).
* Bring interval subtraction from datetime in line with addition. [#44241](https://github.com/ClickHouse/ClickHouse/pull/44241) ([ltrk2](https://github.com/ltrk2)).
* Fix double-free in HashTable::clearAndShrink() with zero elements in it. [#44256](https://github.com/ClickHouse/ClickHouse/pull/44256) ([Azat Khuzhin](https://github.com/azat)).
* Remove limits on maximum size of the result for view. [#44261](https://github.com/ClickHouse/ClickHouse/pull/44261) ([lizhuoyu5](https://github.com/lzydmxy)).
* Fix possible logical error in cache if `do_not_evict_index_and_mrk_files=1`. Closes [#42142](https://github.com/ClickHouse/ClickHouse/issues/42142). [#44268](https://github.com/ClickHouse/ClickHouse/pull/44268) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix possible too early cache write interruption in write-through cache (caching could be stopped due to false assumption when it shouldn't have). [#44289](https://github.com/ClickHouse/ClickHouse/pull/44289) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix possible crash in case function `IN` with constant arguments was used as a constant argument together with `LowCardinality`. Fixes [#44221](https://github.com/ClickHouse/ClickHouse/issues/44221). [#44346](https://github.com/ClickHouse/ClickHouse/pull/44346) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Fix support for complex parameters (like arrays) of parametric aggregate functions. This closes [#30975](https://github.com/ClickHouse/ClickHouse/issues/30975). The aggregate function `sumMapFiltered` was unusable in distributed queries before this change. [#44358](https://github.com/ClickHouse/ClickHouse/pull/44358) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix possible nullptr dereference in JoinSwitcher with `allow_experimental_analyzer`. [#44371](https://github.com/ClickHouse/ClickHouse/pull/44371) ([Vladimir C](https://github.com/vdimir)).
* Fix reading ObjectId in BSON schema inference. [#44382](https://github.com/ClickHouse/ClickHouse/pull/44382) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix race which can lead to premature temp parts removal before merge finished in ReplicatedMergeTree. This issue could lead to errors like `No such file or directory: xxx`. Fixes [#43983](https://github.com/ClickHouse/ClickHouse/issues/43983). [#44383](https://github.com/ClickHouse/ClickHouse/pull/44383) ([alesapin](https://github.com/alesapin)).
* Some invalid `SYSTEM ... ON CLUSTER` queries worked in an unexpected way if a cluster name was not specified. It's fixed, now invalid queries throw `SYNTAX_ERROR` as they should. Fixes [#44264](https://github.com/ClickHouse/ClickHouse/issues/44264). [#44387](https://github.com/ClickHouse/ClickHouse/pull/44387) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix reading Map type in ORC format. [#44400](https://github.com/ClickHouse/ClickHouse/pull/44400) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix reading columns that are not presented in input data in Parquet/ORC formats. Previously it could lead to error `INCORRECT_NUMBER_OF_COLUMNS`. Closes [#44333](https://github.com/ClickHouse/ClickHouse/issues/44333). [#44405](https://github.com/ClickHouse/ClickHouse/pull/44405) ([Kruglov Pavel](https://github.com/Avogar)).
* Previously bar() function used the same '▋' (U+258B "Left five eighths block") character to display both 5/8 and 6/8 bars. This change corrects this behavior by using '▊' (U+258A "Left three quarters block") for displaying 6/8 bar. [#44410](https://github.com/ClickHouse/ClickHouse/pull/44410) ([Alexander Gololobov](https://github.com/davenger)).
* Placing profile settings after profile settings constraints in the configuration file made constraints ineffective. [#44411](https://github.com/ClickHouse/ClickHouse/pull/44411) ([Konstantin Bogdanov](https://github.com/thevar1able)).
* Fix `SYNTAX_ERROR` while running `EXPLAIN AST INSERT` queries with data. Closes [#44207](https://github.com/ClickHouse/ClickHouse/issues/44207). [#44413](https://github.com/ClickHouse/ClickHouse/pull/44413) ([save-my-heart](https://github.com/save-my-heart)).
* Fix reading bool value with CRLF in CSV format. Closes [#44401](https://github.com/ClickHouse/ClickHouse/issues/44401). [#44442](https://github.com/ClickHouse/ClickHouse/pull/44442) ([Kruglov Pavel](https://github.com/Avogar)).
* Don't execute and/or/if/multiIf on LowCardinality dictionary, so the result type cannot be LowCardinality. It could lead to error `Illegal column ColumnLowCardinality` in some cases. Fixes [#43603](https://github.com/ClickHouse/ClickHouse/issues/43603). [#44469](https://github.com/ClickHouse/ClickHouse/pull/44469) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix mutations with setting `max_streams_for_merge_tree_reading`. [#44472](https://github.com/ClickHouse/ClickHouse/pull/44472) ([Anton Popov](https://github.com/CurtizJ)).
* Fix potential null pointer dereference with GROUPING SETS in ASTSelectQuery::formatImpl ([#43049](https://github.com/ClickHouse/ClickHouse/issues/43049)). [#44479](https://github.com/ClickHouse/ClickHouse/pull/44479) ([Robert Schulze](https://github.com/rschu1ze)).
* Validate types in table function arguments, CAST function arguments, JSONAsObject schema inference according to settings. [#44501](https://github.com/ClickHouse/ClickHouse/pull/44501) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix IN function with LC and const column, close [#44503](https://github.com/ClickHouse/ClickHouse/issues/44503). [#44506](https://github.com/ClickHouse/ClickHouse/pull/44506) ([Duc Canh Le](https://github.com/canhld94)).
* Fixed a bug in normalization of a `DEFAULT` expression in `CREATE TABLE` statement. The second argument of function `in` (or the right argument of operator `IN`) might be replaced with the result of its evaluation during CREATE query execution. Fixes [#44496](https://github.com/ClickHouse/ClickHouse/issues/44496). [#44547](https://github.com/ClickHouse/ClickHouse/pull/44547) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Projections do not work in presence of WITH ROLLUP, WITH CUBE and WITH TOTALS. In previous versions, a query produced an exception instead of skipping the usage of projections. This closes [#44614](https://github.com/ClickHouse/ClickHouse/issues/44614). This closes [#42772](https://github.com/ClickHouse/ClickHouse/issues/42772). [#44615](https://github.com/ClickHouse/ClickHouse/pull/44615) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix bug in experimental analyzer and `aggregate_functions_null_for_empty = 1`. Close [#44644](https://github.com/ClickHouse/ClickHouse/issues/44644). [#44648](https://github.com/ClickHouse/ClickHouse/pull/44648) ([Vladimir C](https://github.com/vdimir)).
* async blocks are not cleaned because the function `get all blocks sorted by time` didn't get async blocks. [#44651](https://github.com/ClickHouse/ClickHouse/pull/44651) ([Han Fei](https://github.com/hanfei1991)).
* Fix `LOGICAL_ERROR` `The top step of the right pipeline should be ExpressionStep` for JOIN with subquery, UNION, and TOTALS. Fixes [#43687](https://github.com/ClickHouse/ClickHouse/issues/43687). [#44673](https://github.com/ClickHouse/ClickHouse/pull/44673) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Avoid std::out_of_range exception in StorageExecutable. [#44681](https://github.com/ClickHouse/ClickHouse/pull/44681) ([Kruglov Pavel](https://github.com/Avogar)).
* Do not apply `optimize_syntax_fuse_functions` to quantiles on AST, close [#44712](https://github.com/ClickHouse/ClickHouse/issues/44712). [#44713](https://github.com/ClickHouse/ClickHouse/pull/44713) ([Vladimir C](https://github.com/vdimir)).
* Fix bug with wrong type in Merge table and PREWHERE, close [#43324](https://github.com/ClickHouse/ClickHouse/issues/43324). [#44716](https://github.com/ClickHouse/ClickHouse/pull/44716) ([Vladimir C](https://github.com/vdimir)).
* Fix possible crash during shutdown (while destroying TraceCollector). Fixes [#44757](https://github.com/ClickHouse/ClickHouse/issues/44757). [#44758](https://github.com/ClickHouse/ClickHouse/pull/44758) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Fix a possible crash in distributed query processing. The crash could happen if a query with totals or extremes returned an empty result and there are mismatched types in the Distributed and the local tables. Fixes [#44738](https://github.com/ClickHouse/ClickHouse/issues/44738). [#44760](https://github.com/ClickHouse/ClickHouse/pull/44760) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Fix fsync for fetches (`min_compressed_bytes_to_fsync_after_fetch`)/small files (ttl.txt, columns.txt) in mutations (`min_rows_to_fsync_after_merge`/`min_compressed_bytes_to_fsync_after_merge`). [#44781](https://github.com/ClickHouse/ClickHouse/pull/44781) ([Azat Khuzhin](https://github.com/azat)).
* A rare race condition was possible when querying the `system.parts` or `system.parts_columns` tables in the presence of parts being moved between disks. Introduced in [#41145](https://github.com/ClickHouse/ClickHouse/issues/41145). [#44809](https://github.com/ClickHouse/ClickHouse/pull/44809) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix the error `Context has expired` which could appear with enabled projections optimization. Can be reproduced for queries with specific functions, like `dictHas/dictGet` which use context in runtime. Fixes [#44844](https://github.com/ClickHouse/ClickHouse/issues/44844). [#44850](https://github.com/ClickHouse/ClickHouse/pull/44850) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Another fix for `Cannot read all data` error which could happen while reading `LowCardinality` dictionary from remote fs. Fixes [#44709](https://github.com/ClickHouse/ClickHouse/issues/44709). [#44875](https://github.com/ClickHouse/ClickHouse/pull/44875) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Ignore hwmon sensors on label read issues. [#44895](https://github.com/ClickHouse/ClickHouse/pull/44895) ([Raúl Marín](https://github.com/Algunenano)).
* Use `max_delay_to_insert` value in case calculated time to delay INSERT exceeds the setting value. Related to [#44902](https://github.com/ClickHouse/ClickHouse/issues/44902). [#44916](https://github.com/ClickHouse/ClickHouse/pull/44916) ([Igor Nikonov](https://github.com/devcrafter)).
* Fix error `Different order of columns in UNION subquery` for queries with `UNION`. Fixes [#44866](https://github.com/ClickHouse/ClickHouse/issues/44866). [#44920](https://github.com/ClickHouse/ClickHouse/pull/44920) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Delay for INSERT can be calculated incorrectly, which can lead to always using `max_delay_to_insert` setting as delay instead of a correct value. Using simple formula `max_delay_to_insert * (parts_over_threshold/max_allowed_parts_over_threshold)` i.e. delay grows proportionally to parts over threshold. Closes [#44902](https://github.com/ClickHouse/ClickHouse/issues/44902). [#44954](https://github.com/ClickHouse/ClickHouse/pull/44954) ([Igor Nikonov](https://github.com/devcrafter)).
* Fix ALTER TABLE TTL error when a wide part has a lightweight delete mask. [#44959](https://github.com/ClickHouse/ClickHouse/pull/44959) ([Mingliang Pan](https://github.com/liangliangpan)).
* Follow-up fix for Replace domain IP types (IPv4, IPv6) with native [#43221](https://github.com/ClickHouse/ClickHouse/issues/43221). [#45024](https://github.com/ClickHouse/ClickHouse/pull/45024) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Follow-up fix for Replace domain IP types (IPv4, IPv6) with native https://github.com/ClickHouse/ClickHouse/pull/43221. [#45043](https://github.com/ClickHouse/ClickHouse/pull/45043) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* A buffer overflow was possible in the parser. Found by fuzzer. [#45047](https://github.com/ClickHouse/ClickHouse/pull/45047) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix possible cannot-read-all-data error in storage FileLog. Closes [#45051](https://github.com/ClickHouse/ClickHouse/issues/45051), [#38257](https://github.com/ClickHouse/ClickHouse/issues/38257). [#45057](https://github.com/ClickHouse/ClickHouse/pull/45057) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Memory efficient aggregation (setting `distributed_aggregation_memory_efficient`) is disabled when grouping sets are present in the query. [#45058](https://github.com/ClickHouse/ClickHouse/pull/45058) ([Nikita Taranov](https://github.com/nickitat)).
* Fix `RANGE_HASHED` dictionary to count range columns as part of primary key during updates when `update_field` is specified. Closes [#44588](https://github.com/ClickHouse/ClickHouse/issues/44588). [#45061](https://github.com/ClickHouse/ClickHouse/pull/45061) ([Maksim Kita](https://github.com/kitaisreal)).
* Fix error `Cannot capture column` for `LowCardinality` captured argument of nested lambda. Fixes [#45028](https://github.com/ClickHouse/ClickHouse/issues/45028). [#45065](https://github.com/ClickHouse/ClickHouse/pull/45065) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Fix the wrong query result of `additional_table_filters` (additional filter was not applied) in case if minmax/count projection is used. [#45133](https://github.com/ClickHouse/ClickHouse/pull/45133) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Fixed bug in `histogram` function accepting negative values. [#45147](https://github.com/ClickHouse/ClickHouse/pull/45147) ([simpleton](https://github.com/rgzntrade)).
* Follow-up fix for Replace domain IP types (IPv4, IPv6) with native https://github.com/ClickHouse/ClickHouse/pull/43221. [#45150](https://github.com/ClickHouse/ClickHouse/pull/45150) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Fix wrong column nullability in StorageJoin, close [#44940](https://github.com/ClickHouse/ClickHouse/issues/44940). [#45184](https://github.com/ClickHouse/ClickHouse/pull/45184) ([Vladimir C](https://github.com/vdimir)).
* Fix `background_fetches_pool_size` settings reload (increase at runtime). [#45189](https://github.com/ClickHouse/ClickHouse/pull/45189) ([Raúl Marín](https://github.com/Algunenano)).
* Correctly process `SELECT` queries on KV engines (e.g. KeeperMap, EmbeddedRocksDB) using `IN` on the key with subquery producing different type. [#45215](https://github.com/ClickHouse/ClickHouse/pull/45215) ([Antonio Andelic](https://github.com/antonio2368)).
|
||||||
|
* Fix a logical error in `SEMI JOIN` with `join_use_nulls` in some cases, closes [#45163](https://github.com/ClickHouse/ClickHouse/issues/45163), closes [#45209](https://github.com/ClickHouse/ClickHouse/issues/45209). [#45230](https://github.com/ClickHouse/ClickHouse/pull/45230) ([Vladimir C](https://github.com/vdimir)).
|
||||||
|
* Fix heap-use-after-free when reading from S3. [#45253](https://github.com/ClickHouse/ClickHouse/pull/45253) ([Kruglov Pavel](https://github.com/Avogar)).
|
||||||
|
* Fix a bug when the Avro Union type is `['null', Nested type]`, closes [#45275](https://github.com/ClickHouse/ClickHouse/issues/45275). Fix a bug that incorrectly inferred the `bytes` type as `Float`. [#45276](https://github.com/ClickHouse/ClickHouse/pull/45276) ([flynn](https://github.com/ucasfl)).
|
||||||
|
* Throw a correct exception when an explicit PREWHERE cannot be used with a table using the `Merge` storage engine. [#45319](https://github.com/ClickHouse/ClickHouse/pull/45319) ([Antonio Andelic](https://github.com/antonio2368)).
|
||||||
|
* Fix decompression of the self-extracting `clickhouse` binary under WSL1 Ubuntu, which failed due to an inconsistency: `/proc/self/maps` reports the file's inode as 32-bit while `stat` reports it as 64-bit. [#45339](https://github.com/ClickHouse/ClickHouse/pull/45339) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
|
||||||
|
* Fix a race in Distributed table startup (that could lead to processing a file of an async INSERT multiple times). [#45360](https://github.com/ClickHouse/ClickHouse/pull/45360) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Fix a possible crash while reading from storage `S3` and table function `s3` when the `ListObject` request has failed. [#45371](https://github.com/ClickHouse/ClickHouse/pull/45371) ([Anton Popov](https://github.com/CurtizJ)).
|
||||||
|
* Fixed some bugs in JOINs with WHERE by disabling the "move to prewhere" optimization for them, closes [#44062](https://github.com/ClickHouse/ClickHouse/issues/44062). [#45391](https://github.com/ClickHouse/ClickHouse/pull/45391) ([Vladimir C](https://github.com/vdimir)).
|
||||||
|
* Fix a `SELECT ... FROM system.dictionaries` exception when there is a dictionary with a bad structure (e.g. an incorrect type in the XML config). [#45399](https://github.com/ClickHouse/ClickHouse/pull/45399) ([Aleksei Filatov](https://github.com/aalexfvk)).
|
||||||
|
* Fix s3Cluster schema inference when the structure from the insertion table is used in `INSERT INTO ... SELECT * FROM s3Cluster` queries. [#45422](https://github.com/ClickHouse/ClickHouse/pull/45422) ([Kruglov Pavel](https://github.com/Avogar)).
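A sketch of the affected query shape (the cluster name, bucket URL and destination table are placeholders):

```sql
-- The structure of `dest` is reused to infer the schema inside s3Cluster.
INSERT INTO dest
SELECT * FROM s3Cluster(
    'default',
    'https://my-bucket.s3.amazonaws.com/data/*.parquet',
    'Parquet')
SETTINGS use_structure_from_insertion_table_in_table_functions = 1;
```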
|
||||||
|
* Fix bug in JSON/BSONEachRow parsing with HTTP that could lead to using default values for some columns instead of values from data. [#45424](https://github.com/ClickHouse/ClickHouse/pull/45424) ([Kruglov Pavel](https://github.com/Avogar)).
|
||||||
|
* Fixed a bug (`Code: 632. DB::Exception: Unexpected data ... after parsed IPv6 value ...`) with typed parsing of IP types from a text source. [#45425](https://github.com/ClickHouse/ClickHouse/pull/45425) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
|
||||||
|
* Add a check for empty regular expressions. Closes [#45297](https://github.com/ClickHouse/ClickHouse/issues/45297). [#45428](https://github.com/ClickHouse/ClickHouse/pull/45428) ([Han Fei](https://github.com/hanfei1991)).
|
||||||
|
* Fix a possible (likely distributed) query hang. [#45448](https://github.com/ClickHouse/ClickHouse/pull/45448) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Fix two-level aggregation being disabled for queries sent over HTTP. [#45450](https://github.com/ClickHouse/ClickHouse/pull/45450) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
|
||||||
|
* Fix possible deadlock with `allow_asynchronous_read_from_io_pool_for_merge_tree` enabled in case of exception from `ThreadPool::schedule`. [#45481](https://github.com/ClickHouse/ClickHouse/pull/45481) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
|
||||||
|
* Fix possible in-use table after DETACH. [#45493](https://github.com/ClickHouse/ClickHouse/pull/45493) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Fix rare abort in case when query is canceled and parallel parsing was used during its execution. [#45498](https://github.com/ClickHouse/ClickHouse/pull/45498) ([Anton Popov](https://github.com/CurtizJ)).
|
||||||
|
* Fix a race between Distributed table creation and INSERT into it (could lead to CANNOT_LINK during INSERT into the table). [#45502](https://github.com/ClickHouse/ClickHouse/pull/45502) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Add proper default (SLRU) to cache policy getter. Closes [#45514](https://github.com/ClickHouse/ClickHouse/issues/45514). [#45524](https://github.com/ClickHouse/ClickHouse/pull/45524) ([Kseniia Sumarokova](https://github.com/kssenii)).
|
||||||
|
* Remove AST-based optimization `optimize_fuse_sum_count_avg`, close [#45439](https://github.com/ClickHouse/ClickHouse/issues/45439). [#45558](https://github.com/ClickHouse/ClickHouse/pull/45558) ([Vladimir C](https://github.com/vdimir)).
|
||||||
|
|
||||||
|
#### Bug-fix
|
||||||
|
|
||||||
|
* Disallow `arrayJoin` in mutations. Closes [#42637](https://github.com/ClickHouse/ClickHouse/issues/42637). Implementation: added a new parameter `disallow_arrayjoin` to `ActionsVisitor::Data`, set when mutation expressions are appended; `ActionsVisitor` uses it to throw an error when `arrayJoin` is used in a mutation. Testing: added test `02504_disallow_arrayjoin_in_mutations.sql`. [#44447](https://github.com/ClickHouse/ClickHouse/pull/44447) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
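A minimal sketch of what is now rejected (the table name is hypothetical):

```sql
CREATE TABLE t_mut (id UInt64, v Array(UInt64)) ENGINE = MergeTree ORDER BY id;

-- arrayJoin() changes the number of rows, which a mutation must not do,
-- so this statement is now rejected with an error.
ALTER TABLE t_mut UPDATE id = arrayJoin([1, 2]) WHERE 1;
```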
|
||||||
|
* Fix for qualified asterisks with an alias table name and a column transformer. Resolves [#44736](https://github.com/ClickHouse/ClickHouse/issues/44736). [#44755](https://github.com/ClickHouse/ClickHouse/pull/44755) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
|
||||||
|
* Updated the backup/restore status when concurrent backups & restores are not allowed. Resolves [#45486](https://github.com/ClickHouse/ClickHouse/issues/45486). Implementation: moved the concurrent backup/restore check inside the try-catch block which sets the status, so that other nodes in the cluster are aware of failures; renamed `backup_uuid` to `restore_uuid` in `RestoreSettings`. [#45497](https://github.com/ClickHouse/ClickHouse/pull/45497) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
|
||||||
|
|
||||||
|
#### Build Improvement
|
||||||
|
|
||||||
|
* Fix CRC32 on s390x. [#43706](https://github.com/ClickHouse/ClickHouse/pull/43706) ([Suzy Wang](https://github.com/SuzyWangIBMer)).
|
||||||
|
* Fixed endianness issues in the transform function for s390x. [#45522](https://github.com/ClickHouse/ClickHouse/pull/45522) ([Harry Lee](https://github.com/HarryLeeIBM)).
|
||||||
|
|
||||||
|
#### Feature
|
||||||
|
|
||||||
|
* Record server startup time in ProfileEvents. Resolves [#43188](https://github.com/ClickHouse/ClickHouse/issues/43188). Implementation: added `ProfileEvents::ServerStartupMilliseconds`, which records the time from the start of `main` until the server starts listening on sockets. Testing: added test `02532_profileevents_server_startup_time.sql`. [#45250](https://github.com/ClickHouse/ClickHouse/pull/45250) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
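Assuming the counter is exposed like other global ProfileEvents, it can be read from `system.events` (a sketch, not copied from the PR's test):

```sql
SELECT value AS startup_ms
FROM system.events
WHERE event = 'ServerStartupMilliseconds';
```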
|
||||||
|
|
||||||
|
#### NO CL ENTRY
|
||||||
|
|
||||||
|
* NO CL ENTRY: 'Revert "If user only need virtual columns, we don't need to initialize ReadBufferFromS3"'. [#44939](https://github.com/ClickHouse/ClickHouse/pull/44939) ([Anton Popov](https://github.com/CurtizJ)).
|
||||||
|
* NO CL ENTRY: 'Revert "Custom reading for mutation"'. [#45121](https://github.com/ClickHouse/ClickHouse/pull/45121) ([Alexander Tokmakov](https://github.com/tavplubix)).
|
||||||
|
* NO CL ENTRY: 'Revert "Revert "Custom reading for mutation""'. [#45122](https://github.com/ClickHouse/ClickHouse/pull/45122) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
|
||||||
|
* NO CL ENTRY: 'Revert "update function DAYOFWEEK and add new function WEEKDAY for mysql/spark compatiability"'. [#45221](https://github.com/ClickHouse/ClickHouse/pull/45221) ([Alexander Tokmakov](https://github.com/tavplubix)).
|
||||||
|
* NO CL ENTRY: 'Revert "Validate function arguments in query tree"'. [#45299](https://github.com/ClickHouse/ClickHouse/pull/45299) ([Maksim Kita](https://github.com/kitaisreal)).
|
||||||
|
* NO CL ENTRY: 'Revert "Revert "Validate function arguments in query tree""'. [#45300](https://github.com/ClickHouse/ClickHouse/pull/45300) ([Maksim Kita](https://github.com/kitaisreal)).
|
||||||
|
* NO CL ENTRY: 'Revert "Support optimize_or_like_chain in QueryTreePassManager"'. [#45406](https://github.com/ClickHouse/ClickHouse/pull/45406) ([Anton Popov](https://github.com/CurtizJ)).
|
||||||
|
* NO CL ENTRY: 'Resubmit Support optimize_or_like_chain in QueryTreePassManager'. [#45410](https://github.com/ClickHouse/ClickHouse/pull/45410) ([Dmitry Novik](https://github.com/novikd)).
|
||||||
|
* NO CL ENTRY: 'Revert "Remove redundant sorting"'. [#45414](https://github.com/ClickHouse/ClickHouse/pull/45414) ([Igor Nikonov](https://github.com/devcrafter)).
|
||||||
|
|
||||||
|
#### NOT FOR CHANGELOG / INSIGNIFICANT
|
||||||
|
|
||||||
|
* Automatically merge green backport PRs and green approved PRs [#41110](https://github.com/ClickHouse/ClickHouse/pull/41110) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
|
||||||
|
* Fix assertion in async read buffer from remote [#41231](https://github.com/ClickHouse/ClickHouse/pull/41231) ([Kseniia Sumarokova](https://github.com/kssenii)).
|
||||||
|
* add retries on ConnectionError [#42991](https://github.com/ClickHouse/ClickHouse/pull/42991) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
|
||||||
|
* Update aws-c* submodules [#43020](https://github.com/ClickHouse/ClickHouse/pull/43020) ([Vitaly Baranov](https://github.com/vitlibar)).
|
||||||
|
* Replace domain IP types (IPv4, IPv6) with native [#43221](https://github.com/ClickHouse/ClickHouse/pull/43221) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
|
||||||
|
* Fix aggregate functions optimisation in AggregateFunctionsArithmericOperationsPass [#43372](https://github.com/ClickHouse/ClickHouse/pull/43372) ([Dmitry Novik](https://github.com/novikd)).
|
||||||
|
* Improve pytest --pdb experience by preserving dockerd on SIGINT [#43392](https://github.com/ClickHouse/ClickHouse/pull/43392) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* RFC: tests: add stacktraces for hung queries [#43396](https://github.com/ClickHouse/ClickHouse/pull/43396) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Followup fixes for systemd notification ([#43400](https://github.com/ClickHouse/ClickHouse/issues/43400)) [#43597](https://github.com/ClickHouse/ClickHouse/pull/43597) ([Alexander Gololobov](https://github.com/davenger)).
|
||||||
|
* Refactor FunctionNode [#43761](https://github.com/ClickHouse/ClickHouse/pull/43761) ([Dmitry Novik](https://github.com/novikd)).
|
||||||
|
* Some cleanup: grace hash join [#43851](https://github.com/ClickHouse/ClickHouse/pull/43851) ([Igor Nikonov](https://github.com/devcrafter)).
|
||||||
|
* Temporary files evict fs cache - 2nd approach [#43972](https://github.com/ClickHouse/ClickHouse/pull/43972) ([Vladimir C](https://github.com/vdimir)).
|
||||||
|
* Randomize setting `enable_memory_bound_merging_of_aggregation_results` in tests [#43986](https://github.com/ClickHouse/ClickHouse/pull/43986) ([Nikita Taranov](https://github.com/nickitat)).
|
||||||
|
* Analyzer aggregate functions passes small fixes [#44013](https://github.com/ClickHouse/ClickHouse/pull/44013) ([Maksim Kita](https://github.com/kitaisreal)).
|
||||||
|
* Fix wrong char in command [#44018](https://github.com/ClickHouse/ClickHouse/pull/44018) ([alesapin](https://github.com/alesapin)).
|
||||||
|
* Analyzer support Set index [#44097](https://github.com/ClickHouse/ClickHouse/pull/44097) ([Maksim Kita](https://github.com/kitaisreal)).
|
||||||
|
* Provide monotonicity info for `toUnixTimestamp64*` [#44116](https://github.com/ClickHouse/ClickHouse/pull/44116) ([Nikita Taranov](https://github.com/nickitat)).
|
||||||
|
* Avoid loading toolchain files multiple times [#44122](https://github.com/ClickHouse/ClickHouse/pull/44122) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* tests: exclude flaky columns from SHOW CLUSTERS test [#44123](https://github.com/ClickHouse/ClickHouse/pull/44123) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Bump libdivide (to gain some new optimizations) [#44132](https://github.com/ClickHouse/ClickHouse/pull/44132) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Make atomic counter relaxed in blockNumber() [#44193](https://github.com/ClickHouse/ClickHouse/pull/44193) ([Igor Nikonov](https://github.com/devcrafter)).
|
||||||
|
* Try fix flaky 01072_window_view_multiple_columns_groupby [#44195](https://github.com/ClickHouse/ClickHouse/pull/44195) ([Kseniia Sumarokova](https://github.com/kssenii)).
|
||||||
|
* Apply new code of named collections (from [#43147](https://github.com/ClickHouse/ClickHouse/issues/43147)) to external table engines part 1 [#44204](https://github.com/ClickHouse/ClickHouse/pull/44204) ([Kseniia Sumarokova](https://github.com/kssenii)).
|
||||||
|
* Add some settings under `compatibility` [#44209](https://github.com/ClickHouse/ClickHouse/pull/44209) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Recommend Slack over Telegram in the "Question" issue template [#44222](https://github.com/ClickHouse/ClickHouse/pull/44222) ([Ivan Blinkov](https://github.com/blinkov)).
|
||||||
|
* Forbid paths in timezone names [#44225](https://github.com/ClickHouse/ClickHouse/pull/44225) ([Kruglov Pavel](https://github.com/Avogar)).
|
||||||
|
* Analyzer storage view crash fix [#44230](https://github.com/ClickHouse/ClickHouse/pull/44230) ([Maksim Kita](https://github.com/kitaisreal)).
|
||||||
|
* Add ThreadsInOvercommitTracker metric [#44233](https://github.com/ClickHouse/ClickHouse/pull/44233) ([Dmitry Novik](https://github.com/novikd)).
|
||||||
|
* Analyzer expired Context crash fix [#44234](https://github.com/ClickHouse/ClickHouse/pull/44234) ([Maksim Kita](https://github.com/kitaisreal)).
|
||||||
|
* Fix grace join memory consumption, pt1 [#44238](https://github.com/ClickHouse/ClickHouse/pull/44238) ([Vladimir C](https://github.com/vdimir)).
|
||||||
|
* Fixed use-after-free of BLAKE3 error message [#44242](https://github.com/ClickHouse/ClickHouse/pull/44242) ([Joanna Hulboj](https://github.com/jh0x)).
|
||||||
|
* Fix deadlock in StorageSystemDatabases [#44272](https://github.com/ClickHouse/ClickHouse/pull/44272) ([Alexander Tokmakov](https://github.com/tavplubix)).
|
||||||
|
* Get rid of global Git object [#44273](https://github.com/ClickHouse/ClickHouse/pull/44273) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
|
||||||
|
* Update version after release [#44275](https://github.com/ClickHouse/ClickHouse/pull/44275) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
|
||||||
|
* Update version_date.tsv and changelogs after v22.12.1.1752-stable [#44281](https://github.com/ClickHouse/ClickHouse/pull/44281) ([robot-clickhouse](https://github.com/robot-clickhouse)).
|
||||||
|
* Do not hold data parts during insert [#44299](https://github.com/ClickHouse/ClickHouse/pull/44299) ([Anton Popov](https://github.com/CurtizJ)).
|
||||||
|
* Another fix `test_server_reload` [#44306](https://github.com/ClickHouse/ClickHouse/pull/44306) ([Antonio Andelic](https://github.com/antonio2368)).
|
||||||
|
* Update version_date.tsv and changelogs after v22.9.7.34-stable [#44309](https://github.com/ClickHouse/ClickHouse/pull/44309) ([robot-clickhouse](https://github.com/robot-clickhouse)).
|
||||||
|
* tests/perf: fix dependency check during DROP [#44312](https://github.com/ClickHouse/ClickHouse/pull/44312) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* (unused openssl integration, not for production) a follow-up [#44325](https://github.com/ClickHouse/ClickHouse/pull/44325) ([Boris Kuschel](https://github.com/bkuschel)).
|
||||||
|
* Replace old named collections code with new (from [#43147](https://github.com/ClickHouse/ClickHouse/issues/43147)) part 2 [#44327](https://github.com/ClickHouse/ClickHouse/pull/44327) ([Kseniia Sumarokova](https://github.com/kssenii)).
|
||||||
|
* Disable "git-import" test in debug mode [#44328](https://github.com/ClickHouse/ClickHouse/pull/44328) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Check s3 part upload settings [#44335](https://github.com/ClickHouse/ClickHouse/pull/44335) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
|
||||||
|
* Fix typo [#44337](https://github.com/ClickHouse/ClickHouse/pull/44337) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Add a test for PowerBI [#44338](https://github.com/ClickHouse/ClickHouse/pull/44338) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Add a test for [#36038](https://github.com/ClickHouse/ClickHouse/issues/36038) [#44339](https://github.com/ClickHouse/ClickHouse/pull/44339) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Add a test for [#29386](https://github.com/ClickHouse/ClickHouse/issues/29386) [#44340](https://github.com/ClickHouse/ClickHouse/pull/44340) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Add a test for [#22929](https://github.com/ClickHouse/ClickHouse/issues/22929) [#44341](https://github.com/ClickHouse/ClickHouse/pull/44341) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Add a test for [#29883](https://github.com/ClickHouse/ClickHouse/issues/29883) [#44342](https://github.com/ClickHouse/ClickHouse/pull/44342) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix Docker [#44343](https://github.com/ClickHouse/ClickHouse/pull/44343) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix flaky test "02481_async_insert_dedup.python" [#44349](https://github.com/ClickHouse/ClickHouse/pull/44349) ([Han Fei](https://github.com/hanfei1991)).
|
||||||
|
* Add a test for [#22160](https://github.com/ClickHouse/ClickHouse/issues/22160) [#44355](https://github.com/ClickHouse/ClickHouse/pull/44355) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Add a test for [#34708](https://github.com/ClickHouse/ClickHouse/issues/34708) [#44356](https://github.com/ClickHouse/ClickHouse/pull/44356) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Add a test for [#30679](https://github.com/ClickHouse/ClickHouse/issues/30679) [#44357](https://github.com/ClickHouse/ClickHouse/pull/44357) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Add a test for [#34669](https://github.com/ClickHouse/ClickHouse/issues/34669) [#44359](https://github.com/ClickHouse/ClickHouse/pull/44359) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Add a test for [#34724](https://github.com/ClickHouse/ClickHouse/issues/34724) [#44360](https://github.com/ClickHouse/ClickHouse/pull/44360) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Try restarting ZK cluster on failed connection in `test_keeper_zookeeper_converted` [#44363](https://github.com/ClickHouse/ClickHouse/pull/44363) ([Antonio Andelic](https://github.com/antonio2368)).
|
||||||
|
* Disable grace_hash in test 00172_parallel_join [#44367](https://github.com/ClickHouse/ClickHouse/pull/44367) ([Vladimir C](https://github.com/vdimir)).
|
||||||
|
* Add check for submodules sanity [#44386](https://github.com/ClickHouse/ClickHouse/pull/44386) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
|
||||||
|
* Lock table for share during startup for database ordinary [#44393](https://github.com/ClickHouse/ClickHouse/pull/44393) ([alesapin](https://github.com/alesapin)).
|
||||||
|
* Implement a custom central checkout action [#44399](https://github.com/ClickHouse/ClickHouse/pull/44399) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
|
||||||
|
* Try fix some tests [#44406](https://github.com/ClickHouse/ClickHouse/pull/44406) ([Kseniia Sumarokova](https://github.com/kssenii)).
|
||||||
|
* Better ParserAllCollectionsOfLiterals [#44408](https://github.com/ClickHouse/ClickHouse/pull/44408) ([Nikolay Degterinsky](https://github.com/evillique)).
|
||||||
|
* Fix bug with merge/mutate pool size increase [#44436](https://github.com/ClickHouse/ClickHouse/pull/44436) ([alesapin](https://github.com/alesapin)).
|
||||||
|
* Update 01072_window_view_multiple_columns_groupby.sh [#44438](https://github.com/ClickHouse/ClickHouse/pull/44438) ([Kseniia Sumarokova](https://github.com/kssenii)).
|
||||||
|
* Disable buggy tsan assertion for integration test [#44444](https://github.com/ClickHouse/ClickHouse/pull/44444) ([alesapin](https://github.com/alesapin)).
|
||||||
|
* Respect the setting `schema_inference_make_columns_nullable` in Parquet/ORC/Arrow formats [#44446](https://github.com/ClickHouse/ClickHouse/pull/44446) ([Kruglov Pavel](https://github.com/Avogar)).
|
||||||
|
* Add tests as examples with errors of date(time) and string comparison that we should eliminate [#44462](https://github.com/ClickHouse/ClickHouse/pull/44462) ([Ilya Yatsishin](https://github.com/qoega)).
|
||||||
|
* Parallel parts cleanup with zero copy replication [#44466](https://github.com/ClickHouse/ClickHouse/pull/44466) ([Alexander Tokmakov](https://github.com/tavplubix)).
|
||||||
|
* Fix incorrect usages of `getPartName()` [#44468](https://github.com/ClickHouse/ClickHouse/pull/44468) ([Alexander Tokmakov](https://github.com/tavplubix)).
|
||||||
|
* Fix flaky test `roaring_memory_tracking` [#44470](https://github.com/ClickHouse/ClickHouse/pull/44470) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Clarify query_id in test 01092_memory_profiler [#44483](https://github.com/ClickHouse/ClickHouse/pull/44483) ([Vladimir C](https://github.com/vdimir)).
|
||||||
|
* Default value for optional in SortNode::updateTreeHashImpl [#44491](https://github.com/ClickHouse/ClickHouse/pull/44491) ([Vladimir C](https://github.com/vdimir)).
|
||||||
|
* Do not try to remove WAL/move broken parts for static storage [#44495](https://github.com/ClickHouse/ClickHouse/pull/44495) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Removed parent pid check that breaks in containers [#44499](https://github.com/ClickHouse/ClickHouse/pull/44499) ([Alexander Gololobov](https://github.com/davenger)).
|
||||||
|
* Analyzer duplicate alias crash fix [#44508](https://github.com/ClickHouse/ClickHouse/pull/44508) ([Maksim Kita](https://github.com/kitaisreal)).
|
||||||
|
* Minor code polishing [#44513](https://github.com/ClickHouse/ClickHouse/pull/44513) ([alesapin](https://github.com/alesapin)).
|
||||||
|
* Better error message if named collection does not exist [#44517](https://github.com/ClickHouse/ClickHouse/pull/44517) ([Kseniia Sumarokova](https://github.com/kssenii)).
|
||||||
|
* Add the lambda to collect data for workflow_jobs [#44520](https://github.com/ClickHouse/ClickHouse/pull/44520) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
|
||||||
|
* Introduce groupArrayLast() (useful to store last X values) [#44521](https://github.com/ClickHouse/ClickHouse/pull/44521) ([Azat Khuzhin](https://github.com/azat)).
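A quick usage sketch: with a limit of 3 the aggregate keeps only the last three values it has seen (for a single-threaded scan over `numbers` the expected result is `[7, 8, 9]`):

```sql
SELECT groupArrayLast(3)(number) FROM numbers(10);
```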
|
||||||
|
* Infer numbers starting from zero as strings in TSV [#44522](https://github.com/ClickHouse/ClickHouse/pull/44522) ([Kruglov Pavel](https://github.com/Avogar)).
|
||||||
|
* Fix wrong condition for enabling async reading from MergeTree. [#44530](https://github.com/ClickHouse/ClickHouse/pull/44530) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
|
||||||
|
* tests: capture dmesg in integration tests [#44535](https://github.com/ClickHouse/ClickHouse/pull/44535) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Analyzer support distributed queries processing [#44540](https://github.com/ClickHouse/ClickHouse/pull/44540) ([Maksim Kita](https://github.com/kitaisreal)).
|
||||||
|
* Followup [#43761](https://github.com/ClickHouse/ClickHouse/issues/43761) [#44541](https://github.com/ClickHouse/ClickHouse/pull/44541) ([Dmitry Novik](https://github.com/novikd)).
|
||||||
|
* Drop unused columns after join on/using [#44545](https://github.com/ClickHouse/ClickHouse/pull/44545) ([Vladimir C](https://github.com/vdimir)).
|
||||||
|
* Improve inferring arrays with nulls in JSON formats [#44550](https://github.com/ClickHouse/ClickHouse/pull/44550) ([Kruglov Pavel](https://github.com/Avogar)).
|
||||||
|
* Make BC check optional (if env var set) [#44564](https://github.com/ClickHouse/ClickHouse/pull/44564) ([alesapin](https://github.com/alesapin)).
|
||||||
|
* Fix extremely slow stack traces in debug build [#44569](https://github.com/ClickHouse/ClickHouse/pull/44569) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Better command line argument name in `clickhouse-benchmark` [#44570](https://github.com/ClickHouse/ClickHouse/pull/44570) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix HDFS test [#44572](https://github.com/ClickHouse/ClickHouse/pull/44572) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix test_distributed_queries_stress [#44573](https://github.com/ClickHouse/ClickHouse/pull/44573) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Switch "contrib/sysroot" back to master. [#44574](https://github.com/ClickHouse/ClickHouse/pull/44574) ([Vitaly Baranov](https://github.com/vitlibar)).
|
||||||
|
* Non-significant changes [#44575](https://github.com/ClickHouse/ClickHouse/pull/44575) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fuzzer HTML: fix trash [#44580](https://github.com/ClickHouse/ClickHouse/pull/44580) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Better diagnostics on server stop for the stress test [#44593](https://github.com/ClickHouse/ClickHouse/pull/44593) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* The position of the log message about the server environment was wrong [#44595](https://github.com/ClickHouse/ClickHouse/pull/44595) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix bad punctuation in log [#44596](https://github.com/ClickHouse/ClickHouse/pull/44596) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix misleading log message [#44598](https://github.com/ClickHouse/ClickHouse/pull/44598) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix bad log message about MergeTree metadata cache. [#44599](https://github.com/ClickHouse/ClickHouse/pull/44599) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Slightly cleanup interactive line reader code [#44601](https://github.com/ClickHouse/ClickHouse/pull/44601) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Rename `runlog.log` to `run.log` in tests [#44603](https://github.com/ClickHouse/ClickHouse/pull/44603) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix hung query in stress test [#44604](https://github.com/ClickHouse/ClickHouse/pull/44604) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Improve variable name [#44605](https://github.com/ClickHouse/ClickHouse/pull/44605) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Faster server startup after stress test [#44606](https://github.com/ClickHouse/ClickHouse/pull/44606) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix log messages in Coordination [#44607](https://github.com/ClickHouse/ClickHouse/pull/44607) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Disable Analyzer in fuzz and stress tests [#44609](https://github.com/ClickHouse/ClickHouse/pull/44609) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Better log message [#44610](https://github.com/ClickHouse/ClickHouse/pull/44610) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Maybe fix a bogus MSan error [#44611](https://github.com/ClickHouse/ClickHouse/pull/44611) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix "too large allocation" message from MSan [#44613](https://github.com/ClickHouse/ClickHouse/pull/44613) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Do not fail the AST fuzzer if sanitizer is out of memory [#44616](https://github.com/ClickHouse/ClickHouse/pull/44616) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix test `01111_create_drop_replicated_db_stress` [#44617](https://github.com/ClickHouse/ClickHouse/pull/44617) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* tests/integration: suppress exceptions during logging (due to pytest) [#44618](https://github.com/ClickHouse/ClickHouse/pull/44618) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Fix rust modules rebuild (previously ignores changes in cargo config.toml) [#44623](https://github.com/ClickHouse/ClickHouse/pull/44623) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Sometimes spot instances fail more than 20 times in a row [#44626](https://github.com/ClickHouse/ClickHouse/pull/44626) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix restart after quorum insert [#44628](https://github.com/ClickHouse/ClickHouse/pull/44628) ([alesapin](https://github.com/alesapin)).
|
||||||
|
* Revert "Merge pull request [#38953](https://github.com/ClickHouse/ClickHouse/issues/38953) from ClickHouse/add-allocation-ptr-to-trace-log [#44629](https://github.com/ClickHouse/ClickHouse/pull/44629) ([Raúl Marín](https://github.com/Algunenano)).
|
||||||
|
* Fix lambdas parsing [#44639](https://github.com/ClickHouse/ClickHouse/pull/44639) ([Nikolay Degterinsky](https://github.com/evillique)).
|
||||||
|
* Function viewExplain accept SELECT and settings [#44641](https://github.com/ClickHouse/ClickHouse/pull/44641) ([Vladimir C](https://github.com/vdimir)).
|
||||||
|
* Fix test `02015_async_inserts_2` [#44642](https://github.com/ClickHouse/ClickHouse/pull/44642) ([Anton Popov](https://github.com/CurtizJ)).
|
||||||
|
* Fix flaky test `test_keeper_multinode_simple` [#44645](https://github.com/ClickHouse/ClickHouse/pull/44645) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
|
||||||
|
* Add +x flag for run-fuzzer.sh [#44649](https://github.com/ClickHouse/ClickHouse/pull/44649) ([alesapin](https://github.com/alesapin)).
|
||||||
|
* Custom reading for mutation [#44653](https://github.com/ClickHouse/ClickHouse/pull/44653) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
|
||||||
|
* Fix flaky test test_backup_restore_on_cluster [#44660](https://github.com/ClickHouse/ClickHouse/pull/44660) ([Vitaly Baranov](https://github.com/vitlibar)).
|
||||||
|
* tests/integration: add missing kazoo client termination [#44666](https://github.com/ClickHouse/ClickHouse/pull/44666) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Move dmesg dumping out from runner to ci-runner.py [#44667](https://github.com/ClickHouse/ClickHouse/pull/44667) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Remove questdb (it makes a little sense but the test was flaky) [#44669](https://github.com/ClickHouse/ClickHouse/pull/44669) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix minor typo: replace validate_bugix_check with validate_bugfix_check [#44672](https://github.com/ClickHouse/ClickHouse/pull/44672) ([Pradeep Chhetri](https://github.com/chhetripradeep)).
|
||||||
|
* Fix parsing of ANY operator [#44678](https://github.com/ClickHouse/ClickHouse/pull/44678) ([Nikolay Degterinsky](https://github.com/evillique)).
|
||||||
|
* Fix test `01130_in_memory_parts` [#44683](https://github.com/ClickHouse/ClickHouse/pull/44683) ([Anton Popov](https://github.com/CurtizJ)).
|
||||||
|
* Remove old code [#44685](https://github.com/ClickHouse/ClickHouse/pull/44685) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix flaky test git-import [#44687](https://github.com/ClickHouse/ClickHouse/pull/44687) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Improve odbc test [#44688](https://github.com/ClickHouse/ClickHouse/pull/44688) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Add retries to HTTP requests in ClickHouse test [#44689](https://github.com/ClickHouse/ClickHouse/pull/44689) ([alesapin](https://github.com/alesapin)).
|
||||||
|
* Fix flaky tests [#44690](https://github.com/ClickHouse/ClickHouse/pull/44690) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
|
||||||
|
* Fix flaky test "01502_long_log_tinylog_deadlock_race" [#44693](https://github.com/ClickHouse/ClickHouse/pull/44693) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Improve handling of old parts [#44694](https://github.com/ClickHouse/ClickHouse/pull/44694) ([Raúl Marín](https://github.com/Algunenano)).
|
||||||
|
* Update entrypoint.sh [#44699](https://github.com/ClickHouse/ClickHouse/pull/44699) ([Denny Crane](https://github.com/den-crane)).
|
||||||
|
* tests: more fixes for test_keeper_auth [#44702](https://github.com/ClickHouse/ClickHouse/pull/44702) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Fix crash on delete from materialized view [#44705](https://github.com/ClickHouse/ClickHouse/pull/44705) ([Alexander Gololobov](https://github.com/davenger)).
|
||||||
|
* Fix flaky filelog tests with database ordinary [#44706](https://github.com/ClickHouse/ClickHouse/pull/44706) ([Kseniia Sumarokova](https://github.com/kssenii)).
|
||||||
|
* Make lightweight deletes always synchronous [#44718](https://github.com/ClickHouse/ClickHouse/pull/44718) ([Alexander Gololobov](https://github.com/davenger)).
|
||||||
|
* Fix deadlock in attach thread [#44719](https://github.com/ClickHouse/ClickHouse/pull/44719) ([alesapin](https://github.com/alesapin)).
|
||||||
|
* A few improvements to AST Fuzzer [#44720](https://github.com/ClickHouse/ClickHouse/pull/44720) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix flaky test [#44721](https://github.com/ClickHouse/ClickHouse/pull/44721) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Rename log in stress test [#44722](https://github.com/ClickHouse/ClickHouse/pull/44722) ([alesapin](https://github.com/alesapin)).
|
||||||
|
* Debug deadlock in stress test [#44723](https://github.com/ClickHouse/ClickHouse/pull/44723) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix flaky test "02102_row_binary_with_names_and_types.sh" [#44724](https://github.com/ClickHouse/ClickHouse/pull/44724) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Slightly better some tests [#44725](https://github.com/ClickHouse/ClickHouse/pull/44725) ([alesapin](https://github.com/alesapin)).
|
||||||
|
* Fix cases when clickhouse-server takes long time to start in functional tests with MSan [#44726](https://github.com/ClickHouse/ClickHouse/pull/44726) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Perf test: Log the time spent waiting for file sync [#44737](https://github.com/ClickHouse/ClickHouse/pull/44737) ([Raúl Marín](https://github.com/Algunenano)).
|
||||||
|
* Fix flaky test 02448_clone_replica_lost_part [#44759](https://github.com/ClickHouse/ClickHouse/pull/44759) ([alesapin](https://github.com/alesapin)).
|
||||||
|
* Build rust modules from the binary directory [#44762](https://github.com/ClickHouse/ClickHouse/pull/44762) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Remove database ordinary from stress test [#44763](https://github.com/ClickHouse/ClickHouse/pull/44763) ([alesapin](https://github.com/alesapin)).
|
||||||
|
* Fix flaky test 02479_mysql_connect_to_self [#44768](https://github.com/ClickHouse/ClickHouse/pull/44768) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Print fatal messages in Fuzzer [#44769](https://github.com/ClickHouse/ClickHouse/pull/44769) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix incorrect docs [#44795](https://github.com/ClickHouse/ClickHouse/pull/44795) ([Kruglov Pavel](https://github.com/Avogar)).
|
||||||
|
* Added table name to error message [#44806](https://github.com/ClickHouse/ClickHouse/pull/44806) ([Alexander Gololobov](https://github.com/davenger)).
|
||||||
|
* Retry packages download if GitHub returned HTTP 500. [#44807](https://github.com/ClickHouse/ClickHouse/pull/44807) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Slightly better docs [#44808](https://github.com/ClickHouse/ClickHouse/pull/44808) ([Kruglov Pavel](https://github.com/Avogar)).
|
||||||
|
* Fix total trash in stress test [#44810](https://github.com/ClickHouse/ClickHouse/pull/44810) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix ASan builds for glibc 2.36+ [#44811](https://github.com/ClickHouse/ClickHouse/pull/44811) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Remove the remainings of TestFlows [#44812](https://github.com/ClickHouse/ClickHouse/pull/44812) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix `grep` [#44813](https://github.com/ClickHouse/ClickHouse/pull/44813) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix bad cast in monotonicity analysis [#44818](https://github.com/ClickHouse/ClickHouse/pull/44818) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Modern tools, part 1 [#44819](https://github.com/ClickHouse/ClickHouse/pull/44819) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Modern tools in CI, part 2. [#44820](https://github.com/ClickHouse/ClickHouse/pull/44820) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix data race in DDLWorker [#44821](https://github.com/ClickHouse/ClickHouse/pull/44821) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix tests for bridges [#44822](https://github.com/ClickHouse/ClickHouse/pull/44822) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix flaky test_multiple_disks::test_jbod_overflow [#44823](https://github.com/ClickHouse/ClickHouse/pull/44823) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Less OOM in stress test [#44824](https://github.com/ClickHouse/ClickHouse/pull/44824) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix misleading integration tests reports for parametrized tests [#44825](https://github.com/ClickHouse/ClickHouse/pull/44825) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Fix two typos [#44826](https://github.com/ClickHouse/ClickHouse/pull/44826) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Adjust CSS [#44829](https://github.com/ClickHouse/ClickHouse/pull/44829) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix fuzzer report [#44830](https://github.com/ClickHouse/ClickHouse/pull/44830) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* check-style: check base for std::cerr/cout too [#44833](https://github.com/ClickHouse/ClickHouse/pull/44833) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Try fixing `test_keeper_snapshot_small_distance` with ZK restart [#44834](https://github.com/ClickHouse/ClickHouse/pull/44834) ([Antonio Andelic](https://github.com/antonio2368)).
|
||||||
|
* Exclude cargo shared libraries from the artifacts [#44836](https://github.com/ClickHouse/ClickHouse/pull/44836) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Add a tiny but important logging [#44837](https://github.com/ClickHouse/ClickHouse/pull/44837) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
|
||||||
|
* Escape submodules in style-check [#44838](https://github.com/ClickHouse/ClickHouse/pull/44838) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
|
||||||
|
* Move `test_dies_with_parent` to another module [#44839](https://github.com/ClickHouse/ClickHouse/pull/44839) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
|
||||||
|
* Remove unneeded softlink to official dev docs [#44841](https://github.com/ClickHouse/ClickHouse/pull/44841) ([Robert Schulze](https://github.com/rschu1ze)).
|
||||||
|
* Fix data race in StorageS3 [#44842](https://github.com/ClickHouse/ClickHouse/pull/44842) ([Antonio Andelic](https://github.com/antonio2368)).
|
||||||
|
* Fix rare race which can lead to queue hang [#44847](https://github.com/ClickHouse/ClickHouse/pull/44847) ([alesapin](https://github.com/alesapin)).
|
||||||
|
* No more retries in integration tests [#44851](https://github.com/ClickHouse/ClickHouse/pull/44851) ([Ilya Yatsishin](https://github.com/qoega)).
|
||||||
|
* Document usage of check_cxx_source_compiles instead of check_cxx_source_runs [#44854](https://github.com/ClickHouse/ClickHouse/pull/44854) ([Robert Schulze](https://github.com/rschu1ze)).
|
||||||
|
* More cases of OOM in Fuzzer [#44855](https://github.com/ClickHouse/ClickHouse/pull/44855) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix: sorted DISTINCT with empty string [#44856](https://github.com/ClickHouse/ClickHouse/pull/44856) ([Igor Nikonov](https://github.com/devcrafter)).
|
||||||
|
* Try to fix MSan build [#44857](https://github.com/ClickHouse/ClickHouse/pull/44857) ([Nikolay Degterinsky](https://github.com/evillique)).
|
||||||
|
* Cleanup setup_minio.sh [#44858](https://github.com/ClickHouse/ClickHouse/pull/44858) ([Pradeep Chhetri](https://github.com/chhetripradeep)).
|
||||||
|
* Wait for ZK process to stop in tests using snapshot [#44859](https://github.com/ClickHouse/ClickHouse/pull/44859) ([Antonio Andelic](https://github.com/antonio2368)).
|
||||||
|
* Fix flaky test and several typos [#44870](https://github.com/ClickHouse/ClickHouse/pull/44870) ([alesapin](https://github.com/alesapin)).
|
||||||
|
* Upload status files to S3 report for bugfix check [#44871](https://github.com/ClickHouse/ClickHouse/pull/44871) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
|
||||||
|
* Fix flaky test `02503_insert_storage_snapshot` [#44873](https://github.com/ClickHouse/ClickHouse/pull/44873) ([alesapin](https://github.com/alesapin)).
|
||||||
|
* Revert some changes from [#42777](https://github.com/ClickHouse/ClickHouse/issues/42777) to fix performance tests [#44876](https://github.com/ClickHouse/ClickHouse/pull/44876) ([Kruglov Pavel](https://github.com/Avogar)).
|
||||||
|
* Rewrite test_postgres_protocol test [#44880](https://github.com/ClickHouse/ClickHouse/pull/44880) ([Ilya Yatsishin](https://github.com/qoega)).
|
||||||
|
* Fix ConcurrentBoundedQueue::emplace() return value in case of finished queue [#44881](https://github.com/ClickHouse/ClickHouse/pull/44881) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Validate function arguments in query tree [#44882](https://github.com/ClickHouse/ClickHouse/pull/44882) ([Dmitry Novik](https://github.com/novikd)).
|
||||||
|
* Rework CI reports to have a class and clarify the logic [#44883](https://github.com/ClickHouse/ClickHouse/pull/44883) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
|
||||||
|
* fix-typo [#44886](https://github.com/ClickHouse/ClickHouse/pull/44886) ([Enrique Herreros](https://github.com/eherrerosj)).
|
||||||
|
* Store ZK generated data in `test_keeper_snapshot_small_distance` [#44888](https://github.com/ClickHouse/ClickHouse/pull/44888) ([Antonio Andelic](https://github.com/antonio2368)).
|
||||||
|
* Fix "AttributeError: 'BuildResult' object has no attribute 'libraries'" in BuilderReport and BuilderSpecialReport [#44890](https://github.com/ClickHouse/ClickHouse/pull/44890) ([Robert Schulze](https://github.com/rschu1ze)).
|
||||||
|
* Convert integration test_dictionaries_update_field to a stateless [#44891](https://github.com/ClickHouse/ClickHouse/pull/44891) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Upgrade googletest to latest HEAD [#44894](https://github.com/ClickHouse/ClickHouse/pull/44894) ([Robert Schulze](https://github.com/rschu1ze)).
|
||||||
|
* Try fix rabbitmq potential leak [#44897](https://github.com/ClickHouse/ClickHouse/pull/44897) ([Kseniia Sumarokova](https://github.com/kssenii)).
|
||||||
|
* Try to fix flaky `test_storage_kafka::test_kafka_produce_key_timestamp` [#44898](https://github.com/ClickHouse/ClickHouse/pull/44898) ([Antonio Andelic](https://github.com/antonio2368)).
|
||||||
|
* Fix flaky `test_concurrent_queries_restriction_by_query_kind` [#44903](https://github.com/ClickHouse/ClickHouse/pull/44903) ([Antonio Andelic](https://github.com/antonio2368)).
|
||||||
|
* Avoid Keeper crash on shutdown (fix `test_keeper_snapshot_on_exit`) [#44908](https://github.com/ClickHouse/ClickHouse/pull/44908) ([Antonio Andelic](https://github.com/antonio2368)).
|
||||||
|
* Do not merge over a gap with outdated undeleted parts [#44909](https://github.com/ClickHouse/ClickHouse/pull/44909) ([Sema Checherinda](https://github.com/CheSema)).
|
||||||
|
* Fix logging message in MergeTreeDataMergerMutator (about merged parts) [#44917](https://github.com/ClickHouse/ClickHouse/pull/44917) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Fix flaky test `test_lost_part` [#44921](https://github.com/ClickHouse/ClickHouse/pull/44921) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
|
||||||
|
* Add fast and cancellable shared_mutex alternatives [#44924](https://github.com/ClickHouse/ClickHouse/pull/44924) ([Sergei Trifonov](https://github.com/serxa)).
|
||||||
|
* Fix deadlock in Keeper's changelog [#44937](https://github.com/ClickHouse/ClickHouse/pull/44937) ([Antonio Andelic](https://github.com/antonio2368)).
|
||||||
|
* Stop merges to avoid a race between merge and freeze. [#44938](https://github.com/ClickHouse/ClickHouse/pull/44938) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
|
||||||
|
* Fix memory leak in Aws::InitAPI [#44942](https://github.com/ClickHouse/ClickHouse/pull/44942) ([Vitaly Baranov](https://github.com/vitlibar)).
|
||||||
|
* Change error code on invalid background_pool_size config [#44947](https://github.com/ClickHouse/ClickHouse/pull/44947) ([Raúl Marín](https://github.com/Algunenano)).
|
||||||
|
* Fix exception fix in TraceCollector dtor [#44948](https://github.com/ClickHouse/ClickHouse/pull/44948) ([Robert Schulze](https://github.com/rschu1ze)).
|
||||||
|
* Parallel distributed insert select with s3Cluster [3] [#44955](https://github.com/ClickHouse/ClickHouse/pull/44955) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
|
||||||
|
* Do not check read result consistency when unwinding [#44956](https://github.com/ClickHouse/ClickHouse/pull/44956) ([Alexander Gololobov](https://github.com/davenger)).
|
||||||
|
* Up the log level of tables dependencies graphs [#44957](https://github.com/ClickHouse/ClickHouse/pull/44957) ([Vitaly Baranov](https://github.com/vitlibar)).
|
||||||
|
* Hipster's HTML [#44961](https://github.com/ClickHouse/ClickHouse/pull/44961) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Docs: Mention non-standard DOTALL behavior of ClickHouse's match() [#44977](https://github.com/ClickHouse/ClickHouse/pull/44977) ([Robert Schulze](https://github.com/rschu1ze)).
|
||||||
|
* tests: fix test_replicated_users flakiness [#44978](https://github.com/ClickHouse/ClickHouse/pull/44978) ([Azat Khuzhin](https://github.com/azat)).
|
||||||
|
* Check what if disable some checks in storage Merge. [#44983](https://github.com/ClickHouse/ClickHouse/pull/44983) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
|
||||||
|
* Fix check for not existing input in ActionsDAG [#44987](https://github.com/ClickHouse/ClickHouse/pull/44987) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
|
||||||
|
* Update version_date.tsv and changelogs after v22.12.2.25-stable [#44988](https://github.com/ClickHouse/ClickHouse/pull/44988) ([robot-clickhouse](https://github.com/robot-clickhouse)).
|
||||||
|
* Fix test test_grpc_protocol/test.py::test_progress [#44996](https://github.com/ClickHouse/ClickHouse/pull/44996) ([Vitaly Baranov](https://github.com/vitlibar)).
|
||||||
|
* Improve S3 EC2 metadata tests [#45001](https://github.com/ClickHouse/ClickHouse/pull/45001) ([Vitaly Baranov](https://github.com/vitlibar)).
|
||||||
|
* Fix minmax_count_projection with _partition_value [#45003](https://github.com/ClickHouse/ClickHouse/pull/45003) ([Amos Bird](https://github.com/amosbird)).
|
||||||
|
* Fix strange trash in Fuzzer [#45006](https://github.com/ClickHouse/ClickHouse/pull/45006) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Add `dmesg.log` to Fuzzer [#45008](https://github.com/ClickHouse/ClickHouse/pull/45008) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix `01961_roaring_memory_tracking` test, again [#45009](https://github.com/ClickHouse/ClickHouse/pull/45009) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Recognize more ok cases for Fuzzer [#45012](https://github.com/ClickHouse/ClickHouse/pull/45012) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Supposedly fix the "Download script failed" error [#45013](https://github.com/ClickHouse/ClickHouse/pull/45013) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Add snapshot creation retry in Keeper tests using ZooKeeper [#45016](https://github.com/ClickHouse/ClickHouse/pull/45016) ([Antonio Andelic](https://github.com/antonio2368)).
|
||||||
|
* test for [#20098](https://github.com/ClickHouse/ClickHouse/issues/20098) [#45017](https://github.com/ClickHouse/ClickHouse/pull/45017) ([Denny Crane](https://github.com/den-crane)).
|
||||||
|
* test for [#26473](https://github.com/ClickHouse/ClickHouse/issues/26473) [#45018](https://github.com/ClickHouse/ClickHouse/pull/45018) ([Denny Crane](https://github.com/den-crane)).
|
||||||
|
* Remove the remainings of Testflows (2). [#45021](https://github.com/ClickHouse/ClickHouse/pull/45021) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Enable the check that was commented [#45022](https://github.com/ClickHouse/ClickHouse/pull/45022) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix false positive in Fuzzer [#45025](https://github.com/ClickHouse/ClickHouse/pull/45025) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix false positive in Fuzzer, alternative variant [#45026](https://github.com/ClickHouse/ClickHouse/pull/45026) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix function `range` (the bug was unreleased) [#45030](https://github.com/ClickHouse/ClickHouse/pull/45030) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix OOM in Fuzzer [#45032](https://github.com/ClickHouse/ClickHouse/pull/45032) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Less OOM in Stress test [#45033](https://github.com/ClickHouse/ClickHouse/pull/45033) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Add a test for [#31361](https://github.com/ClickHouse/ClickHouse/issues/31361) [#45034](https://github.com/ClickHouse/ClickHouse/pull/45034) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Add a test for [#38729](https://github.com/ClickHouse/ClickHouse/issues/38729) [#45035](https://github.com/ClickHouse/ClickHouse/pull/45035) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix typos [#45036](https://github.com/ClickHouse/ClickHouse/pull/45036) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* I didn't understand the logic of this test, @azat [#45037](https://github.com/ClickHouse/ClickHouse/pull/45037) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Small fixes for Coordination unit tests [#45039](https://github.com/ClickHouse/ClickHouse/pull/45039) ([Antonio Andelic](https://github.com/antonio2368)).
|
||||||
|
* Fix flaky test (hilarious) [#45042](https://github.com/ClickHouse/ClickHouse/pull/45042) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Non significant changes [#45046](https://github.com/ClickHouse/ClickHouse/pull/45046) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Don't fix parallel formatting [#45050](https://github.com/ClickHouse/ClickHouse/pull/45050) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Fix (benign) data race in clickhouse-client [#45053](https://github.com/ClickHouse/ClickHouse/pull/45053) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Analyzer aggregation without column fix [#45055](https://github.com/ClickHouse/ClickHouse/pull/45055) ([Maksim Kita](https://github.com/kitaisreal)).
|
||||||
|
* Analyzer ARRAY JOIN crash fix [#45059](https://github.com/ClickHouse/ClickHouse/pull/45059) ([Maksim Kita](https://github.com/kitaisreal)).
|
||||||
|
* Fix data race in openSQLiteDB [#45062](https://github.com/ClickHouse/ClickHouse/pull/45062) ([Kseniia Sumarokova](https://github.com/kssenii)).
|
||||||
|
* Analyzer function IN crash fix [#45064](https://github.com/ClickHouse/ClickHouse/pull/45064) ([Maksim Kita](https://github.com/kitaisreal)).
|
||||||
|
* JIT compilation float to bool conversion fix [#45067](https://github.com/ClickHouse/ClickHouse/pull/45067) ([Maksim Kita](https://github.com/kitaisreal)).
|
||||||
|
* Update version_date.tsv and changelogs after v22.11.3.47-stable [#45069](https://github.com/ClickHouse/ClickHouse/pull/45069) ([robot-clickhouse](https://github.com/robot-clickhouse)).
|
||||||
|
* Update version_date.tsv and changelogs after v22.10.5.54-stable [#45071](https://github.com/ClickHouse/ClickHouse/pull/45071) ([robot-clickhouse](https://github.com/robot-clickhouse)).
|
||||||
|
* Update version_date.tsv and changelogs after v22.3.16.1190-lts [#45073](https://github.com/ClickHouse/ClickHouse/pull/45073) ([robot-clickhouse](https://github.com/robot-clickhouse)).
|
||||||
|
* Improve release scripts [#45074](https://github.com/ClickHouse/ClickHouse/pull/45074) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
|
||||||
|
* Change the color of links in dark reports a little bit [#45077](https://github.com/ClickHouse/ClickHouse/pull/45077) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
|
||||||
|
* Fix Fuzzer script [#45082](https://github.com/ClickHouse/ClickHouse/pull/45082) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
|
||||||
|
* Try fixing KeeperMap tests [#45094](https://github.com/ClickHouse/ClickHouse/pull/45094) ([Antonio Andelic](https://github.com/antonio2368)).
|
||||||
|
* Update version_date.tsv and changelogs after v22.8.12.45-lts [#45098](https://github.com/ClickHouse/ClickHouse/pull/45098) ([robot-clickhouse](https://github.com/robot-clickhouse)).
|
||||||
|
* Try to fix flaky test_create_user_and_login/test.py::test_login_as_dropped_user_xml [#45099](https://github.com/ClickHouse/ClickHouse/pull/45099) ([Ilya Yatsishin](https://github.com/qoega)).
|
||||||
|
* Update version_date.tsv and changelogs after v22.10.6.3-stable [#45107](https://github.com/ClickHouse/ClickHouse/pull/45107) ([robot-clickhouse](https://github.com/robot-clickhouse)).
|
||||||
|
* Docs: Make heading consistent with other headings in System Table docs [#45109](https://github.com/ClickHouse/ClickHouse/pull/45109) ([Robert Schulze](https://github.com/rschu1ze)).
|
||||||
|
* Update version_date.tsv and changelogs after v22.11.4.3-stable [#45110](https://github.com/ClickHouse/ClickHouse/pull/45110) ([robot-clickhouse](https://github.com/robot-clickhouse)).
|
||||||
|
* Update version_date.tsv and changelogs after v22.12.3.5-stable [#45113](https://github.com/ClickHouse/ClickHouse/pull/45113) ([robot-clickhouse](https://github.com/robot-clickhouse)).
|
||||||
|
* Docs: Rewrite awkwardly phrased sentence about flush interval [#45114](https://github.com/ClickHouse/ClickHouse/pull/45114) ([Robert Schulze](https://github.com/rschu1ze)).
|
||||||
|
* Fix data race in s3Cluster. [#45123](https://github.com/ClickHouse/ClickHouse/pull/45123) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
|
||||||
|
* Pull SQLancer image before check run [#45125](https://github.com/ClickHouse/ClickHouse/pull/45125) ([Ilya Yatsishin](https://github.com/qoega)).
|
||||||
|
* Fix flaky azure test [#45134](https://github.com/ClickHouse/ClickHouse/pull/45134) ([alesapin](https://github.com/alesapin)).
* Minor cleanup in stress/run.sh [#45136](https://github.com/ClickHouse/ClickHouse/pull/45136) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Performance report: "Partial queries" --> "Backward-incompatible queries [#45152](https://github.com/ClickHouse/ClickHouse/pull/45152) ([Robert Schulze](https://github.com/rschu1ze)).
* Fix flaky test_tcp_handler_interserver_listen_host [#45156](https://github.com/ClickHouse/ClickHouse/pull/45156) ([Ilya Yatsishin](https://github.com/qoega)).
* Clean trash from changelog for v22.3.16.1190-lts [#45159](https://github.com/ClickHouse/ClickHouse/pull/45159) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Disable `test_storage_rabbitmq` [#45161](https://github.com/ClickHouse/ClickHouse/pull/45161) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Disable test_ttl_move_memory_usage as too flaky. [#45162](https://github.com/ClickHouse/ClickHouse/pull/45162) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* More logging to facilitate debugging of flaky test_ttl_replicated [#45165](https://github.com/ClickHouse/ClickHouse/pull/45165) ([Alexander Gololobov](https://github.com/davenger)).
* Try to fix flaky test_ttl_move_memory_usage [#45168](https://github.com/ClickHouse/ClickHouse/pull/45168) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix flaky test test_multiple_disks/test.py::test_rename [#45180](https://github.com/ClickHouse/ClickHouse/pull/45180) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Calculate only required columns in system.detached_parts [#45181](https://github.com/ClickHouse/ClickHouse/pull/45181) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Restart NightlyBuilds if the runner died [#45187](https://github.com/ClickHouse/ClickHouse/pull/45187) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Fix part ID generation for IP types for backward compatibility [#45191](https://github.com/ClickHouse/ClickHouse/pull/45191) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Fix integration test test_replicated_users::test_rename_replicated [#45192](https://github.com/ClickHouse/ClickHouse/pull/45192) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
* Add CACHE_INVALIDATOR for sqlancer builds [#45201](https://github.com/ClickHouse/ClickHouse/pull/45201) ([Ilya Yatsishin](https://github.com/qoega)).
* Fix possible stack-use-after-return in LimitReadBuffer [#45203](https://github.com/ClickHouse/ClickHouse/pull/45203) ([Kruglov Pavel](https://github.com/Avogar)).
* Disable check to make test_overcommit_tracker not flaky [#45206](https://github.com/ClickHouse/ClickHouse/pull/45206) ([Dmitry Novik](https://github.com/novikd)).
* Fix flaky test `01961_roaring_memory_tracking` (3) [#45208](https://github.com/ClickHouse/ClickHouse/pull/45208) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Remove trash from stress test [#45211](https://github.com/ClickHouse/ClickHouse/pull/45211) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* remove unused function [#45212](https://github.com/ClickHouse/ClickHouse/pull/45212) ([flynn](https://github.com/ucasfl)).
* Fix flaky `test_keeper_three_nodes_two_alive` [#45213](https://github.com/ClickHouse/ClickHouse/pull/45213) ([Antonio Andelic](https://github.com/antonio2368)).
* Fuzz PREWHERE clause [#45222](https://github.com/ClickHouse/ClickHouse/pull/45222) ([Alexander Gololobov](https://github.com/davenger)).
* Added a test for merge join key condition with big int & decimal [#45228](https://github.com/ClickHouse/ClickHouse/pull/45228) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
* Fix rare logical error: `Too large alignment` [#45229](https://github.com/ClickHouse/ClickHouse/pull/45229) ([Anton Popov](https://github.com/CurtizJ)).
* Update version_date.tsv and changelogs after v22.3.17.13-lts [#45234](https://github.com/ClickHouse/ClickHouse/pull/45234) ([robot-clickhouse](https://github.com/robot-clickhouse)).
* More verbose logs about replication log entries [#45235](https://github.com/ClickHouse/ClickHouse/pull/45235) ([Alexander Tokmakov](https://github.com/tavplubix)).
* One more attempt to fix race in TCPHandler [#45240](https://github.com/ClickHouse/ClickHouse/pull/45240) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
* Update clickhouse-test [#45251](https://github.com/ClickHouse/ClickHouse/pull/45251) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Planner small fixes [#45254](https://github.com/ClickHouse/ClickHouse/pull/45254) ([Maksim Kita](https://github.com/kitaisreal)).
* Fix log level "Test" for send_logs_level in client [#45273](https://github.com/ClickHouse/ClickHouse/pull/45273) ([Azat Khuzhin](https://github.com/azat)).
* tests: fix clickhouse binaries detection [#45283](https://github.com/ClickHouse/ClickHouse/pull/45283) ([Azat Khuzhin](https://github.com/azat)).
* tests/ci: encode HTML entities in the reports [#45284](https://github.com/ClickHouse/ClickHouse/pull/45284) ([Azat Khuzhin](https://github.com/azat)).
* Disable `02151_hash_table_sizes_stats_distributed` under TSAN [#45287](https://github.com/ClickHouse/ClickHouse/pull/45287) ([Nikita Taranov](https://github.com/nickitat)).
* Fix wrong approved_at, simplify conditions [#45302](https://github.com/ClickHouse/ClickHouse/pull/45302) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Disable 02028_create_select_settings with Ordinary [#45307](https://github.com/ClickHouse/ClickHouse/pull/45307) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Save message format strings for DB::Exception [#45342](https://github.com/ClickHouse/ClickHouse/pull/45342) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Slightly better output for glibc check [#45353](https://github.com/ClickHouse/ClickHouse/pull/45353) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Add checks for compilation of regexps [#45356](https://github.com/ClickHouse/ClickHouse/pull/45356) ([Anton Popov](https://github.com/CurtizJ)).
* Analyzer compound identifier typo correction fix [#45357](https://github.com/ClickHouse/ClickHouse/pull/45357) ([Maksim Kita](https://github.com/kitaisreal)).
* Bump to newer version of debug-action [#45359](https://github.com/ClickHouse/ClickHouse/pull/45359) ([Ilya Yatsishin](https://github.com/qoega)).
* Improve failed kafka startup logging [#45369](https://github.com/ClickHouse/ClickHouse/pull/45369) ([Ilya Yatsishin](https://github.com/qoega)).
* Fix flaky ttl test [#45370](https://github.com/ClickHouse/ClickHouse/pull/45370) ([alesapin](https://github.com/alesapin)).
* Add detailed profile events for throttling [#45373](https://github.com/ClickHouse/ClickHouse/pull/45373) ([Sergei Trifonov](https://github.com/serxa)).
* Update .gitignore [#45378](https://github.com/ClickHouse/ClickHouse/pull/45378) ([Nikolay Degterinsky](https://github.com/evillique)).
* Make test simpler to see errors [#45402](https://github.com/ClickHouse/ClickHouse/pull/45402) ([Ilya Yatsishin](https://github.com/qoega)).
* Reduce an amount of trash in `tests_system_merges` [#45403](https://github.com/ClickHouse/ClickHouse/pull/45403) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix reading from encrypted disk with passed file size [#45418](https://github.com/ClickHouse/ClickHouse/pull/45418) ([Anton Popov](https://github.com/CurtizJ)).
* Add delete by ttl for zookeeper_log [#45419](https://github.com/ClickHouse/ClickHouse/pull/45419) ([Nikita Taranov](https://github.com/nickitat)).
* Get rid of artifactory in favor of r2 + ch-repos-manager [#45421](https://github.com/ClickHouse/ClickHouse/pull/45421) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Minor improvements around reading from remote [#45442](https://github.com/ClickHouse/ClickHouse/pull/45442) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Docs: Beautify section on secondary index types [#45444](https://github.com/ClickHouse/ClickHouse/pull/45444) ([Robert Schulze](https://github.com/rschu1ze)).
* Fix Buffer's offsets mismatch logical error in stress test [#45446](https://github.com/ClickHouse/ClickHouse/pull/45446) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Better formatting for exception messages [#45449](https://github.com/ClickHouse/ClickHouse/pull/45449) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Add default GRANULARITY argument for secondary indexes [#45451](https://github.com/ClickHouse/ClickHouse/pull/45451) ([Nikolay Degterinsky](https://github.com/evillique)).
* Cleanup of inverted index [#45460](https://github.com/ClickHouse/ClickHouse/pull/45460) ([Robert Schulze](https://github.com/rschu1ze)).
* CherryPick: Fix a wrong staring search date [#45466](https://github.com/ClickHouse/ClickHouse/pull/45466) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Fix typos [#45470](https://github.com/ClickHouse/ClickHouse/pull/45470) ([Robert Schulze](https://github.com/rschu1ze)).
* Fix possible aborts in arrow lib [#45478](https://github.com/ClickHouse/ClickHouse/pull/45478) ([Kruglov Pavel](https://github.com/Avogar)).
* Add more retries to AST Fuzzer [#45479](https://github.com/ClickHouse/ClickHouse/pull/45479) ([Nikolay Degterinsky](https://github.com/evillique)).
* Fix schema inference from insertion table in hdfsCluster [#45483](https://github.com/ClickHouse/ClickHouse/pull/45483) ([Kruglov Pavel](https://github.com/Avogar)).
* Remove unnecessary getTotalRowCount function calls [#45485](https://github.com/ClickHouse/ClickHouse/pull/45485) ([Maksim Kita](https://github.com/kitaisreal)).
* Use new copy s3 functions in S3ObjectStorage [#45487](https://github.com/ClickHouse/ClickHouse/pull/45487) ([Vitaly Baranov](https://github.com/vitlibar)).
* Forward declaration of ConcurrentBoundedQueue in ThreadStatus [#45489](https://github.com/ClickHouse/ClickHouse/pull/45489) ([Azat Khuzhin](https://github.com/azat)).
* Revert "Merge pull request [#44922](https://github.com/ClickHouse/ClickHouse/issues/44922) from azat/dist/async-INSERT-metrics" [#45492](https://github.com/ClickHouse/ClickHouse/pull/45492) ([Azat Khuzhin](https://github.com/azat)).
* Docs: Fix weird formatting [#45495](https://github.com/ClickHouse/ClickHouse/pull/45495) ([Robert Schulze](https://github.com/rschu1ze)).
* Docs: Fix link to writing guide [#45496](https://github.com/ClickHouse/ClickHouse/pull/45496) ([Robert Schulze](https://github.com/rschu1ze)).
* Improve logging for TeePopen.timeout exceeded [#45504](https://github.com/ClickHouse/ClickHouse/pull/45504) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Fix MSan build once again (too heavy translation units) [#45512](https://github.com/ClickHouse/ClickHouse/pull/45512) ([Nikolay Degterinsky](https://github.com/evillique)).
* Additional check in MergeTreeReadPool [#45515](https://github.com/ClickHouse/ClickHouse/pull/45515) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Update test_system_merges/test.py [#45516](https://github.com/ClickHouse/ClickHouse/pull/45516) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Revert "Merge pull request [#45493](https://github.com/ClickHouse/ClickHouse/issues/45493) from azat/fix-detach" [#45545](https://github.com/ClickHouse/ClickHouse/pull/45545) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Update stress [#45546](https://github.com/ClickHouse/ClickHouse/pull/45546) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Ignore utf errors in clickhouse-test reportLogStats [#45556](https://github.com/ClickHouse/ClickHouse/pull/45556) ([Vladimir C](https://github.com/vdimir)).
* Resubmit "Fix possible in-use table after DETACH" [#45566](https://github.com/ClickHouse/ClickHouse/pull/45566) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Typo: "Granulesis" --> "Granules" [#45598](https://github.com/ClickHouse/ClickHouse/pull/45598) ([Robert Schulze](https://github.com/rschu1ze)).
* Fix version in autogenerated_versions.txt [#45624](https://github.com/ClickHouse/ClickHouse/pull/45624) ([Dmitry Novik](https://github.com/novikd)).
@ -21,6 +21,13 @@ ENGINE = HDFS(URI, format)
`SELECT` queries, the format must be supported for input, and to perform
`INSERT` queries – for output. The available formats are listed in the
[Formats](../../../interfaces/formats.md#formats) section.
- [PARTITION BY expr]

### PARTITION BY

`PARTITION BY` — Optional. In most cases you don't need a partition key, and if it is needed you generally don't need a partition key more granular than by month. Partitioning does not speed up queries (in contrast to the ORDER BY expression). You should never use too granular partitioning. Don't partition your data by client identifiers or names (instead, make client identifier or name the first column in the ORDER BY expression).

For partitioning by month, use the `toYYYYMM(date_column)` expression, where `date_column` is a column with a date of the type [Date](/docs/en/sql-reference/data-types/date.md). The partition names here have the `"YYYYMM"` format.

**Example:**

@ -13,6 +13,7 @@ This engine provides integration with [Amazon S3](https://aws.amazon.com/s3/) ec
|
|||||||
``` sql
|
``` sql
|
||||||
CREATE TABLE s3_engine_table (name String, value UInt32)
|
CREATE TABLE s3_engine_table (name String, value UInt32)
|
||||||
ENGINE = S3(path, [aws_access_key_id, aws_secret_access_key,] format, [compression])
|
ENGINE = S3(path, [aws_access_key_id, aws_secret_access_key,] format, [compression])
|
||||||
|
[PARTITION BY expr]
|
||||||
[SETTINGS ...]
|
[SETTINGS ...]
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -23,6 +24,12 @@ CREATE TABLE s3_engine_table (name String, value UInt32)
|
|||||||
- `aws_access_key_id`, `aws_secret_access_key` - Long-term credentials for the [AWS](https://aws.amazon.com/) account user. You can use these to authenticate your requests. Parameter is optional. If credentials are not specified, they are used from the configuration file. For more information see [Using S3 for Data Storage](../mergetree-family/mergetree.md#table_engine-mergetree-s3).
|
- `aws_access_key_id`, `aws_secret_access_key` - Long-term credentials for the [AWS](https://aws.amazon.com/) account user. You can use these to authenticate your requests. Parameter is optional. If credentials are not specified, they are used from the configuration file. For more information see [Using S3 for Data Storage](../mergetree-family/mergetree.md#table_engine-mergetree-s3).
|
||||||
- `compression` — Compression type. Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. Parameter is optional. By default, it will autodetect compression by file extension.
|
- `compression` — Compression type. Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. Parameter is optional. By default, it will autodetect compression by file extension.
|
||||||
|
|
||||||
|
### PARTITION BY
|
||||||
|
|
||||||
|
`PARTITION BY` — Optional. In most cases you don't need a partition key, and if it is needed you generally don't need a partition key more granular than by month. Partitioning does not speed up queries (in contrast to the ORDER BY expression). You should never use too granular partitioning. Don't partition your data by client identifiers or names (instead, make client identifier or name the first column in the ORDER BY expression).
|
||||||
|
|
||||||
|
For partitioning by month, use the `toYYYYMM(date_column)` expression, where `date_column` is a column with a date of the type [Date](/docs/en/sql-reference/data-types/date.md). The partition names here have the `"YYYYMM"` format.
|
||||||
|
|
||||||
**Example**
|
**Example**
|
||||||
|
|
||||||
``` sql
|
``` sql
|
||||||
|
@ -1,15 +1,22 @@
---
slug: /en/engines/table-engines/mergetree-family/invertedindexes
sidebar_label: Inverted Indexes
description: Quickly find search terms in text.
keywords: [full-text search, text search]
---

# Inverted indexes [experimental]

Inverted indexes are an experimental type of [secondary indexes](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#available-types-of-indices) which provide fast text search
capabilities for [String](/docs/en/sql-reference/data-types/string.md) or [FixedString](/docs/en/sql-reference/data-types/fixedstring.md)
columns. The main idea of an inverted index is to store a mapping from "terms" to the rows which contain these terms. "Terms" are
tokenized cells of the string column. For example, the string cell "I will be a little late" is by default tokenized into six terms "I", "will",
"be", "a", "little" and "late". Another kind of tokenizer is n-grams. For example, the result of 3-gram tokenization will be 21 terms "I w",
" wi", "wil", "ill", "ll ", "l b", " be" etc. The more fine-granular the input strings are tokenized, the bigger but also the more
useful the resulting inverted index will be.

:::warning
Inverted indexes are experimental and should not be used in production environments yet. They may change in the future in backward-incompatible
ways, for example with respect to their DDL/DQL syntax or performance/compression characteristics.
:::
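To get a feel for the two tokenizers, the splits described above can be inspected directly, assuming a ClickHouse version that provides the `tokens` and `ngrams` functions (this is only an illustration; the index applies its tokenizer internally):

```sql
-- roughly what inverted(0) (token split) and inverted(3) (3-gram split) produce for one cell
SELECT
    tokens('I will be a little late') AS token_split,
    ngrams('I will be a little late', 3) AS ngram_split;
```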
@ -24,7 +31,14 @@ SET allow_experimental_inverted_index = true;
An inverted index can be defined on a string column using the following syntax

``` sql
CREATE TABLE tab
(
    `key` UInt64,
    `str` String,
    INDEX inv_idx(str) TYPE inverted(0) GRANULARITY 1
)
ENGINE = MergeTree
ORDER BY key
```

where `N` specifies the tokenizer:
@ -32,7 +46,7 @@ where `N` specifies the tokenizer:
- `inverted(0)` (or shorter: `inverted()`) sets the tokenizer to "tokens", i.e. split strings along spaces,
- `inverted(N)` with `N` between 2 and 8 sets the tokenizer to "ngrams(N)"

Being a type of skipping index, inverted indexes can be dropped or added to a column after table creation:

``` sql
ALTER TABLE tbl DROP INDEX inv_idx;
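-- A sketch of adding an n-gram variant afterwards (the index name is illustrative);
-- MATERIALIZE INDEX builds the index for rows that already exist:
ALTER TABLE tbl ADD INDEX inv_ngram_idx(str) TYPE inverted(3) GRANULARITY 1;
ALTER TABLE tbl MATERIALIZE INDEX inv_ngram_idx;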
@ -43,24 +57,24 @@ To use the index, no special functions or syntax are required. Typical string se
examples, consider:

```sql
SELECT * from tab WHERE s == 'Hello World';
SELECT * from tab WHERE s IN ('Hello', 'World');
SELECT * from tab WHERE s LIKE '%Hello%';
SELECT * from tab WHERE multiSearchAny(s, ['Hello', 'World']);
SELECT * from tab WHERE hasToken(s, 'Hello');
SELECT * from tab WHERE multiSearchAll(s, ['Hello', 'World']);
```

The inverted index also works on columns of type `Array(String)`, `Array(FixedString)` and `Map(String)`.

Like for other secondary indices, each column part has its own inverted index. Furthermore, each inverted index is internally divided into
"segments". The existence and size of the segments are generally transparent to users but the segment size determines the memory consumption
during index construction (e.g. when two parts are merged). Configuration parameter "max_digestion_size_per_segment" (default: 256 MB)
controls the amount of data consumed from the underlying column before a new segment is created. Incrementing the parameter raises the
intermediate memory consumption for index construction but also improves lookup performance since fewer segments need to be checked on
average to evaluate a query.

Unlike other secondary indices, inverted indexes (for now) map to row numbers (row ids) instead of granule ids. The reason for this design
is performance. In practice, users often search for multiple terms at once. For example, the filter predicate `WHERE s LIKE '%little%' OR s LIKE
'%big%'` can be evaluated directly using an inverted index by forming the union of the row id lists for terms "little" and "big". This also
means that the parameter `GRANULARITY` supplied to index creation has no meaning (it may be removed from the syntax in the future).
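To see whether a query can actually make use of the index, one quick sanity check is `EXPLAIN` with index information enabled (a sketch, assuming the `tab` table defined above):

```sql
EXPLAIN indexes = 1
SELECT count() FROM tab WHERE hasToken(str, 'Hello');
```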
@ -77,7 +77,7 @@ Use the `ORDER BY tuple()` syntax, if you do not need sorting. See [Selecting th
|
|||||||
|
|
||||||
#### PARTITION BY
|
#### PARTITION BY
|
||||||
|
|
||||||
`PARTITION BY` — The [partitioning key](/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md). Optional. In most cases you don't need partition key, and in most other cases you don't need partition key more granular than by months. Partitioning does not speed up queries (in contrast to the ORDER BY expression). You should never use too granular partitioning. Don't partition your data by client identifiers or names (instead make client identifier or name the first column in the ORDER BY expression).
|
`PARTITION BY` — The [partitioning key](/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md). Optional. In most cases, you don't need a partition key, and if you do need to partition, generally you do not need a partition key more granular than by month. Partitioning does not speed up queries (in contrast to the ORDER BY expression). You should never use too granular partitioning. Don't partition your data by client identifiers or names (instead, make client identifier or name the first column in the ORDER BY expression).
|
||||||
|
|
||||||
For partitioning by month, use the `toYYYYMM(date_column)` expression, where `date_column` is a column with a date of the type [Date](/docs/en/sql-reference/data-types/date.md). The partition names here have the `"YYYYMM"` format.
|
For partitioning by month, use the `toYYYYMM(date_column)` expression, where `date_column` is a column with a date of the type [Date](/docs/en/sql-reference/data-types/date.md). The partition names here have the `"YYYYMM"` format.
|
||||||
|
|
||||||
@ -470,6 +470,9 @@ The `set` index can be used with all functions. Function subsets for other index
|
|||||||
| [empty](/docs/en/sql-reference/functions/array-functions#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ |
|
| [empty](/docs/en/sql-reference/functions/array-functions#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ |
|
||||||
| [notEmpty](/docs/en/sql-reference/functions/array-functions#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ |
|
| [notEmpty](/docs/en/sql-reference/functions/array-functions#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ |
|
||||||
| hasToken | ✗ | ✗ | ✗ | ✔ | ✗ |
|
| hasToken | ✗ | ✗ | ✗ | ✔ | ✗ |
|
||||||
|
| hasTokenOrNull | ✗ | ✗ | ✗ | ✔ | ✗ |
|
||||||
|
| hasTokenCaseInsensitive | ✗ | ✗ | ✗ | ✔ | ✗ |
|
||||||
|
| hasTokenCaseInsensitiveOrNull | ✗ | ✗ | ✗ | ✔ | ✗ |
|
||||||
|
|
||||||
Functions with a constant argument that is less than ngram size can’t be used by `ngrambf_v1` for query optimization.
|
Functions with a constant argument that is less than ngram size can’t be used by `ngrambf_v1` for query optimization.
|
||||||
|
|
||||||
|
@ -86,3 +86,9 @@ $ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64
|
|||||||
- `SELECT ... SAMPLE`
|
- `SELECT ... SAMPLE`
|
||||||
- Indices
|
- Indices
|
||||||
- Replication
|
- Replication
|
||||||
|
|
||||||
|
## PARTITION BY
|
||||||
|
|
||||||
|
`PARTITION BY` — Optional. It is possible to create separate files by partitioning the data on a partition key. In most cases, you don't need a partition key, and if it is needed you generally don't need a partition key more granular than by month. Partitioning does not speed up queries (in contrast to the ORDER BY expression). You should never use too granular partitioning. Don't partition your data by client identifiers or names (instead, make client identifier or name the first column in the ORDER BY expression).
|
||||||
|
|
||||||
|
For partitioning by month, use the `toYYYYMM(date_column)` expression, where `date_column` is a column with a date of the type [Date](/docs/en/sql-reference/data-types/date.md). The partition names here have the `"YYYYMM"` format.
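As a sketch of what such a partitioned table can look like (the table, column, and format names below are illustrative and not taken from this page):

```sql
CREATE TABLE file_engine_partitioned
(
    `event_date` Date,
    `value` UInt32
)
ENGINE = File(CSV)
PARTITION BY toYYYYMM(event_date);
```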
|
||||||
|
@ -96,3 +96,9 @@ SELECT * FROM url_engine_table
|
|||||||
- `ALTER` and `SELECT...SAMPLE` operations.
|
- `ALTER` and `SELECT...SAMPLE` operations.
|
||||||
- Indexes.
|
- Indexes.
|
||||||
- Replication.
|
- Replication.
|
||||||
|
|
||||||
|
## PARTITION BY
|
||||||
|
|
||||||
|
`PARTITION BY` — Optional. It is possible to create separate files by partitioning the data on a partition key. In most cases, you don't need a partition key, and if it is needed you generally don't need a partition key more granular than by month. Partitioning does not speed up queries (in contrast to the ORDER BY expression). You should never use too granular partitioning. Don't partition your data by client identifiers or names (instead, make client identifier or name the first column in the ORDER BY expression).
|
||||||
|
|
||||||
|
For partitioning by month, use the `toYYYYMM(date_column)` expression, where `date_column` is a column with a date of the type [Date](/docs/en/sql-reference/data-types/date.md). The partition names here have the `"YYYYMM"` format.
|
||||||
|
259
docs/en/getting-started/example-datasets/laion.md
Normal file
@ -0,0 +1,259 @@
|
# Laion-400M dataset
|
||||||
|
|
||||||
|
The dataset contains 400 million images with English text. For more information follow this [link](https://laion.ai/blog/laion-400-open-dataset/). Laion provides even larger datasets (e.g. [5 billion](https://laion.ai/blog/laion-5b/)). Working with them will be similar.
|
||||||
|
|
||||||
|
The dataset has prepared embeddings for texts and images. This will be used to demonstrate [Approximate nearest neighbor search indexes](../../engines/table-engines/mergetree-family/annindexes.md).
|
||||||
|
|
||||||
|
## Prepare data
|
||||||
|
|
||||||
|
Embeddings are stored in `.npy` files, so we have to read them with python and merge with other data.
|
||||||
|
|
||||||
|
Download the data and process it with the simple `download.sh` script:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
wget --tries=100 https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/img_emb/img_emb_${1}.npy
|
||||||
|
wget --tries=100 https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/metadata/metadata_${1}.parquet
|
||||||
|
wget --tries=100 https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/text_emb/text_emb_${1}.npy
|
||||||
|
python3 process.py ${1}
|
||||||
|
```
|
||||||
|
|
||||||
|
Where `process.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
str_i = str(sys.argv[1])
|
||||||
|
npy_file = "img_emb_" + str_i + '.npy'
|
||||||
|
metadata_file = "metadata_" + str_i + '.parquet'
|
||||||
|
text_npy = "text_emb_" + str_i + '.npy'
|
||||||
|
|
||||||
|
# load all files
|
||||||
|
im_emb = np.load(npy_file)
|
||||||
|
text_emb = np.load(text_npy)
|
||||||
|
data = pd.read_parquet(metadata_file)
|
||||||
|
|
||||||
|
# combine them
|
||||||
|
data = pd.concat([data, pd.DataFrame({"image_embedding" : [*im_emb]}), pd.DataFrame({"text_embedding" : [*text_emb]})], axis=1, copy=False)
|
||||||
|
|
||||||
|
# you can save more columns
|
||||||
|
data = data[['url', 'caption', 'similarity', "image_embedding", "text_embedding"]]
|
||||||
|
|
||||||
|
# transform np.arrays to lists
|
||||||
|
data['image_embedding'] = data['image_embedding'].apply(lambda x: list(x))
|
||||||
|
data['text_embedding'] = data['text_embedding'].apply(lambda x: list(x))
|
||||||
|
|
||||||
|
# this small hack is needed because captions sometimes contain all kinds of quotes
|
||||||
|
data['caption'] = data['caption'].apply(lambda x: x.replace("'", " ").replace('"', " "))
|
||||||
|
|
||||||
|
# save data to file
|
||||||
|
data.to_csv(str_i + '.csv', header=False)
|
||||||
|
|
||||||
|
# previous files can be removed
|
||||||
|
os.system(f"rm {npy_file} {metadata_file} {text_npy}")
|
||||||
|
```
|
||||||
|
|
||||||
|
You can download data with
|
||||||
|
```bash
|
||||||
|
seq 0 409 | xargs -P100 -I{} bash -c './download.sh {}'
|
||||||
|
```
|
||||||
|
|
||||||
|
The dataset is divided into 409 files. If you want to work only with a certain part of the dataset, just change the limits.
|
||||||
|
|
||||||
|
## Create table for laion
|
||||||
|
|
||||||
|
Without indexes, the table can be created with:
|
||||||
|
|
||||||
|
```sql
CREATE TABLE laion_dataset
(
    `id` Int64,
    `url` String,
    `caption` String,
    `similarity` Float32,
    `image_embedding` Array(Float32),
    `text_embedding` Array(Float32)
)
ENGINE = MergeTree
ORDER BY id
SETTINGS index_granularity = 8192
```
|
||||||
|
|
||||||
|
Fill table with data:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
INSERT INTO laion_dataset FROM INFILE '{path_to_csv_files}/*.csv'
|
||||||
|
```
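A quick, optional sanity check after the load is to count the rows (assuming the `laion_dataset` table from above):

```sql
SELECT count() FROM laion_dataset;
```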
|
||||||
|
|
||||||
|
## Check data in table without indexes
|
||||||
|
|
||||||
|
Let's check how the following query works on a part of the dataset (8 million records):
|
||||||
|
|
||||||
|
```sql
|
||||||
|
select url, caption from test_laion where similarity > 0.2 order by L2Distance(image_embedding, {target:Array(Float32)}) limit 30
|
||||||
|
```
|
||||||
|
|
||||||
|
Since the embeddings for images and texts may not match, let's also require a certain threshold of matching accuracy to get images that are more likely to satisfy our queries. The query uses the client parameter `target`, which is an array of 512 elements; see later in this article for a convenient way of obtaining such vectors. I used a random picture of a cat from the Internet as the target vector.
|
||||||
|
|
||||||
|
**The result**
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─url───────────────────────────────────────────────────────────────────────────────────────────────────────────┬─caption────────────────────────────────────────────────────────────────┐
|
||||||
|
│ https://s3.amazonaws.com/filestore.rescuegroups.org/6685/pictures/animals/13884/13884995/63318230_463x463.jpg │ Adoptable Female Domestic Short Hair │
|
||||||
|
│ https://s3.amazonaws.com/pet-uploads.adoptapet.com/8/b/6/239905226.jpg │ Adopt A Pet :: Marzipan - New York, NY │
|
||||||
|
│ http://d1n3ar4lqtlydb.cloudfront.net/9/2/4/248407625.jpg │ Adopt A Pet :: Butterscotch - New Castle, DE │
|
||||||
|
│ https://s3.amazonaws.com/pet-uploads.adoptapet.com/e/e/c/245615237.jpg │ Adopt A Pet :: Tiggy - Chicago, IL │
|
||||||
|
│ http://pawsofcoronado.org/wp-content/uploads/2012/12/rsz_pumpkin.jpg │ Pumpkin an orange tabby kitten for adoption │
|
||||||
|
│ https://s3.amazonaws.com/pet-uploads.adoptapet.com/7/8/3/188700997.jpg │ Adopt A Pet :: Brian the Brad Pitt of cats - Frankfort, IL │
|
||||||
|
│ https://s3.amazonaws.com/pet-uploads.adoptapet.com/8/b/d/191533561.jpg │ Domestic Shorthair Cat for adoption in Mesa, Arizona - Charlie │
|
||||||
|
│ https://s3.amazonaws.com/pet-uploads.adoptapet.com/0/1/2/221698235.jpg │ Domestic Shorthair Cat for adoption in Marietta, Ohio - Daisy (Spayed) │
|
||||||
|
└───────────────────────────────────────────────────────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────────────────┘
|
||||||
|
|
||||||
|
8 rows in set. Elapsed: 6.432 sec. Processed 19.65 million rows, 43.96 GB (3.06 million rows/s., 6.84 GB/s.)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Add indexes
|
||||||
|
|
||||||
|
Create a new table or follow instructions from [alter documentation](../../sql-reference/statements/alter/skipping-index.md).
|
||||||
|
|
||||||
|
```sql
CREATE TABLE laion_dataset
(
    `id` Int64,
    `url` String,
    `caption` String,
    `similarity` Float32,
    `image_embedding` Array(Float32),
    `text_embedding` Array(Float32),
    INDEX annoy_image image_embedding TYPE annoy(1000) GRANULARITY 1000,
    INDEX annoy_text text_embedding TYPE annoy(1000) GRANULARITY 1000
)
ENGINE = MergeTree
ORDER BY id
SETTINGS index_granularity = 8192
```
|
||||||
|
|
||||||
|
When created, the index will be built using the L2Distance metric. You can read more about the parameters in the [annoy documentation](../../engines/table-engines/mergetree-family/annindexes.md#annoy-annoy). It makes sense to build indexes for a large number of granules. If you need good search speed, then GRANULARITY should be several times larger than the expected number of results in the search.
|
||||||
|
Now let's check again with the same query:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
select url, caption from test_indexes_laion where similarity > 0.2 order by L2Distance(image_embedding, {target:Array(Float32)}) limit 8
|
||||||
|
```
|
||||||
|
|
||||||
|
**Result**
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─url──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─caption──────────────────────────────────────────────────────────────┐
|
||||||
|
│ http://tse1.mm.bing.net/th?id=OIP.R1CUoYp_4hbeFSHBaaB5-gHaFj │ bed bugs and pets can cats carry bed bugs pets adviser │
|
||||||
|
│ http://pet-uploads.adoptapet.com/1/9/c/1963194.jpg?336w │ Domestic Longhair Cat for adoption in Quincy, Massachusetts - Ashley │
|
||||||
|
│ https://thumbs.dreamstime.com/t/cat-bed-12591021.jpg │ Cat on bed Stock Image │
|
||||||
|
│ https://us.123rf.com/450wm/penta/penta1105/penta110500004/9658511-portrait-of-british-short-hair-kitten-lieing-at-sofa-on-sun.jpg │ Portrait of british short hair kitten lieing at sofa on sun. │
|
||||||
|
│ https://www.easypetmd.com/sites/default/files/Wirehaired%20Vizsla%20(2).jpg │ Vizsla (Wirehaired) image 3 │
|
||||||
|
│ https://images.ctfassets.net/yixw23k2v6vo/0000000200009b8800000000/7950f4e1c1db335ef91bb2bc34428de9/dog-cat-flickr-Impatience_1.jpg?w=600&h=400&fm=jpg&fit=thumb&q=65&fl=progressive │ dog and cat image │
|
||||||
|
│ https://i1.wallbox.ru/wallpapers/small/201523/eaa582ee76a31fd.jpg │ cats, kittens, faces, tonkinese │
|
||||||
|
│ https://www.baxterboo.com/images/breeds/medium/cairn-terrier.jpg │ Cairn Terrier Photo │
|
||||||
|
└──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────┘
|
||||||
|
|
||||||
|
8 rows in set. Elapsed: 0.641 sec. Processed 22.06 thousand rows, 49.36 MB (91.53 thousand rows/s., 204.81 MB/s.)
|
||||||
|
```
|
||||||
|
|
||||||
|
The speed has increased significantly. But now, the results sometimes differ from what you are looking for. This is due to the approximation of the search and the quality of the constructed embedding. Note that the example was given for picture embeddings, but there are also text embeddings in the dataset, which can also be used for searching.
|
||||||
|
|
||||||
|
## Scripts for embeddings
|
||||||
|
|
||||||
|
Usually, we do not want to compute embeddings for existing data, but rather for new data, in order to search for similar items in the existing data. For this we can use a [UDF](../../sql-reference/functions/index.md#sql-user-defined-functions), which allows setting the `target` vector without leaving the client. All of the following scripts are written for the `ViT-B/32` model, as it was used for this dataset. You can use any model, but it is necessary to build the embeddings in the dataset and for new objects using the same model.
|
||||||
|
|
||||||
|
### Text embeddings
|
||||||
|
|
||||||
|
`encode_text.py`:
|
||||||
|
```python
#!/usr/bin/python3
import clip
import torch
import numpy as np
import sys

if __name__ == '__main__':
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)
    for text in sys.stdin:
        # move the tokenized text to the same device as the model
        inputs = clip.tokenize(text).to(device)
        with torch.no_grad():
            text_features = model.encode_text(inputs)[0].tolist()
        # print the embedding so that the client receives the result
        print(text_features)
        sys.stdout.flush()
```
|
||||||
|
|
||||||
|
`encode_text_function.xml`:
|
||||||
|
```xml
|
||||||
|
<functions>
|
||||||
|
<function>
|
||||||
|
<type>executable</type>
|
||||||
|
<name>encode_text</name>
|
||||||
|
<return_type>Array(Float32)</return_type>
|
||||||
|
<argument>
|
||||||
|
<type>String</type>
|
||||||
|
<name>text</name>
|
||||||
|
</argument>
|
||||||
|
<format>TabSeparated</format>
|
||||||
|
<command>encode_text.py</command>
|
||||||
|
<command_read_timeout>1000000</command_read_timeout>
|
||||||
|
</function>
|
||||||
|
</functions>
|
||||||
|
```
|
||||||
|
|
||||||
|
Now we can simply use:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SELECT encode_text('cat');
|
||||||
|
```
|
||||||
|
|
||||||
|
The first use will be slow because the model needs to be loaded, but repeated queries will be fast. We can then copy the result into `set param_target=...` and easily write queries.
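Alternatively, the UDF can be used directly inside a query. The following is only a sketch, not the exact query from this article: it assumes the `laion_dataset` table and the `encode_text` function defined earlier and searches images by a text query:

```sql
WITH encode_text('a small kitten sleeping on a sofa') AS target
SELECT url, caption
FROM laion_dataset
ORDER BY L2Distance(image_embedding, target)
LIMIT 8;
```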
|
||||||
|
|
||||||
|
### Image embeddings
|
||||||
|
|
||||||
|
For pictures, the process is similar, but you send the path instead of the picture itself (if necessary, you can implement downloading and processing of the picture, but it will take longer).
|
||||||
|
|
||||||
|
`encode_picture.py`
|
||||||
|
```python
#!/usr/bin/python3
import clip
import torch
import numpy as np
from PIL import Image
import sys

if __name__ == '__main__':
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)
    for text in sys.stdin:
        image = preprocess(Image.open(text.strip())).unsqueeze(0).to(device)
        with torch.no_grad():
            image_features = model.encode_image(image)[0].tolist()
        print(image_features)
        sys.stdout.flush()
```
|
||||||
|
|
||||||
|
`encode_picture_function.xml`
|
||||||
|
```xml
|
||||||
|
<functions>
|
||||||
|
<function>
|
||||||
|
<type>executable_pool</type>
|
||||||
|
<name>encode_picture</name>
|
||||||
|
<return_type>Array(Float32)</return_type>
|
||||||
|
<argument>
|
||||||
|
<type>String</type>
|
||||||
|
<name>path</name>
|
||||||
|
</argument>
|
||||||
|
<format>TabSeparated</format>
|
||||||
|
<command>encode_picture.py</command>
|
||||||
|
<command_read_timeout>1000000</command_read_timeout>
|
||||||
|
</function>
|
||||||
|
</functions>
|
||||||
|
```
|
||||||
|
|
||||||
|
The query:
|
||||||
|
```sql
|
||||||
|
SELECT encode_picture('some/path/to/your/picture');
|
||||||
|
```
|
@ -119,9 +119,9 @@ We use [CSVWithNames](../../interfaces/formats.md#csvwithnames) format as the da

We disable `format_csv_allow_single_quotes` as only double quotes are used for data fields and single quotes can appear inside the values and should not confuse the CSV parser.

We disable [input_format_null_as_default](../../operations/settings/settings-formats.md#settings-input-format-null-as-default) as our data does not have [NULL](../../sql-reference/syntax.md#null-literal). Otherwise ClickHouse will try to parse `\N` sequences and can be confused with `\` in data.

The setting [date_time_input_format best_effort](../../operations/settings/settings-formats.md#settings-date_time_input_format) allows parsing [DateTime](../../sql-reference/data-types/datetime.md) fields in a wide variety of formats. For example, ISO-8601 without seconds like '2000-01-01 01:02' will be recognized. Without this setting only the fixed DateTime format is allowed.
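Put together, a load using these settings could look roughly like the sketch below; the table name `menu` and the file name are placeholders rather than the exact commands of this guide:

```sql
SET format_csv_allow_single_quotes = 0;
SET input_format_null_as_default = 0;
SET date_time_input_format = 'best_effort';

INSERT INTO menu FROM INFILE 'Menu.csv' FORMAT CSVWithNames;
```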
## Denormalize the Data {#denormalize-data}

@ -60,7 +60,7 @@ ls -1 flightlist_*.csv.gz | xargs -P100 -I{} bash -c 'gzip -c -d "{}" | clickhou
|
|||||||
`xargs -P100` specifies to use up to 100 parallel workers but as we only have 30 files, the number of workers will be only 30.
|
`xargs -P100` specifies to use up to 100 parallel workers but as we only have 30 files, the number of workers will be only 30.
|
||||||
- For every file, `xargs` will run a script with `bash -c`. The script has substitution in form of `{}` and the `xargs` command will substitute the filename to it (we have asked it for `xargs` with `-I{}`).
|
- For every file, `xargs` will run a script with `bash -c`. The script has substitution in form of `{}` and the `xargs` command will substitute the filename to it (we have asked it for `xargs` with `-I{}`).
|
||||||
- The script will decompress the file (`gzip -c -d "{}"`) to standard output (`-c` parameter) and the output is redirected to `clickhouse-client`.
|
- The script will decompress the file (`gzip -c -d "{}"`) to standard output (`-c` parameter) and the output is redirected to `clickhouse-client`.
|
||||||
- We also asked to parse [DateTime](../../sql-reference/data-types/datetime.md) fields with extended parser ([--date_time_input_format best_effort](../../operations/settings/settings.md#settings-date_time_input_format)) to recognize ISO-8601 format with timezone offsets.
|
- We also asked to parse [DateTime](../../sql-reference/data-types/datetime.md) fields with extended parser ([--date_time_input_format best_effort](../../operations/settings/settings-formats.md#settings-date_time_input_format)) to recognize ISO-8601 format with timezone offsets.
|
||||||
|
|
||||||
Finally, `clickhouse-client` will do insertion. It will read input data in [CSVWithNames](../../interfaces/formats.md#csvwithnames) format.
|
Finally, `clickhouse-client` will do insertion. It will read input data in [CSVWithNames](../../interfaces/formats.md#csvwithnames) format.
|
||||||
|
|
||||||
|
@ -22,6 +22,7 @@ functions in ClickHouse. The sample datasets include:
|
|||||||
- The [Cell Towers dataset](../getting-started/example-datasets/cell-towers.md) imports a CSV into ClickHouse
|
- The [Cell Towers dataset](../getting-started/example-datasets/cell-towers.md) imports a CSV into ClickHouse
|
||||||
- The [NYPD Complaint Data](../getting-started/example-datasets/nypd_complaint_data.md) demonstrates how to use data inference to simplify creating tables
|
- The [NYPD Complaint Data](../getting-started/example-datasets/nypd_complaint_data.md) demonstrates how to use data inference to simplify creating tables
|
||||||
- The ["What's on the Menu?" dataset](../getting-started/example-datasets/menus.md) has an example of denormalizing data
|
- The ["What's on the Menu?" dataset](../getting-started/example-datasets/menus.md) has an example of denormalizing data
|
||||||
|
- The [Laion dataset](../getting-started/example-datasets/laion.md) has an example of [Approximate nearest neighbor search indexes](../engines/table-engines/mergetree-family/annindexes.md) usage
|
||||||
- [Getting Data Into ClickHouse - Part 1](https://clickhouse.com/blog/getting-data-into-clickhouse-part-1) provides examples of defining a schema and loading a small Hacker News dataset
|
- [Getting Data Into ClickHouse - Part 1](https://clickhouse.com/blog/getting-data-into-clickhouse-part-1) provides examples of defining a schema and loading a small Hacker News dataset
|
||||||
- [Getting Data Into ClickHouse - Part 3 - Using S3](https://clickhouse.com/blog/getting-data-into-clickhouse-part-3-s3) has examples of loading data from s3
|
- [Getting Data Into ClickHouse - Part 3 - Using S3](https://clickhouse.com/blog/getting-data-into-clickhouse-part-3-s3) has examples of loading data from s3
|
||||||
- [Generating random data in ClickHouse](https://clickhouse.com/blog/generating-random-test-distribution-data-for-clickhouse) shows how to generate random data if none of the above fit your needs.
|
- [Generating random data in ClickHouse](https://clickhouse.com/blog/generating-random-test-distribution-data-for-clickhouse) shows how to generate random data if none of the above fit your needs.
|
||||||
|
@ -83,9 +83,10 @@ The supported formats are:
|
|||||||
| [RawBLOB](#rawblob) | ✔ | ✔ |
|
| [RawBLOB](#rawblob) | ✔ | ✔ |
|
||||||
| [MsgPack](#msgpack) | ✔ | ✔ |
|
| [MsgPack](#msgpack) | ✔ | ✔ |
|
||||||
| [MySQLDump](#mysqldump) | ✔ | ✗ |
|
| [MySQLDump](#mysqldump) | ✔ | ✗ |
|
||||||
|
| [Markdown](#markdown) | ✗ | ✔ |
|
||||||
|
|
||||||
|
|
||||||
You can control some format processing parameters with the ClickHouse settings. For more information read the [Settings](/docs/en/operations/settings/settings.md) section.
|
You can control some format processing parameters with the ClickHouse settings. For more information read the [Settings](/docs/en/operations/settings/settings-formats.md) section.
|
||||||
|
|
||||||
## TabSeparated {#tabseparated}
|
## TabSeparated {#tabseparated}
|
||||||
|
|
||||||
@ -148,10 +149,10 @@ Only a small set of symbols are escaped. You can easily stumble onto a string va
|
|||||||
|
|
||||||
Arrays are written as a list of comma-separated values in square brackets. Numeric items in the array are formatted as usual. `Date` and `DateTime` types are written in single quotes. Strings are written in single quotes with the same escaping rules as above.
|
|
||||||
[NULL](/docs/en/sql-reference/syntax.md) is formatted according to setting [format_tsv_null_representation](/docs/en/operations/settings/settings.md/#format_tsv_null_representation) (default value is `\N`).
|
[NULL](/docs/en/sql-reference/syntax.md) is formatted according to setting [format_tsv_null_representation](/docs/en/operations/settings/settings-formats.md/#format_tsv_null_representation) (default value is `\N`).
|
||||||
|
|
||||||
In input data, ENUM values can be represented as names or as ids. First, we try to match the input value to the ENUM name. If we fail and the input value is a number, we try to match this number to ENUM id.
|
In input data, ENUM values can be represented as names or as ids. First, we try to match the input value to the ENUM name. If we fail and the input value is a number, we try to match this number to ENUM id.
|
||||||
If input data contains only ENUM ids, it's recommended to enable the setting [input_format_tsv_enum_as_number](/docs/en/operations/settings/settings.md/#input_format_tsv_enum_as_number) to optimize ENUM parsing.
|
If input data contains only ENUM ids, it's recommended to enable the setting [input_format_tsv_enum_as_number](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_enum_as_number) to optimize ENUM parsing.
|
||||||
|
|
||||||
Each element of [Nested](/docs/en/sql-reference/data-types/nested-data-structures/nested.md) structures is represented as an array.
|
Each element of [Nested](/docs/en/sql-reference/data-types/nested-data-structures/nested.md) structures is represented as an array.
|
||||||
|
|
||||||
@ -183,12 +184,13 @@ SELECT * FROM nestedt FORMAT TSV
|
|||||||
|
|
||||||
### TabSeparated format settings {#tabseparated-format-settings}
|
### TabSeparated format settings {#tabseparated-format-settings}
|
||||||
|
|
||||||
- [format_tsv_null_representation](/docs/en/operations/settings/settings.md/#format_tsv_null_representation) - custom NULL representation in TSV format. Default value - `\N`.
|
- [format_tsv_null_representation](/docs/en/operations/settings/settings-formats.md/#format_tsv_null_representation) - custom NULL representation in TSV format. Default value - `\N`.
|
||||||
- [input_format_tsv_empty_as_default](/docs/en/operations/settings/settings.md/#input_format_tsv_empty_as_default) - treat empty fields in TSV input as default values. Default value - `false`. For complex default expressions [input_format_defaults_for_omitted_fields](/docs/en/operations/settings/settings.md/#input_format_defaults_for_omitted_fields) must be enabled too.
|
- [input_format_tsv_empty_as_default](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_empty_as_default) - treat empty fields in TSV input as default values. Default value - `false`. For complex default expressions [input_format_defaults_for_omitted_fields](/docs/en/operations/settings/settings-formats.md/#input_format_defaults_for_omitted_fields) must be enabled too.
|
||||||
- [input_format_tsv_enum_as_number](/docs/en/operations/settings/settings.md/#input_format_tsv_enum_as_number) - treat inserted enum values in TSV formats as enum indices. Default value - `false`.
|
- [input_format_tsv_enum_as_number](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_enum_as_number) - treat inserted enum values in TSV formats as enum indices. Default value - `false`.
|
||||||
- [input_format_tsv_use_best_effort_in_schema_inference](/docs/en/operations/settings/settings.md/#input_format_tsv_use_best_effort_in_schema_inference) - use some tweaks and heuristics to infer schema in TSV format. If disabled, all fields will be inferred as Strings. Default value - `true`.
|
- [input_format_tsv_use_best_effort_in_schema_inference](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_use_best_effort_in_schema_inference) - use some tweaks and heuristics to infer schema in TSV format. If disabled, all fields will be inferred as Strings. Default value - `true`.
|
||||||
- [output_format_tsv_crlf_end_of_line](/docs/en/operations/settings/settings.md/#output_format_tsv_crlf_end_of_line) - if it is set true, end of line in TSV output format will be `\r\n` instead of `\n`. Default value - `false`.
|
- [output_format_tsv_crlf_end_of_line](/docs/en/operations/settings/settings-formats.md/#output_format_tsv_crlf_end_of_line) - if it is set true, end of line in TSV output format will be `\r\n` instead of `\n`. Default value - `false`.
|
||||||
- [input_format_tsv_skip_first_lines](/docs/en/operations/settings/settings.md/#input_format_tsv_skip_first_lines) - skip specified number of lines at the beginning of data. Default value - `0`.
|
- [input_format_tsv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_skip_first_lines) - skip specified number of lines at the beginning of data. Default value - `0`.
|
||||||
|
- [input_format_tsv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_detect_header) - automatically detect header with names and types in TSV format. Default value - `true`.
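These settings can be combined per query. For example, the sketch below reads a TSV file that has two junk lines at the top and spells NULL as the literal string `NULL` (the file name and structure are placeholders):

```sql
SELECT *
FROM file('data.tsv', 'TabSeparated', 'id UInt32, name Nullable(String)')
SETTINGS input_format_tsv_skip_first_lines = 2,
    format_tsv_null_representation = 'NULL';
```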
|
||||||
|
|
||||||
## TabSeparatedRaw {#tabseparatedraw}
|
## TabSeparatedRaw {#tabseparatedraw}
|
||||||
|
|
||||||
@ -204,8 +206,8 @@ Differs from the `TabSeparated` format in that the column names are written in t
|
|||||||
During parsing, the first row is expected to contain the column names. You can use column names to determine their position and to check their correctness.

:::warning
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from the input data will be mapped to the columns of the table by their names; columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
:::

@@ -216,10 +218,10 @@ This format is also available under the name `TSVWithNames`.
Differs from the `TabSeparated` format in that the column names are written to the first row, while the column types are in the second row.

:::warning
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from the input data will be mapped to the columns in the table by their names; columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
If setting [input_format_with_types_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_types_use_header) is set to 1,
the types from input data will be compared with the types of the corresponding columns from the table. Otherwise, the second row will be skipped.
:::

@@ -427,49 +429,50 @@ Both data output and parsing are supported in this format. For parsing, any orde

Parsing allows the presence of the additional field `tskv` without the equal sign or a value. This field is ignored.

During import, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.

## CSV {#csv}

Comma Separated Values format ([RFC](https://tools.ietf.org/html/rfc4180)).

When formatting, rows are enclosed in double quotes. A double quote inside a string is output as two double quotes in a row. There are no other rules for escaping characters. Date and date-time are enclosed in double quotes. Numbers are output without quotes. Values are separated by a delimiter character, which is `,` by default. The delimiter character is defined in the setting [format_csv_delimiter](/docs/en/operations/settings/settings-formats.md/#format_csv_delimiter). Rows are separated using the Unix line feed (LF). Arrays are serialized in CSV as follows: first, the array is serialized to a string as in TabSeparated format, and then the resulting string is output to CSV in double quotes. Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost).

``` bash
$ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMAT CSV" < data.csv
```

\*By default, the delimiter is `,`. See the [format_csv_delimiter](/docs/en/operations/settings/settings-formats.md/#format_csv_delimiter) setting for more information.

When parsing, all values can be parsed either with or without quotes. Both double and single quotes are supported. Rows can also be arranged without quotes. In this case, they are parsed up to the delimiter character or line feed (CR or LF). In violation of the RFC, when parsing rows without quotes, the leading and trailing spaces and tabs are ignored. For the line feed, Unix (LF), Windows (CR LF) and Mac OS Classic (CR LF) types are all supported.

`NULL` is formatted according to setting [format_csv_null_representation](/docs/en/operations/settings/settings-formats.md/#format_csv_null_representation) (default value is `\N`).

In input data, ENUM values can be represented as names or as ids. First, we try to match the input value to the ENUM name. If we fail and the input value is a number, we try to match this number to the ENUM id.
If input data contains only ENUM ids, it's recommended to enable the setting [input_format_csv_enum_as_number](/docs/en/operations/settings/settings-formats.md/#input_format_csv_enum_as_number) to optimize ENUM parsing.
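
As a sketch, assuming a hypothetical table `test.csv_enum` with a single `Enum('first' = 1, 'second' = 2)` column, input that contains only ids can be loaded like this:

``` bash
# The values 2 and 1 are treated as enum indices, not as enum names.
$ printf '2\n1\n' | clickhouse-client --input_format_csv_enum_as_number=1 \
    --query="INSERT INTO test.csv_enum FORMAT CSV"
```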

The CSV format supports the output of totals and extremes the same way as `TabSeparated`.

### CSV format settings {#csv-format-settings}

- [format_csv_delimiter](/docs/en/operations/settings/settings-formats.md/#format_csv_delimiter) - the character to be considered as a delimiter in CSV data. Default value - `,`.
- [format_csv_allow_single_quotes](/docs/en/operations/settings/settings-formats.md/#format_csv_allow_single_quotes) - allow strings in single quotes. Default value - `true`.
- [format_csv_allow_double_quotes](/docs/en/operations/settings/settings-formats.md/#format_csv_allow_double_quotes) - allow strings in double quotes. Default value - `true`.
- [format_csv_null_representation](/docs/en/operations/settings/settings-formats.md/#format_csv_null_representation) - custom NULL representation in CSV format. Default value - `\N`.
- [input_format_csv_empty_as_default](/docs/en/operations/settings/settings-formats.md/#input_format_csv_empty_as_default) - treat empty fields in CSV input as default values. Default value - `true`. For complex default expressions, [input_format_defaults_for_omitted_fields](/docs/en/operations/settings/settings-formats.md/#input_format_defaults_for_omitted_fields) must be enabled too.
- [input_format_csv_enum_as_number](/docs/en/operations/settings/settings-formats.md/#input_format_csv_enum_as_number) - treat inserted enum values in CSV formats as enum indices. Default value - `false`.
- [input_format_csv_use_best_effort_in_schema_inference](/docs/en/operations/settings/settings-formats.md/#input_format_csv_use_best_effort_in_schema_inference) - use some tweaks and heuristics to infer schema in CSV format. If disabled, all fields will be inferred as Strings. Default value - `true`.
- [input_format_csv_arrays_as_nested_csv](/docs/en/operations/settings/settings-formats.md/#input_format_csv_arrays_as_nested_csv) - when reading Array from CSV, expect that its elements were serialized in nested CSV and then put into a string. Default value - `false`.
- [output_format_csv_crlf_end_of_line](/docs/en/operations/settings/settings-formats.md/#output_format_csv_crlf_end_of_line) - if it is set to true, the end of line in CSV output format will be `\r\n` instead of `\n`. Default value - `false`.
- [input_format_csv_skip_first_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_first_lines) - skip the specified number of lines at the beginning of data. Default value - `0`.
- [input_format_csv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_csv_detect_header) - automatically detect header with names and types in CSV format. Default value - `true`.
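
For example, a sketch of exporting CSV with a non-default delimiter and Windows-style line endings (the output file `out.csv` is just an illustration):

``` bash
# Use ';' between fields and '\r\n' at the end of each row.
$ clickhouse-client --format_csv_delimiter=';' --output_format_csv_crlf_end_of_line=1 \
    --query="SELECT number, toString(number) FROM system.numbers LIMIT 3 FORMAT CSV" > out.csv
```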

## CSVWithNames {#csvwithnames}

Also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).

:::warning
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names; columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
:::

@@ -478,16 +481,18 @@ Otherwise, the first row will be skipped.
Also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).

:::warning
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names; columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
If setting [input_format_with_types_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_types_use_header) is set to 1,
the types from input data will be compared with the types of the corresponding columns from the table. Otherwise, the second row will be skipped.
:::

## CustomSeparated {#format-customseparated}

Similar to [Template](#format-template), but it prints or reads all names and types of columns and uses the escaping rule from the [format_custom_escaping_rule](/docs/en/operations/settings/settings-formats.md/#format_custom_escaping_rule) setting and delimiters from the [format_custom_field_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_field_delimiter), [format_custom_row_before_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_before_delimiter), [format_custom_row_after_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_after_delimiter), [format_custom_row_between_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_row_between_delimiter), [format_custom_result_before_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_result_before_delimiter) and [format_custom_result_after_delimiter](/docs/en/operations/settings/settings-formats.md/#format_custom_result_after_delimiter) settings, not from format strings.

If setting [input_format_custom_detect_header](/docs/en/operations/settings/settings.md/#input_format_custom_detect_header) is enabled, ClickHouse will automatically detect the header with names and types if any.

There is also the `CustomSeparatedIgnoreSpaces` format, which is similar to [TemplateIgnoreSpaces](#templateignorespaces).

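As an illustrative sketch, the delimiters and the escaping rule can be set per query; the values below are arbitrary:

``` bash
# Each row is printed as [field;field] with CSV escaping, rows are separated by ",\n".
$ clickhouse-client --format_custom_escaping_rule='CSV' \
    --format_custom_field_delimiter=';' \
    --format_custom_row_before_delimiter='[' \
    --format_custom_row_after_delimiter=']' \
    --format_custom_row_between_delimiter=$',\n' \
    --query="SELECT number, toString(number) FROM system.numbers LIMIT 2 FORMAT CustomSeparated"
```
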
@@ -496,8 +501,8 @@ There is also `CustomSeparatedIgnoreSpaces` format, which is similar to [Templat
Also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).

:::warning
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names; columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
:::

@@ -506,10 +511,10 @@ Otherwise, the first row will be skipped.
Also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).

:::warning
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names; columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
If setting [input_format_with_types_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_types_use_header) is set to 1,
the types from input data will be compared with the types of the corresponding columns from the table. Otherwise, the second row will be skipped.
:::

@@ -535,11 +540,11 @@ To read data output by this format you can use [MySQLDump](#mysqldump) input for

### SQLInsert format settings {#sqlinsert-format-settings}

- [output_format_sql_insert_max_batch_size](/docs/en/operations/settings/settings-formats.md/#output_format_sql_insert_max_batch_size) - The maximum number of rows in one INSERT statement. Default value - `65505`.
- [output_format_sql_insert_table_name](/docs/en/operations/settings/settings-formats.md/#output_format_sql_insert_table_name) - The name of the table in the output INSERT query. Default value - `'table'`.
- [output_format_sql_insert_include_column_names](/docs/en/operations/settings/settings-formats.md/#output_format_sql_insert_include_column_names) - Include column names in INSERT query. Default value - `true`.
- [output_format_sql_insert_use_replace](/docs/en/operations/settings/settings-formats.md/#output_format_sql_insert_use_replace) - Use REPLACE statement instead of INSERT. Default value - `false`.
- [output_format_sql_insert_quote_names](/docs/en/operations/settings/settings-formats.md/#output_format_sql_insert_quote_names) - Quote column names with "\`" characters. Default value - `true`.
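
As a sketch, the output below is split into INSERT statements of at most two rows each, targeting a hypothetical table name `dst`:

``` bash
$ clickhouse-client --output_format_sql_insert_table_name='dst' \
    --output_format_sql_insert_max_batch_size=2 \
    --query="SELECT number AS x, toString(number) AS s FROM system.numbers LIMIT 4 FORMAT SQLInsert"
```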

## JSON {#json}

@@ -599,7 +604,7 @@ SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTA
}
```

The JSON is compatible with JavaScript. To ensure this, some characters are additionally escaped: the slash `/` is escaped as `\/`; alternative line breaks `U+2028` and `U+2029`, which break some browsers, are escaped as `\uXXXX`. ASCII control characters are escaped: backspace, form feed, line feed, carriage return, and horizontal tab are replaced with `\b`, `\f`, `\n`, `\r`, `\t`, as well as the remaining bytes in the 00-1F range using `\uXXXX` sequences. Invalid UTF-8 sequences are changed to the replacement character � so the output text will consist of valid UTF-8 sequences. For compatibility with JavaScript, Int64 and UInt64 integers are enclosed in double quotes by default. To remove the quotes, you can set the configuration parameter [output_format_json_quote_64bit_integers](/docs/en/operations/settings/settings-formats.md/#output_format_json_quote_64bit_integers) to 0.
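
A quick sketch of the effect of this parameter (the default keeps the quotes):

``` bash
# With the setting at 0 the UInt64 value is emitted as a number, e.g. {"x":1}; with the default of 1 it would be {"x":"1"}.
$ clickhouse-client --output_format_json_quote_64bit_integers=0 \
    --query="SELECT toUInt64(1) AS x FORMAT JSONEachRow"
```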

`rows` – The total number of output rows.

@@ -610,14 +615,14 @@ If the query contains GROUP BY, rows_before_limit_at_least is the exact number o

`extremes` – Extreme values (when extremes are set to 1).

ClickHouse supports [NULL](/docs/en/sql-reference/syntax.md), which is displayed as `null` in the JSON output. To enable `+nan`, `-nan`, `+inf`, `-inf` values in output, set the [output_format_json_quote_denormals](/docs/en/operations/settings/settings-formats.md/#output_format_json_quote_denormals) setting to 1.

**See Also**

- [JSONEachRow](#jsoneachrow) format
- [output_format_json_array_of_rows](/docs/en/operations/settings/settings-formats.md/#output_format_json_array_of_rows) setting

For JSON input format, if setting [input_format_json_validate_types_from_metadata](/docs/en/operations/settings/settings-formats.md/#input_format_json_validate_types_from_metadata) is set to 1,
the types from metadata in input data will be compared with the types of the corresponding columns from the table.

## JSONStrings {#jsonstrings}

@@ -690,8 +695,8 @@ Example:
}
```

During import, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Columns that are not present in the block will be filled with default values (you can use the [input_format_defaults_for_omitted_fields](/docs/en/operations/settings/settings-formats.md/#input_format_defaults_for_omitted_fields) setting here).

## JSONColumnsWithMetadata {#jsoncolumnsmonoblock}

@@ -739,7 +744,7 @@ Example:
}
```

For JSONColumnsWithMetadata input format, if setting [input_format_json_validate_types_from_metadata](/docs/en/operations/settings/settings-formats.md/#input_format_json_validate_types_from_metadata) is set to 1,
the types from metadata in input data will be compared with the types of the corresponding columns from the table.

## JSONAsString {#jsonasstring}

@@ -891,7 +896,7 @@ Example:
]
```

Columns that are not present in the block will be filled with default values (you can use the [input_format_defaults_for_omitted_fields](/docs/en/operations/settings/settings-formats.md/#input_format_defaults_for_omitted_fields) setting here).

## JSONEachRow {#jsoneachrow}

@@ -905,7 +910,7 @@ Example:
{"num":44,"str":"hello","arr":[0,1,2,3]}
```

While importing data, columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.

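For example, a sketch of streaming such rows into a hypothetical table `test.events` with columns `num UInt32, str String, arr Array(UInt8)`:

``` bash
$ echo '{"num":44,"str":"hello","arr":[0,1,2,3]}' | \
    clickhouse-client --query="INSERT INTO test.events FORMAT JSONEachRow"
```
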
## JSONStringsEachRow {#jsonstringseachrow}

@@ -960,8 +965,8 @@ Differs from `JSONEachRow`/`JSONStringsEachRow` in that ClickHouse will also yie
Differs from `JSONCompactEachRow` format in that it also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).

:::warning
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names; columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
:::

@@ -970,10 +975,10 @@ Otherwise, the first row will be skipped.
Differs from `JSONCompactEachRow` format in that it also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).

:::warning
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names; columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
If setting [input_format_with_types_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_types_use_header) is set to 1,
the types from input data will be compared with the types of the corresponding columns from the table. Otherwise, the second row will be skipped.
:::

@@ -982,8 +987,8 @@ the types from input data will be compared with the types of the corresponding c
Differs from `JSONCompactStringsEachRow` in that it also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames).

:::warning
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names; columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
:::

@@ -992,10 +997,10 @@ Otherwise, the first row will be skipped.
Differs from `JSONCompactStringsEachRow` in that it also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes).

:::warning
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names; columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
If setting [input_format_with_types_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_types_use_header) is set to 1,
the types from input data will be compared with the types of the corresponding columns from the table. Otherwise, the second row will be skipped.
:::

@@ -1021,7 +1026,7 @@ Example:
}
```

To use an object name as a column value, you can use the special setting [format_json_object_each_row_column_for_object_name](/docs/en/operations/settings/settings-formats.md/#format_json_object_each_row_column_for_object_name). The value of this setting is the name of the column that is used as the JSON key for each row in the resulting object.
Examples:

For output:

@@ -1095,7 +1100,7 @@ ClickHouse ignores spaces between elements and commas after the objects. You can

ClickHouse substitutes omitted values with the default values for the corresponding [data types](/docs/en/sql-reference/data-types/index.md).

If `DEFAULT expr` is specified, ClickHouse uses different substitution rules depending on the [input_format_defaults_for_omitted_fields](/docs/en/operations/settings/settings-formats.md/#input_format_defaults_for_omitted_fields) setting.

Consider the following table:

@@ -1140,7 +1145,7 @@ Any set of bytes can be output in the strings. Use the `JSONEachRow` format if y

### Usage of Nested Structures {#jsoneachrow-nested}

If you have a table with [Nested](/docs/en/sql-reference/data-types/nested-data-structures/nested.md) data type columns, you can insert JSON data with the same structure. Enable this feature with the [input_format_import_nested_json](/docs/en/operations/settings/settings-formats.md/#input_format_import_nested_json) setting.

For example, consider the following table:

@@ -1154,7 +1159,7 @@ As you can see in the `Nested` data type description, ClickHouse treats each com
INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n.s": ["abc", "def"], "n.i": [1, 23]}
```

To insert data as a hierarchical JSON object, set [input_format_import_nested_json=1](/docs/en/operations/settings/settings-formats.md/#input_format_import_nested_json).

``` json
{
@@ -1199,20 +1204,20 @@ SELECT * FROM json_each_row_nested

### JSON formats settings {#json-formats-settings}

- [input_format_import_nested_json](/docs/en/operations/settings/settings-formats.md/#input_format_import_nested_json) - map nested JSON data to nested tables (it works for JSONEachRow format). Default value - `false`.
- [input_format_json_read_bools_as_numbers](/docs/en/operations/settings/settings-formats.md/#input_format_json_read_bools_as_numbers) - allow parsing bools as numbers in JSON input formats. Default value - `true`.
- [input_format_json_read_numbers_as_strings](/docs/en/operations/settings/settings-formats.md/#input_format_json_read_numbers_as_strings) - allow parsing numbers as strings in JSON input formats. Default value - `false`.
- [input_format_json_read_objects_as_strings](/docs/en/operations/settings/settings-formats.md/#input_format_json_read_objects_as_strings) - allow parsing JSON objects as strings in JSON input formats. Default value - `false`.
- [input_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings-formats.md/#input_format_json_named_tuples_as_objects) - parse named tuple columns as JSON objects. Default value - `true`.
- [input_format_json_defaults_for_missing_elements_in_named_tuple](/docs/en/operations/settings/settings-formats.md/#input_format_json_defaults_for_missing_elements_in_named_tuple) - insert default values for missing elements in JSON object while parsing named tuple. Default value - `true`.
- [output_format_json_quote_64bit_integers](/docs/en/operations/settings/settings-formats.md/#output_format_json_quote_64bit_integers) - controls quoting of 64-bit integers in JSON output format. Default value - `true`.
- [output_format_json_quote_64bit_floats](/docs/en/operations/settings/settings-formats.md/#output_format_json_quote_64bit_floats) - controls quoting of 64-bit floats in JSON output format. Default value - `false`.
- [output_format_json_quote_denormals](/docs/en/operations/settings/settings-formats.md/#output_format_json_quote_denormals) - enables '+nan', '-nan', '+inf', '-inf' outputs in JSON output format. Default value - `false`.
- [output_format_json_quote_decimals](/docs/en/operations/settings/settings-formats.md/#output_format_json_quote_decimals) - controls quoting of decimals in JSON output format. Default value - `false`.
- [output_format_json_escape_forward_slashes](/docs/en/operations/settings/settings-formats.md/#output_format_json_escape_forward_slashes) - controls escaping forward slashes for string outputs in JSON output format. Default value - `true`.
- [output_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings-formats.md/#output_format_json_named_tuples_as_objects) - serialize named tuple columns as JSON objects. Default value - `true`.
- [output_format_json_array_of_rows](/docs/en/operations/settings/settings-formats.md/#output_format_json_array_of_rows) - output a JSON array of all rows in JSONEachRow(Compact) format. Default value - `false`.
- [output_format_json_validate_utf8](/docs/en/operations/settings/settings-formats.md/#output_format_json_validate_utf8) - enables validation of UTF-8 sequences in JSON output formats (note that it doesn't impact formats JSON/JSONCompact/JSONColumnsWithMetadata, they always validate utf8). Default value - `false`.
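
As a small sketch of one of these settings, a JSON number can be accepted into a String column of a hypothetical table `test.json_demo` when `input_format_json_read_numbers_as_strings` is enabled:

``` bash
$ echo '{"s": 123}' | clickhouse-client --input_format_json_read_numbers_as_strings=1 \
    --query="INSERT INTO test.json_demo FORMAT JSONEachRow"
```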

## BSONEachRow {#bsoneachrow}

@@ -1274,8 +1279,8 @@ Note: this format don't work properly on Big-Endian platforms.

### BSON format settings {#bson-format-settings}

- [output_format_bson_string_as_string](/docs/en/operations/settings/settings-formats.md/#output_format_bson_string_as_string) - use BSON String type instead of Binary for String columns. Default value - `false`.
- [input_format_bson_skip_fields_with_unsupported_types_in_schema_inference](/docs/en/operations/settings/settings-formats.md/#input_format_bson_skip_fields_with_unsupported_types_in_schema_inference) - allow skipping columns with unsupported types during schema inference for format BSONEachRow. Default value - `false`.

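For example, a sketch of exporting a query result with String columns stored as BSON strings (the file `out.bson` is just an illustration):

``` bash
$ clickhouse-client --output_format_bson_string_as_string=1 \
    --query="SELECT number, toString(number) AS s FROM system.numbers LIMIT 3 FORMAT BSONEachRow" > out.bson
```
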
## Native {#native}

@@ -1408,12 +1413,12 @@ Differs from [PrettySpaceNoEscapes](#prettyspacenoescapes) in that up to 10,000

## Pretty formats settings {#pretty-formats-settings}

- [output_format_pretty_max_rows](/docs/en/operations/settings/settings-formats.md/#output_format_pretty_max_rows) - rows limit for Pretty formats. Default value - `10000`.
- [output_format_pretty_max_column_pad_width](/docs/en/operations/settings/settings-formats.md/#output_format_pretty_max_column_pad_width) - maximum width to pad all values in a column in Pretty formats. Default value - `250`.
- [output_format_pretty_max_value_width](/docs/en/operations/settings/settings-formats.md/#output_format_pretty_max_value_width) - Maximum width of value to display in Pretty formats. If greater, it will be cut. Default value - `10000`.
- [output_format_pretty_color](/docs/en/operations/settings/settings-formats.md/#output_format_pretty_color) - use ANSI escape sequences to paint colors in Pretty formats. Default value - `true`.
- [output_format_pretty_grid_charset](/docs/en/operations/settings/settings-formats.md/#output_format_pretty_grid_charset) - Charset for printing grid borders. Available charsets: ASCII, UTF-8. Default value - `UTF-8`.
- [output_format_pretty_row_numbers](/docs/en/operations/settings/settings-formats.md/#output_format_pretty_row_numbers) - Add row numbers before each row for pretty output format. Default value - `false`.

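A short sketch combining two of these settings:

``` bash
# Print row numbers and show at most 5 rows of the result.
$ clickhouse-client --output_format_pretty_row_numbers=1 --output_format_pretty_max_rows=5 \
    --query="SELECT number FROM system.numbers LIMIT 10 FORMAT PrettyCompact"
```
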
## RowBinary {#rowbinary}

@@ -1438,8 +1443,8 @@ Similar to [RowBinary](#rowbinary), but with added header:
- N `String`s specifying column names

:::warning
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names; columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
:::

@@ -1452,16 +1457,16 @@ Similar to [RowBinary](#rowbinary), but with added header:
- N `String`s specifying column types

:::warning
If setting [input_format_with_names_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_names_use_header) is set to 1,
the columns from input data will be mapped to the columns from the table by their names; columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
Otherwise, the first row will be skipped.
If setting [input_format_with_types_use_header](/docs/en/operations/settings/settings-formats.md/#input_format_with_types_use_header) is set to 1,
the types from input data will be compared with the types of the corresponding columns from the table. Otherwise, the second row will be skipped.
:::

## RowBinary format settings {#row-binary-format-settings}
|
## RowBinary format settings {#row-binary-format-settings}
|
||||||
|
|
||||||
- [format_binary_max_string_size](/docs/en/operations/settings/settings.md/#format_binary_max_string_size) - The maximum allowed size for String in RowBinary format. Default value - `1GiB`.
|
- [format_binary_max_string_size](/docs/en/operations/settings/settings-formats.md/#format_binary_max_string_size) - The maximum allowed size for String in RowBinary format. Default value - `1GiB`.
|
||||||
|
|
||||||
## Values {#data-format-values}
|
## Values {#data-format-values}
|
||||||
|
|
||||||
@ -1473,9 +1478,9 @@ This is the format that is used in `INSERT INTO t VALUES ...`, but you can also
|
|||||||
|
|
||||||
## Values format settings {#values-format-settings}
|
## Values format settings {#values-format-settings}
|
||||||
|
|
||||||
- [input_format_values_interpret_expressions](/docs/en/operations/settings/settings.md/#input_format_values_interpret_expressions) - if the field could not be parsed by streaming parser, run SQL parser and try to interpret it as SQL expression. Default value - `true`.
|
- [input_format_values_interpret_expressions](/docs/en/operations/settings/settings-formats.md/#input_format_values_interpret_expressions) - if the field could not be parsed by streaming parser, run SQL parser and try to interpret it as SQL expression. Default value - `true`.
|
||||||
- [input_format_values_deduce_templates_of_expressions](/docs/en/operations/settings/settings.md/#input_format_values_deduce_templates_of_expressions) - if the field could not be parsed by the streaming parser, run the SQL parser, deduce a template of the SQL expression, try to parse all rows using the template, and then interpret the expression for all rows (see the sketch after this list). Default value - `true`.
|
- [input_format_values_deduce_templates_of_expressions](/docs/en/operations/settings/settings-formats.md/#input_format_values_deduce_templates_of_expressions) - if the field could not be parsed by the streaming parser, run the SQL parser, deduce a template of the SQL expression, try to parse all rows using the template, and then interpret the expression for all rows (see the sketch after this list). Default value - `true`.
|
||||||
- [input_format_values_accurate_types_of_literals](/docs/en/operations/settings/settings.md/#input_format_values_accurate_types_of_literals) - when parsing and interpreting expressions using template, check actual type of literal to avoid possible overflow and precision issues. Default value - `true`.
|
- [input_format_values_accurate_types_of_literals](/docs/en/operations/settings/settings-formats.md/#input_format_values_accurate_types_of_literals) - when parsing and interpreting expressions using template, check actual type of literal to avoid possible overflow and precision issues. Default value - `true`.
|
||||||
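For illustration, a minimal sketch of how these settings come into play; the target table `t` is a hypothetical name, and with `input_format_values_interpret_expressions` enabled, values that are not plain literals are evaluated as SQL expressions:

```sql
SET input_format_values_interpret_expressions = 1,
    input_format_values_deduce_templates_of_expressions = 1;

-- the second value in each row is an expression, not a literal;
-- with template deduction enabled, rows sharing the same expression shape are parsed faster
INSERT INTO t VALUES (1, toDate('2023-01-01') + 1), (2, toDate('2023-01-02') + 1);
```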
|
|
||||||
|
|
||||||
## Vertical {#vertical}
|
## Vertical {#vertical}
|
||||||
@ -1615,7 +1620,7 @@ The table below shows supported data types and how they match ClickHouse [data t
|
|||||||
| `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` |
|
| `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` |
|
||||||
| `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` |
|
| `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` |
|
||||||
|
|
||||||
For working with `Enum` in CapnProto format use the [format_capn_proto_enum_comparising_mode](/docs/en/operations/settings/settings.md/#format_capn_proto_enum_comparising_mode) setting.
|
For working with `Enum` in CapnProto format use the [format_capn_proto_enum_comparising_mode](/docs/en/operations/settings/settings-formats.md/#format_capn_proto_enum_comparising_mode) setting.
|
||||||
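As a hedged sketch only — the `events` table, the `events_schema:Event` schema file, and the choice of the `'by_names'` mode value are illustrative assumptions, not part of the original text:

```sql
-- match ClickHouse Enum values against CapnProto enum values by their names
SET format_capn_proto_enum_comparising_mode = 'by_names';

SELECT * FROM events FORMAT CapnProto SETTINGS format_schema = 'events_schema:Event';
```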
|
|
||||||
Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` type also can be nested.
|
Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` type also can be nested.
|
||||||
|
|
||||||
@ -1809,7 +1814,7 @@ The table below shows supported data types and how they match ClickHouse [data t
|
|||||||
| `long (timestamp-millis)` \** | [DateTime64(3)](/docs/en/sql-reference/data-types/datetime.md) | `long (timestamp-millis)` \* |
|
| `long (timestamp-millis)` \** | [DateTime64(3)](/docs/en/sql-reference/data-types/datetime.md) | `long (timestamp-millis)` \* |
|
||||||
| `long (timestamp-micros)` \** | [DateTime64(6)](/docs/en/sql-reference/data-types/datetime.md) | `long (timestamp-micros)` \* |
|
| `long (timestamp-micros)` \** | [DateTime64(6)](/docs/en/sql-reference/data-types/datetime.md) | `long (timestamp-micros)` \* |
|
||||||
|
|
||||||
\* `bytes` is default, controlled by [output_format_avro_string_column_pattern](/docs/en/operations/settings/settings.md/#output_format_avro_string_column_pattern)
|
\* `bytes` is default, controlled by [output_format_avro_string_column_pattern](/docs/en/operations/settings/settings-formats.md/#output_format_avro_string_column_pattern)
|
||||||
\** [Avro logical types](https://avro.apache.org/docs/current/spec.html#Logical+Types)
|
\** [Avro logical types](https://avro.apache.org/docs/current/spec.html#Logical+Types)
|
||||||
|
|
||||||
Unsupported Avro data types: `record` (non-root), `map`
|
Unsupported Avro data types: `record` (non-root), `map`
|
||||||
@ -1831,7 +1836,7 @@ Unused fields are skipped.
|
|||||||
|
|
||||||
Data types of ClickHouse table columns can differ from the corresponding fields of the Avro data inserted. When inserting data, ClickHouse interprets data types according to the table above and then [casts](/docs/en/sql-reference/functions/type-conversion-functions.md/#type_conversion_function-cast) the data to corresponding column type.
|
Data types of ClickHouse table columns can differ from the corresponding fields of the Avro data inserted. When inserting data, ClickHouse interprets data types according to the table above and then [casts](/docs/en/sql-reference/functions/type-conversion-functions.md/#type_conversion_function-cast) the data to corresponding column type.
|
||||||
|
|
||||||
While importing data, when a field is not found in the schema and the setting [input_format_avro_allow_missing_fields](/docs/en/operations/settings/settings.md/#input_format_avro_allow_missing_fields) is enabled, the default value will be used instead of throwing an error.
|
While importing data, when a field is not found in the schema and the setting [input_format_avro_allow_missing_fields](/docs/en/operations/settings/settings-formats.md/#input_format_avro_allow_missing_fields) is enabled, the default value will be used instead of throwing an error.
|
||||||
|
|
||||||
### Selecting Data {#selecting-data-1}
|
### Selecting Data {#selecting-data-1}
|
||||||
|
|
||||||
@ -1846,7 +1851,7 @@ Column names must:
|
|||||||
- start with `[A-Za-z_]`
|
- start with `[A-Za-z_]`
|
||||||
- subsequently contain only `[A-Za-z0-9_]`
|
- subsequently contain only `[A-Za-z0-9_]`
|
||||||
|
|
||||||
Output Avro file compression and sync interval can be configured with [output_format_avro_codec](/docs/en/operations/settings/settings.md/#output_format_avro_codec) and [output_format_avro_sync_interval](/docs/en/operations/settings/settings.md/#output_format_avro_sync_interval) respectively.
|
Output Avro file compression and sync interval can be configured with [output_format_avro_codec](/docs/en/operations/settings/settings-formats.md/#output_format_avro_codec) and [output_format_avro_sync_interval](/docs/en/operations/settings/settings-formats.md/#output_format_avro_sync_interval) respectively.
|
||||||
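A hedged example of setting both; the table and file names are hypothetical, and `snappy` is used here as one of the codec values the setting accepts:

```sql
-- write snappy-compressed Avro and start a new data block roughly every 32 KiB
SET output_format_avro_codec = 'snappy', output_format_avro_sync_interval = 32768;

SELECT * FROM events INTO OUTFILE 'events.avro' FORMAT Avro;
```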
|
|
||||||
## AvroConfluent {#data-format-avro-confluent}
|
## AvroConfluent {#data-format-avro-confluent}
|
||||||
|
|
||||||
@ -1856,7 +1861,7 @@ Each Avro message embeds a schema id that can be resolved to the actual schema w
|
|||||||
|
|
||||||
Schemas are cached once resolved.
|
Schemas are cached once resolved.
|
||||||
|
|
||||||
Schema Registry URL is configured with [format_avro_schema_registry_url](/docs/en/operations/settings/settings.md/#format_avro_schema_registry_url).
|
Schema Registry URL is configured with [format_avro_schema_registry_url](/docs/en/operations/settings/settings-formats.md/#format_avro_schema_registry_url).
|
||||||
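For illustration only — the registry host and the Kafka engine table `kafka_avro_stream` (assumed to be created with `kafka_format = 'AvroConfluent'`) are assumptions:

```sql
SET format_avro_schema_registry_url = 'http://schema-registry:8081';

SELECT * FROM kafka_avro_stream LIMIT 10;
```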
|
|
||||||
### Data Types Matching {#data_types-matching-1}
|
### Data Types Matching {#data_types-matching-1}
|
||||||
|
|
||||||
@ -1954,12 +1959,12 @@ To exchange data with Hadoop, you can use [HDFS table engine](/docs/en/engines/t
|
|||||||
|
|
||||||
### Parquet format settings {#parquet-format-settings}
|
### Parquet format settings {#parquet-format-settings}
|
||||||
|
|
||||||
- [output_format_parquet_row_group_size](/docs/en/operations/settings/settings.md/#output_format_parquet_row_group_size) - row group size in rows while data output. Default value - `1000000`.
|
- [output_format_parquet_row_group_size](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_row_group_size) - row group size in rows while data output. Default value - `1000000`.
|
||||||
- [output_format_parquet_string_as_string](/docs/en/operations/settings/settings.md/#output_format_parquet_string_as_string) - use Parquet String type instead of Binary for String columns. Default value - `false`.
|
- [output_format_parquet_string_as_string](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_string_as_string) - use Parquet String type instead of Binary for String columns. Default value - `false`.
|
||||||
- [input_format_parquet_import_nested](/docs/en/operations/settings/settings.md/#input_format_parquet_import_nested) - allow inserting array of structs into [Nested](/docs/en/sql-reference/data-types/nested-data-structures/nested.md) table in Parquet input format. Default value - `false`.
|
- [input_format_parquet_import_nested](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_import_nested) - allow inserting array of structs into [Nested](/docs/en/sql-reference/data-types/nested-data-structures/nested.md) table in Parquet input format. Default value - `false`.
|
||||||
- [input_format_parquet_case_insensitive_column_matching](/docs/en/operations/settings/settings.md/#input_format_parquet_case_insensitive_column_matching) - ignore case when matching Parquet columns with ClickHouse columns. Default value - `false`.
|
- [input_format_parquet_case_insensitive_column_matching](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_case_insensitive_column_matching) - ignore case when matching Parquet columns with ClickHouse columns. Default value - `false`.
|
||||||
- [input_format_parquet_allow_missing_columns](/docs/en/operations/settings/settings.md/#input_format_parquet_allow_missing_columns) - allow missing columns while reading Parquet data. Default value - `false`.
|
- [input_format_parquet_allow_missing_columns](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_allow_missing_columns) - allow missing columns while reading Parquet data. Default value - `false`.
|
||||||
- [input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference](/docs/en/operations/settings/settings.md/#input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference) - allow skipping columns with unsupported types while schema inference for Parquet format. Default value - `false`.
|
- [input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference) - allow skipping columns with unsupported types while schema inference for Parquet format. Default value - `false`.
|
||||||
|
|
||||||
## Arrow {#data-format-arrow}
|
## Arrow {#data-format-arrow}
|
||||||
|
|
||||||
@ -1997,7 +2002,7 @@ The table below shows supported data types and how they match ClickHouse [data t
|
|||||||
|
|
||||||
Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` and `Map` types also can be nested.
|
Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` and `Map` types also can be nested.
|
||||||
|
|
||||||
The `DICTIONARY` type is supported for `INSERT` queries, and for `SELECT` queries there is an [output_format_arrow_low_cardinality_as_dictionary](/docs/en/operations/settings/settings.md/#output-format-arrow-low-cardinality-as-dictionary) setting that allows to output [LowCardinality](/docs/en/sql-reference/data-types/lowcardinality.md) type as a `DICTIONARY` type.
|
The `DICTIONARY` type is supported for `INSERT` queries, and for `SELECT` queries there is an [output_format_arrow_low_cardinality_as_dictionary](/docs/en/operations/settings/settings-formats.md/#output-format-arrow-low-cardinality-as-dictionary) setting that allows to output [LowCardinality](/docs/en/sql-reference/data-types/lowcardinality.md) type as a `DICTIONARY` type.
|
||||||
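A brief sketch; the table `lc_table` with a `LowCardinality(String)` column is hypothetical:

```sql
SET output_format_arrow_low_cardinality_as_dictionary = 1;

-- LowCardinality(String) columns are written as Arrow DICTIONARY instead of plain strings
SELECT * FROM lc_table INTO OUTFILE 'data.arrow' FORMAT Arrow;
```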
|
|
||||||
Unsupported Arrow data types: `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`.
|
Unsupported Arrow data types: `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`.
|
||||||
|
|
||||||
@ -2021,12 +2026,12 @@ $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Arrow" > {filenam
|
|||||||
|
|
||||||
### Arrow format settings {#parquet-format-settings}
|
### Arrow format settings {#parquet-format-settings}
|
||||||
|
|
||||||
- [output_format_arrow_low_cardinality_as_dictionary](/docs/en/operations/settings/settings.md/#output_format_arrow_low_cardinality_as_dictionary) - enable output ClickHouse LowCardinality type as Dictionary Arrow type. Default value - `false`.
|
- [output_format_arrow_low_cardinality_as_dictionary](/docs/en/operations/settings/settings-formats.md/#output_format_arrow_low_cardinality_as_dictionary) - enable output ClickHouse LowCardinality type as Dictionary Arrow type. Default value - `false`.
|
||||||
- [output_format_arrow_string_as_string](/docs/en/operations/settings/settings.md/#output_format_arrow_string_as_string) - use Arrow String type instead of Binary for String columns. Default value - `false`.
|
- [output_format_arrow_string_as_string](/docs/en/operations/settings/settings-formats.md/#output_format_arrow_string_as_string) - use Arrow String type instead of Binary for String columns. Default value - `false`.
|
||||||
- [input_format_arrow_import_nested](/docs/en/operations/settings/settings.md/#input_format_arrow_import_nested) - allow inserting array of structs into Nested table in Arrow input format. Default value - `false`.
|
- [input_format_arrow_import_nested](/docs/en/operations/settings/settings-formats.md/#input_format_arrow_import_nested) - allow inserting array of structs into Nested table in Arrow input format. Default value - `false`.
|
||||||
- [input_format_arrow_case_insensitive_column_matching](/docs/en/operations/settings/settings.md/#input_format_arrow_case_insensitive_column_matching) - ignore case when matching Arrow columns with ClickHouse columns. Default value - `false`.
|
- [input_format_arrow_case_insensitive_column_matching](/docs/en/operations/settings/settings-formats.md/#input_format_arrow_case_insensitive_column_matching) - ignore case when matching Arrow columns with ClickHouse columns. Default value - `false`.
|
||||||
- [input_format_arrow_allow_missing_columns](/docs/en/operations/settings/settings.md/#input_format_arrow_allow_missing_columns) - allow missing columns while reading Arrow data. Default value - `false`.
|
- [input_format_arrow_allow_missing_columns](/docs/en/operations/settings/settings-formats.md/#input_format_arrow_allow_missing_columns) - allow missing columns while reading Arrow data. Default value - `false`.
|
||||||
- [input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference](/docs/en/operations/settings/settings.md/#input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference) - allow skipping columns with unsupported types while schema inference for Arrow format. Default value - `false`.
|
- [input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference](/docs/en/operations/settings/settings-formats.md/#input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference) - allow skipping columns with unsupported types while schema inference for Arrow format. Default value - `false`.
|
||||||
|
|
||||||
## ArrowStream {#data-format-arrow-stream}
|
## ArrowStream {#data-format-arrow-stream}
|
||||||
|
|
||||||
@ -2081,11 +2086,11 @@ $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT ORC" > {filename.
|
|||||||
|
|
||||||
### Arrow format settings {#parquet-format-settings}
|
### Arrow format settings {#parquet-format-settings}
|
||||||
|
|
||||||
- [output_format_arrow_string_as_string](/docs/en/operations/settings/settings.md/#output_format_arrow_string_as_string) - use Arrow String type instead of Binary for String columns. Default value - `false`.
|
- [output_format_arrow_string_as_string](/docs/en/operations/settings/settings-formats.md/#output_format_arrow_string_as_string) - use Arrow String type instead of Binary for String columns. Default value - `false`.
|
||||||
- [input_format_arrow_import_nested](/docs/en/operations/settings/settings.md/#input_format_arrow_import_nested) - allow inserting array of structs into Nested table in Arrow input format. Default value - `false`.
|
- [input_format_arrow_import_nested](/docs/en/operations/settings/settings-formats.md/#input_format_arrow_import_nested) - allow inserting array of structs into Nested table in Arrow input format. Default value - `false`.
|
||||||
- [input_format_arrow_case_insensitive_column_matching](/docs/en/operations/settings/settings.md/#input_format_arrow_case_insensitive_column_matching) - ignore case when matching Arrow columns with ClickHouse columns. Default value - `false`.
|
- [input_format_arrow_case_insensitive_column_matching](/docs/en/operations/settings/settings-formats.md/#input_format_arrow_case_insensitive_column_matching) - ignore case when matching Arrow columns with ClickHouse columns. Default value - `false`.
|
||||||
- [input_format_arrow_allow_missing_columns](/docs/en/operations/settings/settings.md/#input_format_arrow_allow_missing_columns) - allow missing columns while reading Arrow data. Default value - `false`.
|
- [input_format_arrow_allow_missing_columns](/docs/en/operations/settings/settings-formats.md/#input_format_arrow_allow_missing_columns) - allow missing columns while reading Arrow data. Default value - `false`.
|
||||||
- [input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference](/docs/en/operations/settings/settings.md/#input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference) - allow skipping columns with unsupported types while schema inference for Arrow format. Default value - `false`.
|
- [input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference](/docs/en/operations/settings/settings-formats.md/#input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference) - allow skipping columns with unsupported types while schema inference for Arrow format. Default value - `false`.
|
||||||
|
|
||||||
|
|
||||||
To exchange data with Hadoop, you can use [HDFS table engine](/docs/en/engines/table-engines/integrations/hdfs.md).
|
To exchange data with Hadoop, you can use [HDFS table engine](/docs/en/engines/table-engines/integrations/hdfs.md).
|
||||||
@ -2133,13 +2138,13 @@ When working with the `Regexp` format, you can use the following settings:
|
|||||||
|
|
||||||
**Usage**
|
**Usage**
|
||||||
|
|
||||||
The regular expression from [format_regexp](/docs/en/operations/settings/settings.md/#format_regexp) setting is applied to every line of imported data. The number of subpatterns in the regular expression must be equal to the number of columns in imported dataset.
|
The regular expression from [format_regexp](/docs/en/operations/settings/settings-formats.md/#format_regexp) setting is applied to every line of imported data. The number of subpatterns in the regular expression must be equal to the number of columns in imported dataset.
|
||||||
|
|
||||||
Lines of the imported data must be separated by newline character `'\n'` or DOS-style newline `"\r\n"`.
|
Lines of the imported data must be separated by newline character `'\n'` or DOS-style newline `"\r\n"`.
|
||||||
|
|
||||||
The content of every matched subpattern is parsed with the method of corresponding data type, according to [format_regexp_escaping_rule](/docs/en/operations/settings/settings.md/#format_regexp_escaping_rule) setting.
|
The content of every matched subpattern is parsed with the method of corresponding data type, according to [format_regexp_escaping_rule](/docs/en/operations/settings/settings-formats.md/#format_regexp_escaping_rule) setting.
|
||||||
|
|
||||||
If the regular expression does not match the line and [format_regexp_skip_unmatched](/docs/en/operations/settings/settings.md/#format_regexp_escaping_rule) is set to 1, the line is silently skipped. Otherwise, an exception is thrown.
|
If the regular expression does not match the line and [format_regexp_skip_unmatched](/docs/en/operations/settings/settings-formats.md/#format_regexp_escaping_rule) is set to 1, the line is silently skipped. Otherwise, an exception is thrown.
|
||||||
|
|
||||||
**Example**
|
**Example**
|
||||||
|
|
||||||
@ -2197,8 +2202,8 @@ in the server configuration.
|
|||||||
|
|
||||||
## Skipping Errors {#skippingerrors}
|
## Skipping Errors {#skippingerrors}
|
||||||
|
|
||||||
Some formats such as `CSV`, `TabSeparated`, `TSKV`, `JSONEachRow`, `Template`, `CustomSeparated` and `Protobuf` can skip a broken row if a parsing error occurred and continue parsing from the beginning of the next row. See [input_format_allow_errors_num](/docs/en/operations/settings/settings.md/#input_format_allow_errors_num) and
|
Some formats such as `CSV`, `TabSeparated`, `TSKV`, `JSONEachRow`, `Template`, `CustomSeparated` and `Protobuf` can skip a broken row if a parsing error occurred and continue parsing from the beginning of the next row. See [input_format_allow_errors_num](/docs/en/operations/settings/settings-formats.md/#input_format_allow_errors_num) and
|
||||||
[input_format_allow_errors_ratio](/docs/en/operations/settings/settings.md/#input_format_allow_errors_ratio) settings.
|
[input_format_allow_errors_ratio](/docs/en/operations/settings/settings-formats.md/#input_format_allow_errors_ratio) settings.
|
||||||
Limitations:
|
Limitations:
|
||||||
- In case of parsing error `JSONEachRow` skips all data until the new line (or EOF), so rows must be delimited by `\n` to count errors correctly.
|
- In case of parsing error `JSONEachRow` skips all data until the new line (or EOF), so rows must be delimited by `\n` to count errors correctly.
|
||||||
- `Template` and `CustomSeparated` use delimiter after the last column and delimiter between rows to find the beginning of next row, so skipping errors works only if at least one of them is not empty.
|
- `Template` and `CustomSeparated` use delimiter after the last column and delimiter between rows to find the beginning of next row, so skipping errors works only if at least one of them is not empty.
|
||||||
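A hedged sketch of using both thresholds when loading a file; the table and file names are hypothetical:

```sql
-- tolerate up to 10 broken rows, but no more than 1% of the input
SET input_format_allow_errors_num = 10, input_format_allow_errors_ratio = 0.01;

INSERT INTO target_table FROM INFILE 'events.csv' FORMAT CSV;
```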
@ -2277,17 +2282,17 @@ $ clickhouse-client --query="SELECT * FROM msgpack FORMAT MsgPack" > tmp_msgpack
|
|||||||
|
|
||||||
### MsgPack format settings {#msgpack-format-settings}
|
### MsgPack format settings {#msgpack-format-settings}
|
||||||
|
|
||||||
- [input_format_msgpack_number_of_columns](/docs/en/operations/settings/settings.md/#input_format_msgpack_number_of_columns) - the number of columns in inserted MsgPack data. Used for automatic schema inference from data. Default value - `0`.
|
- [input_format_msgpack_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_msgpack_number_of_columns) - the number of columns in inserted MsgPack data. Used for automatic schema inference from data. Default value - `0`.
|
||||||
- [output_format_msgpack_uuid_representation](/docs/en/operations/settings/settings.md/#output_format_msgpack_uuid_representation) - the way how to output UUID in MsgPack format. Default value - `EXT`.
|
- [output_format_msgpack_uuid_representation](/docs/en/operations/settings/settings-formats.md/#output_format_msgpack_uuid_representation) - the way how to output UUID in MsgPack format. Default value - `EXT`.
|
||||||
|
|
||||||
## MySQLDump {#mysqldump}
|
## MySQLDump {#mysqldump}
|
||||||
|
|
||||||
ClickHouse supports reading MySQL [dumps](https://dev.mysql.com/doc/refman/8.0/en/mysqldump.html).
|
ClickHouse supports reading MySQL [dumps](https://dev.mysql.com/doc/refman/8.0/en/mysqldump.html).
|
||||||
It reads all data from INSERT queries belonging to one table in the dump. If there is more than one table, by default it reads data from the first one.
|
It reads all data from INSERT queries belonging to one table in the dump. If there is more than one table, by default it reads data from the first one.
|
||||||
You can specify the name of the table to read data from using the [input_format_mysql_dump_table_name](/docs/en/operations/settings/settings.md/#input_format_mysql_dump_table_name) setting.
|
You can specify the name of the table to read data from using the [input_format_mysql_dump_table_name](/docs/en/operations/settings/settings-formats.md/#input_format_mysql_dump_table_name) setting.
|
||||||
If setting [input_format_mysql_dump_map_columns](/docs/en/operations/settings/settings.md/#input_format_mysql_dump_map_columns) is set to 1 and
|
If setting [input_format_mysql_dump_map_columns](/docs/en/operations/settings/settings-formats.md/#input_format_mysql_dump_map_columns) is set to 1 and
|
||||||
the dump contains a CREATE query for the specified table or column names in the INSERT query, the columns from the input data will be mapped to the columns from the table by their names;
|
the dump contains a CREATE query for the specified table or column names in the INSERT query, the columns from the input data will be mapped to the columns from the table by their names;
|
||||||
columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings.md/#input_format_skip_unknown_fields) is set to 1.
|
columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](/docs/en/operations/settings/settings-formats.md/#input_format_skip_unknown_fields) is set to 1.
|
||||||
This format supports schema inference: if the dump contains a CREATE query for the specified table, the structure is extracted from it; otherwise, the schema is inferred from the data of the INSERT queries.
|
This format supports schema inference: if the dump contains a CREATE query for the specified table, the structure is extracted from it; otherwise, the schema is inferred from the data of the INSERT queries.
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
@ -2343,3 +2348,26 @@ FROM file(dump.sql, MySQLDump)
|
|||||||
│ 3 │
|
│ 3 │
|
||||||
└───┘
|
└───┘
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Markdown {#markdown}
|
||||||
|
|
||||||
|
You can export results using [Markdown](https://en.wikipedia.org/wiki/Markdown) format to generate output ready to be pasted into your `.md` files:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SELECT
|
||||||
|
number,
|
||||||
|
number * 2
|
||||||
|
FROM numbers(5)
|
||||||
|
FORMAT Markdown
|
||||||
|
```
|
||||||
|
```results
|
||||||
|
| number | multiply(number, 2) |
|
||||||
|
|-:|-:|
|
||||||
|
| 0 | 0 |
|
||||||
|
| 1 | 2 |
|
||||||
|
| 2 | 4 |
|
||||||
|
| 3 | 6 |
|
||||||
|
| 4 | 8 |
|
||||||
|
```
|
||||||
|
|
||||||
|
A Markdown table will be generated automatically and can be used on Markdown-enabled platforms, like GitHub. This format is used only for output.
|
||||||
|
@ -558,6 +558,8 @@ and if the value is not a number, ClickHouse treats it as a string.
|
|||||||
If you don't want ClickHouse to try to determine complex types using some parsers and heuristics, you can disable setting `input_format_csv_use_best_effort_in_schema_inference`
|
If you don't want ClickHouse to try to determine complex types using some parsers and heuristics, you can disable setting `input_format_csv_use_best_effort_in_schema_inference`
|
||||||
and ClickHouse will treat all columns as Strings.
|
and ClickHouse will treat all columns as Strings.
|
||||||
|
|
||||||
|
If setting `input_format_csv_detect_header` is enabled, ClickHouse will try to detect the header with column names (and maybe types) while inferring schema. This setting is enabled by default.
|
||||||
|
|
||||||
**Examples:**
|
**Examples:**
|
||||||
|
|
||||||
Integers, Floats, Bools, Strings:
|
Integers, Floats, Bools, Strings:
|
||||||
@ -669,6 +671,61 @@ DESC format(CSV, '"[1,2,3]",42.42,Hello World!')
|
|||||||
└──────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
└──────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Examples of header auto-detection (when `input_format_csv_detect_header` is enabled):
|
||||||
|
|
||||||
|
Only names:
|
||||||
|
```sql
|
||||||
|
SELECT * FROM format(CSV,
|
||||||
|
$$"number","string","array"
|
||||||
|
42,"Hello","[1, 2, 3]"
|
||||||
|
43,"World","[4, 5, 6]"
|
||||||
|
$$)
|
||||||
|
```
|
||||||
|
|
||||||
|
```response
|
||||||
|
┌─number─┬─string─┬─array───┐
|
||||||
|
│ 42 │ Hello │ [1,2,3] │
|
||||||
|
│ 43 │ World │ [4,5,6] │
|
||||||
|
└────────┴────────┴─────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
Names and types:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
DESC format(CSV,
|
||||||
|
$$"number","string","array"
|
||||||
|
"UInt32","String","Array(UInt16)"
|
||||||
|
42,"Hello","[1, 2, 3]"
|
||||||
|
43,"World","[4, 5, 6]"
|
||||||
|
$$)
|
||||||
|
```
|
||||||
|
|
||||||
|
```response
|
||||||
|
┌─name───┬─type──────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
|
||||||
|
│ number │ UInt32 │ │ │ │ │ │
|
||||||
|
│ string │ String │ │ │ │ │ │
|
||||||
|
│ array │ Array(UInt16) │ │ │ │ │ │
|
||||||
|
└────────┴───────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that the header can be detected only if there is at least one column with a non-String type. If all columns have String type, the header is not detected:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SELECT * FROM format(CSV,
|
||||||
|
$$"first_column","second_column"
|
||||||
|
"Hello","World"
|
||||||
|
"World","Hello"
|
||||||
|
$$)
|
||||||
|
```
|
||||||
|
|
||||||
|
```response
|
||||||
|
┌─c1───────────┬─c2────────────┐
|
||||||
|
│ first_column │ second_column │
|
||||||
|
│ Hello │ World │
|
||||||
|
│ World │ Hello │
|
||||||
|
└──────────────┴───────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
## TSV/TSKV {#tsv-tskv}
|
## TSV/TSKV {#tsv-tskv}
|
||||||
|
|
||||||
In TSV/TSKV formats ClickHouse extracts column value from the row according to tabular delimiters and then parses extracted value using
|
In TSV/TSKV formats ClickHouse extracts column value from the row according to tabular delimiters and then parses extracted value using
|
||||||
@ -677,6 +734,7 @@ the recursive parser to determine the most appropriate type. If the type cannot
|
|||||||
If you don't want ClickHouse to try to determine complex types using some parsers and heuristics, you can disable setting `input_format_tsv_use_best_effort_in_schema_inference`
|
If you don't want ClickHouse to try to determine complex types using some parsers and heuristics, you can disable setting `input_format_tsv_use_best_effort_in_schema_inference`
|
||||||
and ClickHouse will treat all columns as Strings.
|
and ClickHouse will treat all columns as Strings.
|
||||||
|
|
||||||
|
If setting `input_format_tsv_detect_header` is enabled, ClickHouse will try to detect the header with column names (and maybe types) while inferring schema. This setting is enabled by default.
|
||||||
|
|
||||||
**Examples:**
|
**Examples:**
|
||||||
|
|
||||||
@ -799,6 +857,61 @@ DESC format(TSV, '[1,2,3] 42.42 Hello World!')
|
|||||||
└──────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
└──────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Examples of header auto-detection (when `input_format_tsv_detect_header` is enabled):
|
||||||
|
|
||||||
|
Only names:
|
||||||
|
```sql
|
||||||
|
SELECT * FROM format(TSV,
|
||||||
|
$$number string array
|
||||||
|
42 Hello [1, 2, 3]
|
||||||
|
43 World [4, 5, 6]
|
||||||
|
$$);
|
||||||
|
```
|
||||||
|
|
||||||
|
```response
|
||||||
|
┌─number─┬─string─┬─array───┐
|
||||||
|
│ 42 │ Hello │ [1,2,3] │
|
||||||
|
│ 43 │ World │ [4,5,6] │
|
||||||
|
└────────┴────────┴─────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
Names and types:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
DESC format(TSV,
|
||||||
|
$$number string array
|
||||||
|
UInt32 String Array(UInt16)
|
||||||
|
42 Hello [1, 2, 3]
|
||||||
|
43 World [4, 5, 6]
|
||||||
|
$$)
|
||||||
|
```
|
||||||
|
|
||||||
|
```response
|
||||||
|
┌─name───┬─type──────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
|
||||||
|
│ number │ UInt32 │ │ │ │ │ │
|
||||||
|
│ string │ String │ │ │ │ │ │
|
||||||
|
│ array │ Array(UInt16) │ │ │ │ │ │
|
||||||
|
└────────┴───────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that the header can be detected only if there is at least one column with a non-String type. If all columns have String type, the header is not detected:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SELECT * FROM format(TSV,
|
||||||
|
$$first_column second_column
|
||||||
|
Hello World
|
||||||
|
World Hello
|
||||||
|
$$)
|
||||||
|
```
|
||||||
|
|
||||||
|
```response
|
||||||
|
┌─c1───────────┬─c2────────────┐
|
||||||
|
│ first_column │ second_column │
|
||||||
|
│ Hello │ World │
|
||||||
|
│ World │ Hello │
|
||||||
|
└──────────────┴───────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
## Values {#values}
|
## Values {#values}
|
||||||
|
|
||||||
In Values format ClickHouse extracts column value from the row and then parses it using
|
In Values format ClickHouse extracts column value from the row and then parses it using
|
||||||
@ -911,6 +1024,8 @@ DESC format(TSV, '[1,2,3] 42.42 Hello World!')
|
|||||||
In CustomSeparated format ClickHouse first extracts all column values from the row according to specified delimiters and then tries to infer
|
In CustomSeparated format ClickHouse first extracts all column values from the row according to specified delimiters and then tries to infer
|
||||||
the data type for each value according to escaping rule.
|
the data type for each value according to escaping rule.
|
||||||
|
|
||||||
|
If setting `input_format_custom_detect_header` is enabled, ClickHouse will try to detect the header with column names (and maybe types) while inferring schema. This setting is enabled by default.
|
||||||
|
|
||||||
**Example**
|
**Example**
|
||||||
|
|
||||||
```sql
|
```sql
|
||||||
@ -937,6 +1052,34 @@ $$)
|
|||||||
└──────┴────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
└──────┴────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Example of header auto-detection (when `input_format_custom_detect_header` is enabled):
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SET format_custom_row_before_delimiter = '<row_before_delimiter>',
|
||||||
|
format_custom_row_after_delimiter = '<row_after_delimiter>\n',
|
||||||
|
format_custom_row_between_delimiter = '<row_between_delimiter>\n',
|
||||||
|
format_custom_result_before_delimiter = '<result_before_delimiter>\n',
|
||||||
|
format_custom_result_after_delimiter = '<result_after_delimiter>\n',
|
||||||
|
format_custom_field_delimiter = '<field_delimiter>',
|
||||||
|
format_custom_escaping_rule = 'Quoted'
|
||||||
|
|
||||||
|
DESC format(CustomSeparated, $$<result_before_delimiter>
|
||||||
|
<row_before_delimiter>'number'<field_delimiter>'string'<field_delimiter>'array'<row_after_delimiter>
|
||||||
|
<row_between_delimiter>
|
||||||
|
<row_before_delimiter>42.42<field_delimiter>'Some string 1'<field_delimiter>[1, NULL, 3]<row_after_delimiter>
|
||||||
|
<row_between_delimiter>
|
||||||
|
<row_before_delimiter>NULL<field_delimiter>'Some string 3'<field_delimiter>[1, 2, NULL]<row_after_delimiter>
|
||||||
|
<result_after_delimiter>
|
||||||
|
$$)
|
||||||
|
```
|
||||||
|
|
||||||
|
```response
|
||||||
|
┌─number─┬─string────────┬─array──────┐
|
||||||
|
│ 42.42 │ Some string 1 │ [1,NULL,3] │
|
||||||
|
│ ᴺᵁᴸᴸ │ Some string 3 │ [1,2,NULL] │
|
||||||
|
└────────┴───────────────┴────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
## Template {#template}
|
## Template {#template}
|
||||||
|
|
||||||
In Template format ClickHouse first extracts all column values from the row according to the specified template and then tries to infer the
|
In Template format ClickHouse first extracts all column values from the row according to the specified template and then tries to infer the
|
||||||
@ -1193,7 +1336,7 @@ DESC format(JSONEachRow, $$
|
|||||||
└──────────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
└──────────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||||
```
|
```
|
||||||
|
|
||||||
Note: Parsing datetimes during schema inference respect setting [date_time_input_format](/docs/en/operations/settings/settings.md#date_time_input_format)
|
Note: Parsing datetimes during schema inference respect setting [date_time_input_format](/docs/en/operations/settings/settings-formats.md#date_time_input_format)
|
||||||
|
|
||||||
### input_format_try_infer_dates
|
### input_format_try_infer_dates
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
---
|
---
|
||||||
sidebar_label: Settings Overview
|
sidebar_label: Settings Overview
|
||||||
sidebar_position: 51
|
sidebar_position: 1
|
||||||
slug: /en/operations/settings/
|
slug: /en/operations/settings/
|
||||||
pagination_next: en/operations/settings/settings
|
pagination_next: en/operations/settings/settings
|
||||||
---
|
---
|
||||||
|
@ -106,14 +106,20 @@ Possible values:
|
|||||||
Default value: 1.
|
Default value: 1.
|
||||||
|
|
||||||
The delay (in milliseconds) for `INSERT` is calculated by the formula:
|
The delay (in milliseconds) for `INSERT` is calculated by the formula:
|
||||||
|
|
||||||
```code
|
```code
|
||||||
max_k = parts_to_throw_insert - parts_to_delay_insert
|
max_k = parts_to_throw_insert - parts_to_delay_insert
|
||||||
k = 1 + parts_count_in_partition - parts_to_delay_insert
|
k = 1 + parts_count_in_partition - parts_to_delay_insert
|
||||||
delay_milliseconds = pow(max_delay_to_insert * 1000, k / max_k)
|
delay_milliseconds = pow(max_delay_to_insert * 1000, k / max_k)
|
||||||
```
|
```
|
||||||
|
For example, if a partition has 299 active parts and parts_to_throw_insert = 300, parts_to_delay_insert = 150, max_delay_to_insert = 1, `INSERT` is delayed for `pow( 1 * 1000, (1 + 299 - 150) / (300 - 150) ) = 1000` milliseconds.
|
||||||
|
|
||||||
For example if a partition has 299 active parts and parts_to_throw_insert = 300, parts_to_delay_insert = 150, max_delay_to_insert = 1, `INSERT` is delayed for `pow( 1 * 1000, (1 + 299 - 150) / (300 - 150) ) = 1000` milliseconds.
|
Starting from version 23.1, the formula has been changed to:
|
||||||
|
```code
|
||||||
|
allowed_parts_over_threshold = parts_to_throw_insert - parts_to_delay_insert
|
||||||
|
parts_over_threshold = parts_count_in_partition - parts_to_delay_insert + 1
|
||||||
|
delay_milliseconds = max(min_delay_to_insert_ms, (max_delay_to_insert * 1000) * parts_over_threshold / allowed_parts_over_threshold)
|
||||||
|
```
|
||||||
|
For example, if a partition has 224 active parts and parts_to_throw_insert = 300, parts_to_delay_insert = 150, max_delay_to_insert = 1, min_delay_to_insert_ms = 10, `INSERT` is delayed for `max( 10, 1 * 1000 * (224 - 150 + 1) / (300 - 150) ) = 500` milliseconds.
|
||||||
|
|
||||||
## max_parts_in_total {#max-parts-in-total}
|
## max_parts_in_total {#max-parts-in-total}
|
||||||
|
|
||||||
|
docs/en/operations/settings/settings-formats.md — 1486 lines (file diff suppressed because it is too large).
@ -9,6 +9,7 @@ Columns:
|
|||||||
|
|
||||||
- `metric` ([String](../../sql-reference/data-types/string.md)) — Metric name.
|
- `metric` ([String](../../sql-reference/data-types/string.md)) — Metric name.
|
||||||
- `value` ([Float64](../../sql-reference/data-types/float.md)) — Metric value.
|
- `value` ([Float64](../../sql-reference/data-types/float.md)) — Metric value.
|
||||||
|
- `description` ([String](../../sql-reference/data-types/string.md)) — Metric description.
|
||||||
|
|
||||||
**Example**
|
**Example**
|
||||||
|
|
||||||
@ -17,18 +18,18 @@ SELECT * FROM system.asynchronous_metrics LIMIT 10
|
|||||||
```
|
```
|
||||||
|
|
||||||
``` text
|
``` text
|
||||||
┌─metric──────────────────────────────────┬──────value─┐
|
┌─metric──────────────────────────────────┬──────value─┬─description────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
|
||||||
│ jemalloc.background_thread.run_interval │ 0 │
|
│ AsynchronousMetricsCalculationTimeSpent │ 0.00179053 │ Time in seconds spent for calculation of asynchronous metrics (this is the overhead of asynchronous metrics). │
|
||||||
│ jemalloc.background_thread.num_runs │ 0 │
|
│ NumberOfDetachedByUserParts │ 0 │ The total number of parts detached from MergeTree tables by users with the `ALTER TABLE DETACH` query (as opposed to unexpected, broken or ignored parts). The server does not care about detached parts and they can be removed. │
|
||||||
│ jemalloc.background_thread.num_threads │ 0 │
|
│ NumberOfDetachedParts │ 0 │ The total number of parts detached from MergeTree tables. A part can be detached by a user with the `ALTER TABLE DETACH` query or by the server itself it the part is broken, unexpected or unneeded. The server does not care about detached parts and they can be removed. │
|
||||||
│ jemalloc.retained │ 422551552 │
|
│ TotalRowsOfMergeTreeTables │ 2781309 │ Total amount of rows (records) stored in all tables of MergeTree family. │
|
||||||
│ jemalloc.mapped │ 1682989056 │
|
│ TotalBytesOfMergeTreeTables │ 7741926 │ Total amount of bytes (compressed, including data and indices) stored in all tables of MergeTree family. │
|
||||||
│ jemalloc.resident │ 1656446976 │
|
│ NumberOfTables │ 93 │ Total number of tables summed across the databases on the server, excluding the databases that cannot contain MergeTree tables. The excluded database engines are those who generate the set of tables on the fly, like `Lazy`, `MySQL`, `PostgreSQL`, `SQlite`. │
|
||||||
│ jemalloc.metadata_thp │ 0 │
|
│ NumberOfDatabases │ 6 │ Total number of databases on the server. │
|
||||||
│ jemalloc.metadata │ 10226856 │
|
│ MaxPartCountForPartition │ 6 │ Maximum number of parts per partition across all partitions of all tables of MergeTree family. Values larger than 300 indicates misconfiguration, overload, or massive data loading. │
|
||||||
│ UncompressedCacheCells │ 0 │
|
│ ReplicasSumMergesInQueue │ 0 │ Sum of merge operations in the queue (still to be applied) across Replicated tables. │
|
||||||
│ MarkCacheFiles │ 0 │
|
│ ReplicasSumInsertsInQueue │ 0 │ Sum of INSERT operations in the queue (still to be replicated) across Replicated tables. │
|
||||||
└─────────────────────────────────────────┴────────────┘
|
└─────────────────────────────────────────┴────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
|
||||||
```
|
```
|
||||||
|
|
||||||
**See Also**
|
**See Also**
|
||||||
|
@ -290,15 +290,11 @@ This storage method works the same way as hashed and allows using date/time (arb
|
|||||||
Example: The table contains discounts for each advertiser in the format:
|
Example: The table contains discounts for each advertiser in the format:
|
||||||
|
|
||||||
``` text
|
``` text
|
||||||
+---------|-------------|-------------|------+
|
┌─advertiser_id─┬─discount_start_date─┬─discount_end_date─┬─amount─┐
|
||||||
| advertiser id | discount start date | discount end date | amount |
|
│ 123 │ 2015-01-16 │ 2015-01-31 │ 0.25 │
|
||||||
+===============+=====================+===================+========+
|
│ 123 │ 2015-01-01 │ 2015-01-15 │ 0.15 │
|
||||||
| 123 | 2015-01-01 | 2015-01-15 | 0.15 |
|
│ 456 │ 2015-01-01 │ 2015-01-15 │ 0.05 │
|
||||||
+---------|-------------|-------------|------+
|
└───────────────┴─────────────────────┴───────────────────┴────────┘
|
||||||
| 123 | 2015-01-16 | 2015-01-31 | 0.25 |
|
|
||||||
+---------|-------------|-------------|------+
|
|
||||||
| 456 | 2015-01-01 | 2015-01-15 | 0.05 |
|
|
||||||
+---------|-------------|-------------|------+
|
|
||||||
```
|
```
|
||||||
|
|
||||||
To use a sample for date ranges, define the `range_min` and `range_max` elements in the [structure](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md). These elements must contain elements `name` and `type` (if `type` is not specified, the default type will be used - Date). `type` can be any numeric type (Date / DateTime / UInt64 / Int32 / others).
|
To use a sample for date ranges, define the `range_min` and `range_max` elements in the [structure](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md). These elements must contain elements `name` and `type` (if `type` is not specified, the default type will be used - Date). `type` can be any numeric type (Date / DateTime / UInt64 / Int32 / others).
|
||||||
@ -307,19 +303,25 @@ To use a sample for date ranges, define the `range_min` and `range_max` elements
|
|||||||
Values of `range_min` and `range_max` should fit in `Int64` type.
|
Values of `range_min` and `range_max` should fit in `Int64` type.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
``` xml
|
``` xml
|
||||||
|
<layout>
|
||||||
|
<range_hashed>
|
||||||
|
<!-- Strategy for overlapping ranges (min/max). Default: min (return a matching range with the min(range_min -> range_max) value) -->
|
||||||
|
<range_lookup_strategy>min</range_lookup_strategy>
|
||||||
|
</range_hashed>
|
||||||
|
</layout>
|
||||||
<structure>
|
<structure>
|
||||||
<id>
|
<id>
|
||||||
<name>Id</name>
|
<name>advertiser_id</name>
|
||||||
</id>
|
</id>
|
||||||
<range_min>
|
<range_min>
|
||||||
<name>first</name>
|
<name>discount_start_date</name>
|
||||||
<type>Date</type>
|
<type>Date</type>
|
||||||
</range_min>
|
</range_min>
|
||||||
<range_max>
|
<range_max>
|
||||||
<name>last</name>
|
<name>discount_end_date</name>
|
||||||
<type>Date</type>
|
<type>Date</type>
|
||||||
</range_max>
|
</range_max>
|
||||||
...
|
...
|
||||||
@ -328,17 +330,17 @@ Example:
|
|||||||
or
|
or
|
||||||
|
|
||||||
``` sql
|
``` sql
|
||||||
CREATE DICTIONARY somedict (
|
CREATE DICTIONARY discounts_dict (
|
||||||
id UInt64,
|
advertiser_id UInt64,
|
||||||
first Date,
|
discount_start_date Date,
|
||||||
last Date,
|
discount_end_date Date,
|
||||||
advertiser_id UInt64
|
amount Float64
|
||||||
)
|
)
|
||||||
PRIMARY KEY id
|
PRIMARY KEY id
|
||||||
SOURCE(CLICKHOUSE(TABLE 'date_table'))
|
SOURCE(CLICKHOUSE(TABLE 'discounts'))
|
||||||
LIFETIME(MIN 1 MAX 1000)
|
LIFETIME(MIN 1 MAX 1000)
|
||||||
LAYOUT(RANGE_HASHED())
|
LAYOUT(RANGE_HASHED(range_lookup_strategy 'max'))
|
||||||
RANGE(MIN first MAX last)
|
RANGE(MIN discount_start_date MAX discount_end_date)
|
||||||
```
|
```
|
||||||
|
|
||||||
To work with these dictionaries, you need to pass an additional argument to the `dictGet` function, for which a range is selected:
|
To work with these dictionaries, you need to pass an additional argument to the `dictGet` function, for which a range is selected:
|
||||||
@ -349,16 +351,17 @@ dictGet('dict_name', 'attr_name', id, date)
|
|||||||
Query example:
|
Query example:
|
||||||
|
|
||||||
``` sql
|
``` sql
|
||||||
SELECT dictGet('somedict', 'advertiser_id', 1, '2022-10-20 23:20:10.000'::DateTime64::UInt64);
|
SELECT dictGet('discounts_dict', 'amount', 1, '2022-10-20'::Date);
|
||||||
```
|
```
|
||||||
|
|
||||||
This function returns the value for the specified `id`s and the date range that includes the passed date.
|
This function returns the value for the specified `id`s and the date range that includes the passed date.
|
||||||
|
|
||||||
Details of the algorithm:
|
Details of the algorithm:
|
||||||
|
|
||||||
- If the `id` is not found or a range is not found for the `id`, it returns the default value for the dictionary.
|
- If the `id` is not found or a range is not found for the `id`, it returns the default value of the attribute's type.
|
||||||
- If there are overlapping ranges, it returns value for any (random) range.
|
- If there are overlapping ranges and `range_lookup_strategy=min`, it returns a matching range with the minimal `range_min`; if several ranges are found, it returns the range with the minimal `range_max`; if again several ranges are found (several ranges have the same `range_min` and `range_max`), it returns a random one of them.
|
||||||
- If the range delimiter is `NULL` or an invalid date (such as 1900-01-01), the range is open. The range can be open on both sides.
|
- If there are overlapping ranges and `range_lookup_strategy=max`, it returns a matching range with the maximal `range_min`; if several ranges are found, it returns the range with the maximal `range_max`; if again several ranges are found (several ranges have the same `range_min` and `range_max`), it returns a random one of them.
|
||||||
|
- If the `range_max` is `NULL`, the range is open. `NULL` is treated as the maximal possible value. For the `range_min`, `1970-01-01` or `0` (-MAX_INT) can be used as the open value.
|
||||||
|
|
||||||
Configuration example:
|
Configuration example:
|
||||||
|
|
||||||
@ -407,6 +410,108 @@ PRIMARY KEY Abcdef
|
|||||||
RANGE(MIN StartTimeStamp MAX EndTimeStamp)
|
RANGE(MIN StartTimeStamp MAX EndTimeStamp)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Configuration example with overlapping ranges and open ranges:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE discounts
|
||||||
|
(
|
||||||
|
advertiser_id UInt64,
|
||||||
|
discount_start_date Date,
|
||||||
|
discount_end_date Nullable(Date),
|
||||||
|
amount Float64
|
||||||
|
)
|
||||||
|
ENGINE = Memory;
|
||||||
|
|
||||||
|
INSERT INTO discounts VALUES (1, '2015-01-01', Null, 0.1);
|
||||||
|
INSERT INTO discounts VALUES (1, '2015-01-15', Null, 0.2);
|
||||||
|
INSERT INTO discounts VALUES (2, '2015-01-01', '2015-01-15', 0.3);
|
||||||
|
INSERT INTO discounts VALUES (2, '2015-01-04', '2015-01-10', 0.4);
|
||||||
|
INSERT INTO discounts VALUES (3, '1970-01-01', '2015-01-15', 0.5);
|
||||||
|
INSERT INTO discounts VALUES (3, '1970-01-01', '2015-01-10', 0.6);
|
||||||
|
|
||||||
|
SELECT * FROM discounts ORDER BY advertiser_id, discount_start_date;
|
||||||
|
┌─advertiser_id─┬─discount_start_date─┬─discount_end_date─┬─amount─┐
|
||||||
|
│ 1 │ 2015-01-01 │ ᴺᵁᴸᴸ │ 0.1 │
|
||||||
|
│ 1 │ 2015-01-15 │ ᴺᵁᴸᴸ │ 0.2 │
|
||||||
|
│ 2 │ 2015-01-01 │ 2015-01-15 │ 0.3 │
|
||||||
|
│ 2 │ 2015-01-04 │ 2015-01-10 │ 0.4 │
|
||||||
|
│ 3 │ 1970-01-01 │ 2015-01-15 │ 0.5 │
|
||||||
|
│ 3 │ 1970-01-01 │ 2015-01-10 │ 0.6 │
|
||||||
|
└───────────────┴─────────────────────┴───────────────────┴────────┘
|
||||||
|
|
||||||
|
-- RANGE_LOOKUP_STRATEGY 'max'
|
||||||
|
|
||||||
|
CREATE DICTIONARY discounts_dict
|
||||||
|
(
|
||||||
|
advertiser_id UInt64,
|
||||||
|
discount_start_date Date,
|
||||||
|
discount_end_date Nullable(Date),
|
||||||
|
amount Float64
|
||||||
|
)
|
||||||
|
PRIMARY KEY advertiser_id
|
||||||
|
SOURCE(CLICKHOUSE(TABLE discounts))
|
||||||
|
LIFETIME(MIN 600 MAX 900)
|
||||||
|
LAYOUT(RANGE_HASHED(RANGE_LOOKUP_STRATEGY 'max'))
|
||||||
|
RANGE(MIN discount_start_date MAX discount_end_date);
|
||||||
|
|
||||||
|
select dictGet('discounts_dict', 'amount', 1, toDate('2015-01-14')) res;
|
||||||
|
┌─res─┐
|
||||||
|
│ 0.1 │ -- the only one range is matching: 2015-01-01 - Null
|
||||||
|
└─────┘
|
||||||
|
|
||||||
|
select dictGet('discounts_dict', 'amount', 1, toDate('2015-01-16')) res;
|
||||||
|
┌─res─┐
|
||||||
|
│ 0.2 │ -- two ranges are matching, range_min 2015-01-15 (0.2) is bigger than 2015-01-01 (0.1)
|
||||||
|
└─────┘
|
||||||
|
|
||||||
|
select dictGet('discounts_dict', 'amount', 2, toDate('2015-01-06')) res;
|
||||||
|
┌─res─┐
|
||||||
|
│ 0.4 │ -- two ranges are matching, range_min 2015-01-04 (0.4) is bigger than 2015-01-01 (0.3)
|
||||||
|
└─────┘
|
||||||
|
|
||||||
|
select dictGet('discounts_dict', 'amount', 3, toDate('2015-01-01')) res;
|
||||||
|
┌─res─┐
|
||||||
|
│ 0.5 │ -- two ranges are matching, range_min are equal, 2015-01-15 (0.5) is bigger than 2015-01-10 (0.6)
|
||||||
|
└─────┘
|
||||||
|
|
||||||
|
DROP DICTIONARY discounts_dict;
|
||||||
|
|
||||||
|
-- RANGE_LOOKUP_STRATEGY 'min'
|
||||||
|
|
||||||
|
CREATE DICTIONARY discounts_dict
|
||||||
|
(
|
||||||
|
advertiser_id UInt64,
|
||||||
|
discount_start_date Date,
|
||||||
|
discount_end_date Nullable(Date),
|
||||||
|
amount Float64
|
||||||
|
)
|
||||||
|
PRIMARY KEY advertiser_id
|
||||||
|
SOURCE(CLICKHOUSE(TABLE discounts))
|
||||||
|
LIFETIME(MIN 600 MAX 900)
|
||||||
|
LAYOUT(RANGE_HASHED(RANGE_LOOKUP_STRATEGY 'min'))
|
||||||
|
RANGE(MIN discount_start_date MAX discount_end_date);
|
||||||
|
|
||||||
|
select dictGet('discounts_dict', 'amount', 1, toDate('2015-01-14')) res;
|
||||||
|
┌─res─┐
|
||||||
|
│ 0.1 │ -- the only one range is matching: 2015-01-01 - Null
|
||||||
|
└─────┘
|
||||||
|
|
||||||
|
select dictGet('discounts_dict', 'amount', 1, toDate('2015-01-16')) res;
|
||||||
|
┌─res─┐
|
||||||
|
│ 0.1 │ -- two ranges are matching, range_min 2015-01-01 (0.1) is less than 2015-01-15 (0.2)
|
||||||
|
└─────┘
|
||||||
|
|
||||||
|
select dictGet('discounts_dict', 'amount', 2, toDate('2015-01-06')) res;
|
||||||
|
┌─res─┐
|
||||||
|
│ 0.3 │ -- two ranges are matching, range_min 2015-01-01 (0.3) is less than 2015-01-04 (0.4)
|
||||||
|
└─────┘
|
||||||
|
|
||||||
|
select dictGet('discounts_dict', 'amount', 3, toDate('2015-01-01')) res;
|
||||||
|
┌─res─┐
|
||||||
|
│ 0.6 │ -- two ranges are matching, range_min are equal, 2015-01-10 (0.6) is less than 2015-01-15 (0.5)
|
||||||
|
└─────┘
|
||||||
|
```
|
||||||
|
|
||||||
### complex_key_range_hashed
|
### complex_key_range_hashed
|
||||||
|
|
||||||
The dictionary is stored in memory in the form of a hash table with an ordered array of ranges and their corresponding values (see [range_hashed](#range-hashed)). This type of storage is for use with composite [keys](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md).
|
The dictionary is stored in memory in the form of a hash table with an ordered array of ranges and their corresponding values (see [range_hashed](#range-hashed)). This type of storage is for use with composite [keys](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md).
|
||||||
|
@ -209,10 +209,25 @@ Aliases: `DAYOFMONTH`, `DAY`.
## toDayOfWeek

Converts a date or date with time to a UInt8 number containing the number of the day of the week (Monday is 1, and Sunday is 7).
Converts a date or date with time to a UInt8 number containing the number of the day of the week.

The two-argument form of `toDayOfWeek()` enables you to specify whether the week starts on Monday or Sunday, and whether the return value should be in the range from 0 to 6 or from 1 to 7. If the mode argument is omitted, the default mode is 0. The time zone of the date can be specified as the third argument.

| Mode | First day of week | Range                                          |
|------|-------------------|------------------------------------------------|
| 0    | Monday            | 1-7, Monday = 1, Tuesday = 2, ..., Sunday = 7  |
| 1    | Monday            | 0-6, Monday = 0, Tuesday = 1, ..., Sunday = 6  |
| 2    | Sunday            | 0-6, Sunday = 0, Monday = 1, ..., Saturday = 6 |
| 3    | Sunday            | 1-7, Sunday = 1, Monday = 2, ..., Saturday = 7 |

Alias: `DAYOFWEEK`.

**Syntax**

``` sql
toDayOfWeek(t[, mode[, timezone]])
```
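A quick sketch of the mode behaviour described above, using an arbitrary date that falls on a Monday; the expected values follow directly from the mode table.

``` sql
-- 2023-01-02 is a Monday: modes 0 and 1 start the week on Monday, modes 2 and 3 on Sunday.
SELECT
    toDayOfWeek(toDate('2023-01-02'))    AS mode0,  -- 1
    toDayOfWeek(toDate('2023-01-02'), 1) AS mode1,  -- 0
    toDayOfWeek(toDate('2023-01-02'), 2) AS mode2,  -- 1
    toDayOfWeek(toDate('2023-01-02'), 3) AS mode3;  -- 2
```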
## toHour

Converts a date with time to a UInt8 number containing the number of the hour in 24-hour time (0-23).
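A trivial illustration (the timestamp is arbitrary):

``` sql
SELECT toHour(toDateTime('2023-01-02 17:45:00')) AS h;  -- 17
```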
@ -316,11 +331,17 @@ If `toLastDayOfMonth` is called with an argument of type `Date` greater then 214
Rounds down a date, or date with time, to the nearest Monday.
Returns the date.

## toStartOfWeek(t\[,mode\])
## toStartOfWeek

Rounds down a date, or date with time, to the nearest Sunday or Monday by mode.
Rounds a date or date with time down to the nearest Sunday or Monday.
Returns the date.
The mode argument works exactly like the mode argument to toWeek(). For the single-argument syntax, a mode value of 0 is used.
The mode argument works exactly like the mode argument in function `toWeek()`. If no mode is specified, mode is assumed as 0.

**Syntax**

``` sql
toStartOfWeek(t[, mode[, timezone]])
```
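A short sketch of how the mode affects the result, with arbitrary example dates; the mode numbering follows the `toWeek()` table further below.

``` sql
-- 2023-01-04 is a Wednesday. With the default mode 0 the week starts on Sunday,
-- with mode 1 it starts on Monday.
SELECT
    toStartOfWeek(toDate('2023-01-04'))    AS starts_sunday,  -- 2023-01-01
    toStartOfWeek(toDate('2023-01-04'), 1) AS starts_monday;  -- 2023-01-02
```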
## toStartOfDay
@ -455,10 +476,12 @@ Converts a date, or date with time, to a UInt16 number containing the ISO Year n
Converts a date, or date with time, to a UInt8 number containing the ISO Week number.

## toWeek(date\[,mode\])
## toWeek

This function returns the week number for date or datetime. The two-argument form of `toWeek()` enables you to specify whether the week starts on Sunday or Monday and whether the return value should be in the range from 0 to 53 or from 1 to 53. If the mode argument is omitted, the default mode is 0.

`toISOWeek()` is a compatibility function that is equivalent to `toWeek(date,3)`.

This function returns the week number for date or datetime. The two-argument form of toWeek() enables you to specify whether the week starts on Sunday or Monday and whether the return value should be in the range from 0 to 53 or from 1 to 53. If the mode argument is omitted, the default mode is 0.

`toISOWeek()`is a compatibility function that is equivalent to `toWeek(date,3)`.

The following table describes how the mode argument works.

| Mode | First day of week | Range | Week 1 is the first week … |
@ -482,13 +505,15 @@ For mode values with a meaning of “with 4 or more days this year,” weeks are
For mode values with a meaning of “contains January 1”, the week containing January 1 is week 1. It does not matter how many days of the new year that week contains, even if it contains only one day.

**Syntax**

``` sql
toWeek(date, [, mode][, Timezone])
toWeek(t[, mode[, time_zone]])
```

**Arguments**

- `date` – Date or DateTime.
- `t` – Date or DateTime.
- `mode` – Optional parameter, Range of values is \[0,9\], default is 0.
- `Timezone` – Optional parameter, it behaves like any other conversion function.
@ -504,13 +529,19 @@ SELECT toDate('2016-12-27') AS date, toWeek(date) AS week0, toWeek(date,1) AS we
└────────────┴───────┴───────┴───────┘
```

## toYearWeek(date\[,mode\])
## toYearWeek

Returns year and week for a date. The year in the result may be different from the year in the date argument for the first and the last week of the year.

The mode argument works exactly like the mode argument to toWeek(). For the single-argument syntax, a mode value of 0 is used.
The mode argument works exactly like the mode argument to `toWeek()`. For the single-argument syntax, a mode value of 0 is used.

`toISOYear()`is a compatibility function that is equivalent to `intDiv(toYearWeek(date,3),100)`.
`toISOYear()` is a compatibility function that is equivalent to `intDiv(toYearWeek(date,3),100)`.

**Syntax**

``` sql
toYearWeek(t[, mode[, timezone]])
```

**Example**
@ -529,6 +560,7 @@ SELECT toDate('2016-12-27') AS date, toYearWeek(date) AS yearWeek0, toYearWeek(d
Returns the `unit` component of the difference between `startdate` and `enddate`. The difference is calculated using a precision of 1 second.
E.g. the difference between `2021-12-29` and `2022-01-01` is 3 days for the `day` unit, 0 months for the `month` unit, and 0 years for the `year` unit.

For an alternative to `age`, see function `date_diff`.
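A minimal sketch of the example given in the text; the unit names are the ones documented for this function.

``` sql
-- age() returns complete elapsed units, so less than one full month or year has passed here.
SELECT
    age('day',   toDate('2021-12-29'), toDate('2022-01-01')) AS days,   -- 3
    age('month', toDate('2021-12-29'), toDate('2022-01-01')) AS months, -- 0
    age('year',  toDate('2021-12-29'), toDate('2022-01-01')) AS years;  -- 0
```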
**Syntax**
@ -600,8 +632,12 @@ Result:
## date\_diff

Returns the count of the specified `unit` boundaries crossed between the `startdate` and `enddate`.
Returns the count of the specified `unit` boundaries crossed between the `startdate` and the `enddate`.
The difference is calculated using relative units, e.g. the difference between `2021-12-29` and `2022-01-01` is 3 days for day unit (see [toRelativeDayNum](#torelativedaynum)), 1 month for month unit (see [toRelativeMonthNum](#torelativemonthnum)), 1 year for year unit (see [toRelativeYearNum](#torelativeyearnum)).
The difference is calculated using relative units, e.g. the difference between `2021-12-29` and `2022-01-01` is 3 days for unit `day` (see [toRelativeDayNum](#torelativedaynum)), 1 month for unit `month` (see [toRelativeMonthNum](#torelativemonthnum)) and 1 year for unit `year` (see [toRelativeYearNum](#torelativeyearnum)).

If unit `week` was specified, `date_diff` assumes that weeks start on Monday. Note that this behavior is different from that of function `toWeek()` in which weeks start by default on Sunday.

For an alternative to `date_diff`, see function `age`.
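To contrast the boundary-crossing behaviour with `age` above, a small sketch using the same dates (`dateDiff` and `date_diff` are interchangeable names for this function):

``` sql
-- dateDiff() counts unit boundaries crossed, so the Dec→Jan month boundary and the
-- 2021→2022 year boundary are each counted even though a full month/year has not elapsed.
SELECT
    dateDiff('day',   toDate('2021-12-29'), toDate('2022-01-01')) AS days,   -- 3
    dateDiff('month', toDate('2021-12-29'), toDate('2022-01-01')) AS months, -- 1
    dateDiff('year',  toDate('2021-12-29'), toDate('2022-01-01')) AS years;  -- 1
```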
**Syntax**
|
@ -276,14 +276,12 @@ EXPLAIN json = 1, description = 0, header = 1 SELECT 1, 2 + dummy;
|
|||||||
|
|
||||||
With `indexes` = 1, the `Indexes` key is added. It contains an array of used indexes. Each index is described as JSON with `Type` key (a string `MinMax`, `Partition`, `PrimaryKey` or `Skip`) and optional keys:
|
With `indexes` = 1, the `Indexes` key is added. It contains an array of used indexes. Each index is described as JSON with `Type` key (a string `MinMax`, `Partition`, `PrimaryKey` or `Skip`) and optional keys:
|
||||||
|
|
||||||
- `Name` — An index name (for now, is used only for `Skip` index).
|
- `Name` — The index name (currently only used for `Skip` indexes).
|
||||||
- `Keys` — An array of columns used by the index.
|
- `Keys` — The array of columns used by the index.
|
||||||
- `Condition` — A string with condition used.
|
- `Condition` — The used condition.
|
||||||
- `Description` — An index (for now, is used only for `Skip` index).
|
- `Description` — The index description (currently only used for `Skip` indexes).
|
||||||
- `Initial Parts` — A number of parts before the index is applied.
|
- `Parts` — The number of parts before/after the index is applied.
|
||||||
- `Selected Parts` — A number of parts after the index is applied.
|
- `Granules` — The number of granules before/after the index is applied.
|
||||||
- `Initial Granules` — A number of granules before the index is applied.
|
|
||||||
- `Selected Granulesis` — A number of granules after the index is applied.
|
|
||||||
|
|
||||||
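For orientation, a hedged sketch of the kind of query that produces the output below; the table name and predicate are hypothetical.

``` sql
-- Hypothetical table and filter, shown only to illustrate the call shape;
-- `indexes = 1` adds the "Indexes" array illustrated in the example.
EXPLAIN json = 1, indexes = 1, description = 0
SELECT count()
FROM test_table
WHERE x > 10 AND y >= 1
FORMAT TSVRaw;
```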
Example:
@ -294,46 +292,36 @@ Example:
"Type": "MinMax",
"Keys": ["y"],
"Condition": "(y in [1, +inf))",
"Initial Parts": 5,
"Parts": 5/4,
"Selected Parts": 4,
"Granules": 12/11
"Initial Granules": 12,
"Selected Granules": 11
},
{
"Type": "Partition",
"Keys": ["y", "bitAnd(z, 3)"],
"Condition": "and((bitAnd(z, 3) not in [1, 1]), and((y in [1, +inf)), (bitAnd(z, 3) not in [1, 1])))",
"Initial Parts": 4,
"Parts": 4/3,
"Selected Parts": 3,
"Granules": 11/10
"Initial Granules": 11,
"Selected Granules": 10
},
{
"Type": "PrimaryKey",
"Keys": ["x", "y"],
"Condition": "and((x in [11, +inf)), (y in [1, +inf)))",
"Initial Parts": 3,
"Parts": 3/2,
"Selected Parts": 2,
"Granules": 10/6
"Initial Granules": 10,
"Selected Granules": 6
},
{
"Type": "Skip",
"Name": "t_minmax",
"Description": "minmax GRANULARITY 2",
"Initial Parts": 2,
"Parts": 2/1,
"Selected Parts": 1,
"Granules": 6/2
"Initial Granules": 6,
"Selected Granules": 2
},
{
"Type": "Skip",
"Name": "t_set",
"Description": "set GRANULARITY 2",
"Initial Parts": 1,
"Parts": 1/1,
"Selected Parts": 1,
"Granules": 2/1
"Initial Granules": 2,
"Selected Granules": 1
}
]
```
@ -23,7 +23,7 @@ When `OPTIMIZE` is used with the [ReplicatedMergeTree](../../engines/table-engin
- If `OPTIMIZE` does not perform a merge for any reason, it does not notify the client. To enable notifications, use the [optimize_throw_if_noop](../../operations/settings/settings.md#setting-optimize_throw_if_noop) setting.
- If you specify a `PARTITION`, only the specified partition is optimized. [How to set partition expression](alter/partition.md#how-to-set-partition-expression).
- If you specify `FINAL`, optimization is performed even when all the data is already in one part. Also merge is forced even if concurrent merges are performed.
- If you specify `FINAL`, optimization is performed even when all the data is already in one part. You can control this behaviour with [optimize_skip_merged_partitions](../../operations/settings/settings.md#optimize-skip-merged-partitions) (see the sketch after this list). Also, the merge is forced even if concurrent merges are performed.
- If you specify `DEDUPLICATE`, then completely identical rows (unless by-clause is specified) will be deduplicated (all columns are compared); this makes sense only for the MergeTree engine.
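A short sketch of the `FINAL` form together with the setting mentioned in the list above; the table name is hypothetical.

``` sql
-- Hypothetical table. With the setting enabled, a partition that already consists of a single
-- merged part (level > 0, TTL not expired) is skipped instead of being rewritten.
OPTIMIZE TABLE my_table FINAL SETTINGS optimize_skip_merged_partitions = 1;
```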
You can specify how long (in seconds) to wait for inactive replicas to execute `OPTIMIZE` queries by the [replication_wait_for_inactive_replica_timeout](../../operations/settings/settings.md#replication-wait-for-inactive-replica-timeout) setting.
@ -14,7 +14,7 @@ The `INSERT` query uses both parsers:
INSERT INTO t VALUES (1, 'Hello, world'), (2, 'abc'), (3, 'def')
```

The `INSERT INTO t VALUES` fragment is parsed by the full parser, and the data `(1, 'Hello, world'), (2, 'abc'), (3, 'def')` is parsed by the fast stream parser. You can also turn on the full parser for the data by using the [input_format_values_interpret_expressions](../operations/settings/settings.md#settings-input_format_values_interpret_expressions) setting. When `input_format_values_interpret_expressions = 1`, ClickHouse first tries to parse values with the fast stream parser. If it fails, ClickHouse tries to use the full parser for the data, treating it like an SQL [expression](#syntax-expressions).
The `INSERT INTO t VALUES` fragment is parsed by the full parser, and the data `(1, 'Hello, world'), (2, 'abc'), (3, 'def')` is parsed by the fast stream parser. You can also turn on the full parser for the data by using the [input_format_values_interpret_expressions](../operations/settings/settings-formats.md#settings-input_format_values_interpret_expressions) setting. When `input_format_values_interpret_expressions = 1`, ClickHouse first tries to parse values with the fast stream parser. If it fails, ClickHouse tries to use the full parser for the data, treating it like an SQL [expression](#syntax-expressions).

Data can have any format. When a query is received, the server calculates no more than [max_query_size](../operations/settings/settings.md#settings-max_query_size) bytes of the request in RAM (by default, 1 MB), and the rest is stream parsed.
It allows for avoiding issues with large `INSERT` queries.
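A hedged sketch of the case the setting addresses, reusing the table from the example above; the second value is an expression rather than a plain literal, so the fast stream parser alone cannot handle it.

``` sql
-- With input_format_values_interpret_expressions = 1 the full parser is used as a fallback
-- and evaluates the expression; with the setting disabled this INSERT would be rejected.
SET input_format_values_interpret_expressions = 1;
INSERT INTO t VALUES (4, concat('Hello, ', 'again'));
```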
74  docs/en/sql-reference/table-functions/mongodb.md  Normal file
@ -0,0 +1,74 @@
---
slug: /en/sql-reference/table-functions/mongodb
sidebar_position: 42
sidebar_label: mongodb
---

# mongodb

Allows `SELECT` queries to be performed on data that is stored on a remote MongoDB server.

**Syntax**

``` sql
mongodb(host:port, database, collection, user, password, structure [, options])
```

**Arguments**

- `host:port` — MongoDB server address.
- `database` — Remote database name.
- `collection` — Remote collection name.
- `user` — MongoDB user.
- `password` — User password.
- `structure` — The schema for the ClickHouse table returned from this function.
- `options` — MongoDB connection string options (optional parameter).

**Returned Value**

A table object with the same columns as the original MongoDB table.

**Examples**

Suppose we have a collection named `my_collection` defined in a MongoDB database named `test`, and we insert a couple of documents:

```sql
db.createUser({user:"test_user",pwd:"password",roles:[{role:"readWrite",db:"test"}]})

db.createCollection("my_collection")

db.my_collection.insertOne(
    { log_type: "event", host: "120.5.33.9", command: "check-cpu-usage -w 75 -c 90" }
)

db.my_collection.insertOne(
    { log_type: "event", host: "120.5.33.4", command: "system-check"}
)
```

Let's query the collection using the `mongodb` table function:

```sql
SELECT * FROM mongodb(
    '127.0.0.1:27017',
    'test',
    'my_collection',
    'test_user',
    'password',
    'log_type String, host String, command String',
    'connectTimeoutMS=10000'
)
```

**See Also**

- [The `MongoDB` table engine](../../engines/table-engines/integrations/mongodb.md)
- [Using MongoDB as a dictionary source](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources/#mongodb)
@ -1997,6 +1997,21 @@ SELECT * FROM test_table
Default value: 0.

## optimize_skip_merged_partitions {#optimize-skip-merged-partitions}

Enables or disables an optimization for the [OPTIMIZE TABLE ... FINAL](../../sql-reference/statements/optimize.md) query when there is only one part with level > 0 and its TTL has not expired.

- `OPTIMIZE TABLE ... FINAL SETTINGS optimize_skip_merged_partitions=1`

By default, `OPTIMIZE TABLE ... FINAL` rewrites even a single part.

Possible values:

- 1 - Enabled
- 0 - Disabled

Default value: 0.

## optimize_functions_to_subcolumns {#optimize-functions-to-subcolumns}

Enables or disables an optimization that converts some functions to reading subcolumns, thus reducing the amount of data to read.
@ -248,10 +248,8 @@ EXPLAIN json = 1, description = 0, header = 1 SELECT 1, 2 + dummy;
- `Keys` — the array of columns used by the index.
- `Condition` — the condition used.
- `Description` — the index description (currently only used for `Skip` indexes).
- `Initial Parts` — the number of parts before the index is applied.
- `Parts` — the number of parts before/after the index is applied.
- `Selected Parts` — the number of parts after the index is applied.
- `Granules` — the number of granules before/after the index is applied.
- `Initial Granules` — the number of granules before the index is applied.
- `Selected Granulesis` — the number of granules after the index is applied.

Example:
@ -262,46 +260,36 @@ EXPLAIN json = 1, description = 0, header = 1 SELECT 1, 2 + dummy;
"Type": "MinMax",
"Keys": ["y"],
"Condition": "(y in [1, +inf))",
"Initial Parts": 5,
"Parts": 5/4,
"Selected Parts": 4,
"Granules": 12/11
"Initial Granules": 12,
"Selected Granules": 11
},
{
"Type": "Partition",
"Keys": ["y", "bitAnd(z, 3)"],
"Condition": "and((bitAnd(z, 3) not in [1, 1]), and((y in [1, +inf)), (bitAnd(z, 3) not in [1, 1])))",
"Initial Parts": 4,
"Parts": 4/3,
"Selected Parts": 3,
"Granules": 11/10
"Initial Granules": 11,
"Selected Granules": 10
},
{
"Type": "PrimaryKey",
"Keys": ["x", "y"],
"Condition": "and((x in [11, +inf)), (y in [1, +inf)))",
"Initial Parts": 3,
"Parts": 3/2,
"Selected Parts": 2,
"Granules": 10/6
"Initial Granules": 10,
"Selected Granules": 6
},
{
"Type": "Skip",
"Name": "t_minmax",
"Description": "minmax GRANULARITY 2",
"Initial Parts": 2,
"Parts": 2/1,
"Selected Parts": 1,
"Granules": 6/2
"Initial Granules": 6,
"Selected Granules": 2
},
{
"Type": "Skip",
"Name": "t_set",
"Description": "set GRANULARITY 2",
"Initial Parts": 1,
"Parts": 1/1,
"Selected Parts": 1,
"Granules": 2/1
"Initial Granules": 2,
"Selected Granules": 1
}
]
```
@ -24,7 +24,7 @@ OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION I
- By default, if the `OPTIMIZE` query fails to perform a merge,
ClickHouse does not notify the client. To enable notifications, use the [optimize_throw_if_noop](../../operations/settings/settings.md#setting-optimize_throw_if_noop) setting.
- If you specify `PARTITION`, only the specified partition is optimized. [How to specify the partition expression in queries](alter/index.md#alter-how-to-specify-part-expr).
- If you specify `FINAL`, optimization is performed even when all the data is already in one part. Also merge is forced even if concurrent merges are performed.
- If you specify `FINAL`, optimization is performed even when all the data is already in one part. This can be controlled with the [optimize_skip_merged_partitions](../../operations/settings/settings.md#optimize-skip-merged-partitions) setting. Also, the merge is forced even if concurrent merges are performed.
- If you specify `DEDUPLICATE`, completely identical rows are collapsed (values in all columns are compared); this makes sense only for the MergeTree engine.

You can specify how long (in seconds) to wait for inactive replicas to execute `OPTIMIZE` queries with the [replication_wait_for_inactive_replica_timeout](../../operations/settings/settings.md#replication-wait-for-inactive-replica-timeout) setting.
@ -196,4 +196,5 @@ SELECT * FROM example;
┌─primary_key─┬─secondary_key─┬─value─┬─partition_key─┐
│           1 │             1 │     2 │             3 │
└─────────────┴───────────────┴───────┴───────────────┘
```
@ -334,6 +334,19 @@
<max_thread_pool_size>10000</max_thread_pool_size>

<!-- Configure other thread pools: -->
<!--
    <background_buffer_flush_schedule_pool_size>16</background_buffer_flush_schedule_pool_size>
    <background_pool_size>16</background_pool_size>
    <background_merges_mutations_concurrency_ratio>2</background_merges_mutations_concurrency_ratio>
    <background_move_pool_size>8</background_move_pool_size>
    <background_fetches_pool_size>8</background_fetches_pool_size>
    <background_common_pool_size>8</background_common_pool_size>
    <background_schedule_pool_size>128</background_schedule_pool_size>
    <background_message_broker_schedule_pool_size>16</background_message_broker_schedule_pool_size>
    <background_distributed_schedule_pool_size>16</background_distributed_schedule_pool_size>
-->

<!-- Number of workers to recycle connections in background (see also drain_timeout).
     If the pool is full, connection will be drained synchronously. -->
<!-- <max_threads_for_connection_collector>10</max_threads_for_connection_collector> -->
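If you want to sanity-check how busy these background pools are at runtime, one option (an assumption on my part, not something this config file documents) is to query the server metrics:

``` sql
-- Background pool activity is exposed via system.metrics; the LIKE filter is kept broad
-- because the exact metric names vary between server versions.
SELECT metric, value
FROM system.metrics
WHERE metric ILIKE '%background%';
```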
@ -71,7 +71,15 @@ void ColumnNode::dumpTreeImpl(WriteBuffer & buffer, FormatState & state, size_t
bool ColumnNode::isEqualImpl(const IQueryTreeNode & rhs) const
{
    const auto & rhs_typed = assert_cast<const ColumnNode &>(rhs);
    return column == rhs_typed.column;

    auto source = getColumnSourceOrNull();
    auto rhs_source = rhs_typed.getColumnSourceOrNull();
    if (source && !rhs_source)
        return false;
    if (!source && rhs_source)
        return false;

    return column == rhs_typed.column && (!source || source->isEqual(*rhs_source));
}

void ColumnNode::updateTreeHashImpl(HashState & hash_state) const
@ -1,14 +1,15 @@
#pragma once

#include <optional>
#include <base/scope_guard.h>
#include <utility>
#include <Common/SettingsChanges.h>
#include <Common/Exception.h>
#include <Core/Settings.h>

#include <Analyzer/IQueryTreeNode.h>
#include <Analyzer/QueryNode.h>
#include <Analyzer/UnionNode.h>

#include <Interpreters/Context.h>

namespace DB
{
@ -89,4 +90,134 @@ private:
template <typename Derived>
using ConstInDepthQueryTreeVisitor = InDepthQueryTreeVisitor<Derived, true /*const_visitor*/>;

/** Same as InDepthQueryTreeVisitor and additionally keeps track of current scope context.
  * This can be useful if your visitor has special logic that depends on current scope context.
  */
template <typename Derived, bool const_visitor = false>
class InDepthQueryTreeVisitorWithContext
{
public:
    using VisitQueryTreeNodeType = std::conditional_t<const_visitor, const QueryTreeNodePtr, QueryTreeNodePtr>;

    explicit InDepthQueryTreeVisitorWithContext(ContextPtr context)
        : current_context(std::move(context))
    {}

    /// Return true if visitor should traverse tree top to bottom, false otherwise
    bool shouldTraverseTopToBottom() const
    {
        return true;
    }

    /// Return true if visitor should visit child, false otherwise
    bool needChildVisit(VisitQueryTreeNodeType & parent [[maybe_unused]], VisitQueryTreeNodeType & child [[maybe_unused]])
    {
        return true;
    }

    const ContextPtr & getContext() const
    {
        return current_context;
    }

    const Settings & getSettings() const
    {
        return current_context->getSettingsRef();
    }

    void visit(VisitQueryTreeNodeType & query_tree_node)
    {
        auto current_scope_context_ptr = current_context;
        SCOPE_EXIT(
            current_context = std::move(current_scope_context_ptr);
        );

        if (auto * query_node = query_tree_node->template as<QueryNode>())
            current_context = query_node->getContext();
        else if (auto * union_node = query_tree_node->template as<UnionNode>())
            current_context = union_node->getContext();

        bool traverse_top_to_bottom = getDerived().shouldTraverseTopToBottom();
        if (!traverse_top_to_bottom)
            visitChildren(query_tree_node);

        getDerived().visitImpl(query_tree_node);

        if (traverse_top_to_bottom)
            visitChildren(query_tree_node);
    }
private:
    Derived & getDerived()
    {
        return *static_cast<Derived *>(this);
    }

    const Derived & getDerived() const
    {
        return *static_cast<Derived *>(this);
    }

    void visitChildren(VisitQueryTreeNodeType & expression)
    {
        for (auto & child : expression->getChildren())
        {
            if (!child)
                continue;

            bool need_visit_child = getDerived().needChildVisit(expression, child);

            if (need_visit_child)
                visit(child);
        }
    }

    ContextPtr current_context;
};

template <typename Derived>
using ConstInDepthQueryTreeVisitorWithContext = InDepthQueryTreeVisitorWithContext<Derived, true /*const_visitor*/>;

/** Visitor that uses another visitor to visit a node only if the condition for visiting the node is true.
  * For example, your visitor needs to visit only query tree nodes or union nodes.
  *
  * Condition interface:
  * struct Condition
  * {
  *     bool operator()(VisitQueryTreeNodeType & node)
  *     {
  *         return shouldNestedVisitorVisitNode(node);
  *     }
  * }
  */
template <typename Visitor, typename Condition, bool const_visitor = false>
class InDepthQueryTreeConditionalVisitor : public InDepthQueryTreeVisitor<InDepthQueryTreeConditionalVisitor<Visitor, Condition, const_visitor>, const_visitor>
{
public:
    using Base = InDepthQueryTreeVisitor<InDepthQueryTreeConditionalVisitor<Visitor, Condition, const_visitor>, const_visitor>;
    using VisitQueryTreeNodeType = typename Base::VisitQueryTreeNodeType;

    explicit InDepthQueryTreeConditionalVisitor(Visitor & visitor_, Condition & condition_)
        : visitor(visitor_)
        , condition(condition_)
    {
    }

    bool shouldTraverseTopToBottom() const
    {
        return visitor.shouldTraverseTopToBottom();
    }

    void visitImpl(VisitQueryTreeNodeType & query_tree_node)
    {
        if (condition(query_tree_node))
            visitor.visit(query_tree_node);
    }

    Visitor & visitor;
    Condition & condition;
};

template <typename Visitor, typename Condition>
using ConstInDepthQueryTreeConditionalVisitor = InDepthQueryTreeConditionalVisitor<Visitor, Condition, true /*const_visitor*/>;

}
@ -45,12 +45,11 @@ Field zeroField(const Field & value)
|
|||||||
* TODO: Support `groupBitAnd`, `groupBitOr`, `groupBitXor` functions.
|
* TODO: Support `groupBitAnd`, `groupBitOr`, `groupBitXor` functions.
|
||||||
* TODO: Support rewrite `f((2 * n) * n)` into '2 * f(n * n)'.
|
* TODO: Support rewrite `f((2 * n) * n)` into '2 * f(n * n)'.
|
||||||
*/
|
*/
|
||||||
class AggregateFunctionsArithmericOperationsVisitor : public InDepthQueryTreeVisitor<AggregateFunctionsArithmericOperationsVisitor>
|
class AggregateFunctionsArithmericOperationsVisitor : public InDepthQueryTreeVisitorWithContext<AggregateFunctionsArithmericOperationsVisitor>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
explicit AggregateFunctionsArithmericOperationsVisitor(ContextPtr context_)
|
using Base = InDepthQueryTreeVisitorWithContext<AggregateFunctionsArithmericOperationsVisitor>;
|
||||||
: context(std::move(context_))
|
using Base::Base;
|
||||||
{}
|
|
||||||
|
|
||||||
/// Traverse tree bottom to top
|
/// Traverse tree bottom to top
|
||||||
static bool shouldTraverseTopToBottom()
|
static bool shouldTraverseTopToBottom()
|
||||||
@ -60,6 +59,9 @@ public:
|
|||||||
|
|
||||||
void visitImpl(QueryTreeNodePtr & node)
|
void visitImpl(QueryTreeNodePtr & node)
|
||||||
{
|
{
|
||||||
|
if (!getSettings().optimize_arithmetic_operations_in_aggregate_functions)
|
||||||
|
return;
|
||||||
|
|
||||||
auto * aggregate_function_node = node->as<FunctionNode>();
|
auto * aggregate_function_node = node->as<FunctionNode>();
|
||||||
if (!aggregate_function_node || !aggregate_function_node->isAggregateFunction())
|
if (!aggregate_function_node || !aggregate_function_node->isAggregateFunction())
|
||||||
return;
|
return;
|
||||||
@ -175,7 +177,7 @@ private:
|
|||||||
|
|
||||||
inline void resolveOrdinaryFunctionNode(FunctionNode & function_node, const String & function_name) const
|
inline void resolveOrdinaryFunctionNode(FunctionNode & function_node, const String & function_name) const
|
||||||
{
|
{
|
||||||
auto function = FunctionFactory::instance().get(function_name, context);
|
auto function = FunctionFactory::instance().get(function_name, getContext());
|
||||||
function_node.resolveAsFunction(function->build(function_node.getArgumentColumns()));
|
function_node.resolveAsFunction(function->build(function_node.getArgumentColumns()));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -191,8 +193,6 @@ private:
|
|||||||
|
|
||||||
function_node.resolveAsAggregateFunction(std::move(aggregate_function));
|
function_node.resolveAsAggregateFunction(std::move(aggregate_function));
|
||||||
}
|
}
|
||||||
|
|
||||||
ContextPtr context;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,17 +1,23 @@
|
|||||||
|
#include <Analyzer/Passes/ConvertOrLikeChainPass.h>
|
||||||
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <Analyzer/Passes/ConvertOrLikeChainPass.h>
|
|
||||||
|
#include <Core/Field.h>
|
||||||
|
|
||||||
|
#include <DataTypes/DataTypesNumber.h>
|
||||||
|
|
||||||
|
#include <Functions/FunctionFactory.h>
|
||||||
|
#include <Functions/likePatternToRegexp.h>
|
||||||
|
|
||||||
|
#include <Interpreters/Context.h>
|
||||||
|
|
||||||
#include <Analyzer/ConstantNode.h>
|
#include <Analyzer/ConstantNode.h>
|
||||||
#include <Analyzer/UnionNode.h>
|
#include <Analyzer/UnionNode.h>
|
||||||
#include <Analyzer/FunctionNode.h>
|
#include <Analyzer/FunctionNode.h>
|
||||||
#include <Analyzer/HashUtils.h>
|
#include <Analyzer/HashUtils.h>
|
||||||
#include <Analyzer/InDepthQueryTreeVisitor.h>
|
#include <Analyzer/InDepthQueryTreeVisitor.h>
|
||||||
#include <Core/Field.h>
|
|
||||||
#include <DataTypes/DataTypesNumber.h>
|
|
||||||
#include <Functions/FunctionFactory.h>
|
|
||||||
#include <Functions/likePatternToRegexp.h>
|
|
||||||
#include <Interpreters/Context.h>
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
@ -19,36 +25,28 @@ namespace DB
|
|||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
|
|
||||||
class ConvertOrLikeChainVisitor : public InDepthQueryTreeVisitor<ConvertOrLikeChainVisitor>
|
class ConvertOrLikeChainVisitor : public InDepthQueryTreeVisitorWithContext<ConvertOrLikeChainVisitor>
|
||||||
{
|
{
|
||||||
using FunctionNodes = std::vector<std::shared_ptr<FunctionNode>>;
|
|
||||||
|
|
||||||
const FunctionOverloadResolverPtr match_function_ref;
|
|
||||||
const FunctionOverloadResolverPtr or_function_resolver;
|
|
||||||
public:
|
public:
|
||||||
|
using Base = InDepthQueryTreeVisitorWithContext<ConvertOrLikeChainVisitor>;
|
||||||
|
using Base::Base;
|
||||||
|
|
||||||
explicit ConvertOrLikeChainVisitor(ContextPtr context)
|
explicit ConvertOrLikeChainVisitor(FunctionOverloadResolverPtr or_function_resolver_,
|
||||||
: InDepthQueryTreeVisitor<ConvertOrLikeChainVisitor>()
|
FunctionOverloadResolverPtr match_function_resolver_,
|
||||||
, match_function_ref(FunctionFactory::instance().get("multiMatchAny", context))
|
ContextPtr context)
|
||||||
, or_function_resolver(FunctionFactory::instance().get("or", context))
|
: Base(std::move(context))
|
||||||
|
, or_function_resolver(std::move(or_function_resolver_))
|
||||||
|
, match_function_resolver(std::move(match_function_resolver_))
|
||||||
{}
|
{}
|
||||||
|
|
||||||
static bool needChildVisit(VisitQueryTreeNodeType & parent, VisitQueryTreeNodeType &)
|
bool needChildVisit(VisitQueryTreeNodeType &, VisitQueryTreeNodeType &)
|
||||||
{
|
{
|
||||||
ContextPtr context;
|
const auto & settings = getSettings();
|
||||||
if (auto * query = parent->as<QueryNode>())
|
|
||||||
context = query->getContext();
|
return settings.optimize_or_like_chain
|
||||||
else if (auto * union_node = parent->as<UnionNode>())
|
&& settings.allow_hyperscan
|
||||||
context = union_node->getContext();
|
&& settings.max_hyperscan_regexp_length == 0
|
||||||
if (context)
|
&& settings.max_hyperscan_regexp_total_length == 0;
|
||||||
{
|
|
||||||
const auto & settings = context->getSettingsRef();
|
|
||||||
return settings.optimize_or_like_chain
|
|
||||||
&& settings.allow_hyperscan
|
|
||||||
&& settings.max_hyperscan_regexp_length == 0
|
|
||||||
&& settings.max_hyperscan_regexp_total_length == 0;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void visitImpl(QueryTreeNodePtr & node)
|
void visitImpl(QueryTreeNodePtr & node)
|
||||||
@ -61,27 +59,28 @@ public:
|
|||||||
|
|
||||||
QueryTreeNodePtrWithHashMap<Array> node_to_patterns;
|
QueryTreeNodePtrWithHashMap<Array> node_to_patterns;
|
||||||
FunctionNodes match_functions;
|
FunctionNodes match_functions;
|
||||||
for (auto & arg : function_node->getArguments())
|
|
||||||
{
|
|
||||||
unique_elems.push_back(arg);
|
|
||||||
|
|
||||||
auto * arg_func = arg->as<FunctionNode>();
|
for (auto & argument : function_node->getArguments())
|
||||||
if (!arg_func)
|
{
|
||||||
|
unique_elems.push_back(argument);
|
||||||
|
|
||||||
|
auto * argument_function = argument->as<FunctionNode>();
|
||||||
|
if (!argument_function)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
const bool is_like = arg_func->getFunctionName() == "like";
|
const bool is_like = argument_function->getFunctionName() == "like";
|
||||||
const bool is_ilike = arg_func->getFunctionName() == "ilike";
|
const bool is_ilike = argument_function->getFunctionName() == "ilike";
|
||||||
|
|
||||||
/// Not {i}like -> bail out.
|
/// Not {i}like -> bail out.
|
||||||
if (!is_like && !is_ilike)
|
if (!is_like && !is_ilike)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
const auto & like_arguments = arg_func->getArguments().getNodes();
|
const auto & like_arguments = argument_function->getArguments().getNodes();
|
||||||
if (like_arguments.size() != 2)
|
if (like_arguments.size() != 2)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
auto identifier = like_arguments[0];
|
const auto & like_first_argument = like_arguments[0];
|
||||||
auto * pattern = like_arguments[1]->as<ConstantNode>();
|
const auto * pattern = like_arguments[1]->as<ConstantNode>();
|
||||||
if (!pattern || !isString(pattern->getResultType()))
|
if (!pattern || !isString(pattern->getResultType()))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
@ -91,17 +90,20 @@ public:
|
|||||||
regexp = "(?i)" + regexp;
|
regexp = "(?i)" + regexp;
|
||||||
|
|
||||||
unique_elems.pop_back();
|
unique_elems.pop_back();
|
||||||
auto it = node_to_patterns.find(identifier);
|
|
||||||
|
auto it = node_to_patterns.find(like_first_argument);
|
||||||
if (it == node_to_patterns.end())
|
if (it == node_to_patterns.end())
|
||||||
{
|
{
|
||||||
it = node_to_patterns.insert({identifier, Array{}}).first;
|
it = node_to_patterns.insert({like_first_argument, Array{}}).first;
|
||||||
|
|
||||||
/// The second argument will be added when all patterns are known.
|
/// The second argument will be added when all patterns are known.
|
||||||
auto match_function = std::make_shared<FunctionNode>("multiMatchAny");
|
auto match_function = std::make_shared<FunctionNode>("multiMatchAny");
|
||||||
match_function->getArguments().getNodes().push_back(identifier);
|
match_function->getArguments().getNodes().push_back(like_first_argument);
|
||||||
|
|
||||||
match_functions.push_back(match_function);
|
match_functions.push_back(match_function);
|
||||||
|
|
||||||
unique_elems.push_back(std::move(match_function));
|
unique_elems.push_back(std::move(match_function));
|
||||||
}
|
}
|
||||||
|
|
||||||
it->second.push_back(regexp);
|
it->second.push_back(regexp);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -111,23 +113,29 @@ public:
|
|||||||
auto & arguments = match_function->getArguments().getNodes();
|
auto & arguments = match_function->getArguments().getNodes();
|
||||||
auto & patterns = node_to_patterns.at(arguments[0]);
|
auto & patterns = node_to_patterns.at(arguments[0]);
|
||||||
arguments.push_back(std::make_shared<ConstantNode>(Field{std::move(patterns)}));
|
arguments.push_back(std::make_shared<ConstantNode>(Field{std::move(patterns)}));
|
||||||
match_function->resolveAsFunction(match_function_ref);
|
match_function->resolveAsFunction(match_function_resolver);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// OR must have at least two arguments.
|
/// OR must have at least two arguments.
|
||||||
if (unique_elems.size() == 1)
|
if (unique_elems.size() == 1)
|
||||||
unique_elems.push_back(std::make_shared<ConstantNode>(false));
|
unique_elems.push_back(std::make_shared<ConstantNode>(static_cast<UInt8>(0)));
|
||||||
|
|
||||||
function_node->getArguments().getNodes() = std::move(unique_elems);
|
function_node->getArguments().getNodes() = std::move(unique_elems);
|
||||||
function_node->resolveAsFunction(or_function_resolver);
|
function_node->resolveAsFunction(or_function_resolver);
|
||||||
}
|
}
|
||||||
|
private:
|
||||||
|
using FunctionNodes = std::vector<std::shared_ptr<FunctionNode>>;
|
||||||
|
const FunctionOverloadResolverPtr or_function_resolver;
|
||||||
|
const FunctionOverloadResolverPtr match_function_resolver;
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void ConvertOrLikeChainPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context)
|
void ConvertOrLikeChainPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context)
|
||||||
{
|
{
|
||||||
ConvertOrLikeChainVisitor visitor(context);
|
auto or_function_resolver = FunctionFactory::instance().get("or", context);
|
||||||
|
auto match_function_resolver = FunctionFactory::instance().get("multiMatchAny", context);
|
||||||
|
ConvertOrLikeChainVisitor visitor(std::move(or_function_resolver), std::move(match_function_resolver), std::move(context));
|
||||||
visitor.visit(query_tree_node);
|
visitor.visit(query_tree_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -16,11 +16,17 @@ namespace DB
|
|||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
|
|
||||||
class CountDistinctVisitor : public InDepthQueryTreeVisitor<CountDistinctVisitor>
|
class CountDistinctVisitor : public InDepthQueryTreeVisitorWithContext<CountDistinctVisitor>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
static void visitImpl(QueryTreeNodePtr & node)
|
using Base = InDepthQueryTreeVisitorWithContext<CountDistinctVisitor>;
|
||||||
|
using Base::Base;
|
||||||
|
|
||||||
|
void visitImpl(QueryTreeNodePtr & node)
|
||||||
{
|
{
|
||||||
|
if (!getSettings().count_distinct_optimization)
|
||||||
|
return;
|
||||||
|
|
||||||
auto * query_node = node->as<QueryNode>();
|
auto * query_node = node->as<QueryNode>();
|
||||||
|
|
||||||
/// Check that query has only SELECT clause
|
/// Check that query has only SELECT clause
|
||||||
@ -78,9 +84,9 @@ public:
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void CountDistinctPass::run(QueryTreeNodePtr query_tree_node, ContextPtr)
|
void CountDistinctPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context)
|
||||||
{
|
{
|
||||||
CountDistinctVisitor visitor;
|
CountDistinctVisitor visitor(std::move(context));
|
||||||
visitor.visit(query_tree_node);
|
visitor.visit(query_tree_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -16,12 +16,11 @@ namespace DB
|
|||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
|
|
||||||
class CustomizeFunctionsVisitor : public InDepthQueryTreeVisitor<CustomizeFunctionsVisitor>
|
class CustomizeFunctionsVisitor : public InDepthQueryTreeVisitorWithContext<CustomizeFunctionsVisitor>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
explicit CustomizeFunctionsVisitor(ContextPtr & context_)
|
using Base = InDepthQueryTreeVisitorWithContext<CustomizeFunctionsVisitor>;
|
||||||
: context(context_)
|
using Base::Base;
|
||||||
{}
|
|
||||||
|
|
||||||
void visitImpl(QueryTreeNodePtr & node) const
|
void visitImpl(QueryTreeNodePtr & node) const
|
||||||
{
|
{
|
||||||
@ -29,7 +28,7 @@ public:
|
|||||||
if (!function_node)
|
if (!function_node)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
const auto & settings = context->getSettingsRef();
|
const auto & settings = getSettings();
|
||||||
|
|
||||||
/// After successful function replacement function name and function name lowercase must be recalculated
|
/// After successful function replacement function name and function name lowercase must be recalculated
|
||||||
auto function_name = function_node->getFunctionName();
|
auto function_name = function_node->getFunctionName();
|
||||||
@ -154,19 +153,16 @@ public:
|
|||||||
|
|
||||||
inline void resolveOrdinaryFunctionNode(FunctionNode & function_node, const String & function_name) const
|
inline void resolveOrdinaryFunctionNode(FunctionNode & function_node, const String & function_name) const
|
||||||
{
|
{
|
||||||
auto function = FunctionFactory::instance().get(function_name, context);
|
auto function = FunctionFactory::instance().get(function_name, getContext());
|
||||||
function_node.resolveAsFunction(function->build(function_node.getArgumentColumns()));
|
function_node.resolveAsFunction(function->build(function_node.getArgumentColumns()));
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
|
||||||
ContextPtr & context;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void CustomizeFunctionsPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context)
|
void CustomizeFunctionsPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context)
|
||||||
{
|
{
|
||||||
CustomizeFunctionsVisitor visitor(context);
|
CustomizeFunctionsVisitor visitor(std::move(context));
|
||||||
visitor.visit(query_tree_node);
|
visitor.visit(query_tree_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -22,15 +22,17 @@ namespace DB
|
|||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
|
|
||||||
class FunctionToSubcolumnsVisitor : public InDepthQueryTreeVisitor<FunctionToSubcolumnsVisitor>
|
class FunctionToSubcolumnsVisitor : public InDepthQueryTreeVisitorWithContext<FunctionToSubcolumnsVisitor>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
explicit FunctionToSubcolumnsVisitor(ContextPtr & context_)
|
using Base = InDepthQueryTreeVisitorWithContext<FunctionToSubcolumnsVisitor>;
|
||||||
: context(context_)
|
using Base::Base;
|
||||||
{}
|
|
||||||
|
|
||||||
void visitImpl(QueryTreeNodePtr & node) const
|
void visitImpl(QueryTreeNodePtr & node) const
|
||||||
{
|
{
|
||||||
|
if (!getSettings().optimize_functions_to_subcolumns)
|
||||||
|
return;
|
||||||
|
|
||||||
auto * function_node = node->as<FunctionNode>();
|
auto * function_node = node->as<FunctionNode>();
|
||||||
if (!function_node)
|
if (!function_node)
|
||||||
return;
|
return;
|
||||||
@ -192,11 +194,9 @@ public:
|
|||||||
private:
|
private:
|
||||||
inline void resolveOrdinaryFunctionNode(FunctionNode & function_node, const String & function_name) const
|
inline void resolveOrdinaryFunctionNode(FunctionNode & function_node, const String & function_name) const
|
||||||
{
|
{
|
||||||
auto function = FunctionFactory::instance().get(function_name, context);
|
auto function = FunctionFactory::instance().get(function_name, getContext());
|
||||||
function_node.resolveAsFunction(function->build(function_node.getArgumentColumns()));
|
function_node.resolveAsFunction(function->build(function_node.getArgumentColumns()));
|
||||||
}
|
}
|
||||||
|
|
||||||
ContextPtr & context;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -5,7 +5,7 @@
namespace DB
{

/** Transform functions to subcolumns.
/** Transform functions to subcolumns. Enabled using setting optimize_functions_to_subcolumns.
  * It can help to reduce amount of read data.
  *
  * Example: SELECT tupleElement(column, subcolumn) FROM test_table;
@ -26,16 +26,22 @@ namespace ErrorCodes
|
|||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
|
|
||||||
class FuseFunctionsVisitor : public InDepthQueryTreeVisitor<FuseFunctionsVisitor>
|
class FuseFunctionsVisitor : public InDepthQueryTreeVisitorWithContext<FuseFunctionsVisitor>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
using Base = InDepthQueryTreeVisitorWithContext<FuseFunctionsVisitor>;
|
||||||
|
using Base::Base;
|
||||||
|
|
||||||
explicit FuseFunctionsVisitor(const std::unordered_set<String> names_to_collect_)
|
explicit FuseFunctionsVisitor(const std::unordered_set<String> names_to_collect_, ContextPtr context)
|
||||||
: names_to_collect(names_to_collect_)
|
: Base(std::move(context))
|
||||||
|
, names_to_collect(names_to_collect_)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
void visitImpl(QueryTreeNodePtr & node)
|
void visitImpl(QueryTreeNodePtr & node)
|
||||||
{
|
{
|
||||||
|
if (!getSettings().optimize_syntax_fuse_functions)
|
||||||
|
return;
|
||||||
|
|
||||||
auto * function_node = node->as<FunctionNode>();
|
auto * function_node = node->as<FunctionNode>();
|
||||||
if (!function_node || !function_node->isAggregateFunction() || !names_to_collect.contains(function_node->getFunctionName()))
|
if (!function_node || !function_node->isAggregateFunction() || !names_to_collect.contains(function_node->getFunctionName()))
|
||||||
return;
|
return;
|
||||||
@ -201,7 +207,7 @@ FunctionNodePtr createFusedQuantilesNode(std::vector<QueryTreeNodePtr *> & nodes
|
|||||||
|
|
||||||
void tryFuseSumCountAvg(QueryTreeNodePtr query_tree_node, ContextPtr context)
|
void tryFuseSumCountAvg(QueryTreeNodePtr query_tree_node, ContextPtr context)
|
||||||
{
|
{
|
||||||
FuseFunctionsVisitor visitor({"sum", "count", "avg"});
|
FuseFunctionsVisitor visitor({"sum", "count", "avg"}, context);
|
||||||
visitor.visit(query_tree_node);
|
visitor.visit(query_tree_node);
|
||||||
|
|
||||||
for (auto & [argument, nodes] : visitor.argument_to_functions_mapping)
|
for (auto & [argument, nodes] : visitor.argument_to_functions_mapping)
|
||||||
@ -220,7 +226,7 @@ void tryFuseSumCountAvg(QueryTreeNodePtr query_tree_node, ContextPtr context)
|
|||||||
|
|
||||||
void tryFuseQuantiles(QueryTreeNodePtr query_tree_node, ContextPtr context)
|
void tryFuseQuantiles(QueryTreeNodePtr query_tree_node, ContextPtr context)
|
||||||
{
|
{
|
||||||
FuseFunctionsVisitor visitor_quantile({"quantile"});
|
FuseFunctionsVisitor visitor_quantile({"quantile"}, context);
|
||||||
visitor_quantile.visit(query_tree_node);
|
visitor_quantile.visit(query_tree_node);
|
||||||
|
|
||||||
for (auto & [argument, nodes_set] : visitor_quantile.argument_to_functions_mapping)
|
for (auto & [argument, nodes_set] : visitor_quantile.argument_to_functions_mapping)
|
||||||
|
@ -12,15 +12,22 @@ namespace DB
|
|||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
|
|
||||||
class IfChainToMultiIfPassVisitor : public InDepthQueryTreeVisitor<IfChainToMultiIfPassVisitor>
|
class IfChainToMultiIfPassVisitor : public InDepthQueryTreeVisitorWithContext<IfChainToMultiIfPassVisitor>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
explicit IfChainToMultiIfPassVisitor(FunctionOverloadResolverPtr multi_if_function_ptr_)
|
using Base = InDepthQueryTreeVisitorWithContext<IfChainToMultiIfPassVisitor>;
|
||||||
: multi_if_function_ptr(std::move(multi_if_function_ptr_))
|
using Base::Base;
|
||||||
|
|
||||||
|
explicit IfChainToMultiIfPassVisitor(FunctionOverloadResolverPtr multi_if_function_ptr_, ContextPtr context)
|
||||||
|
: Base(std::move(context))
|
||||||
|
, multi_if_function_ptr(std::move(multi_if_function_ptr_))
|
||||||
{}
|
{}
|
||||||
|
|
||||||
void visitImpl(QueryTreeNodePtr & node)
|
void visitImpl(QueryTreeNodePtr & node)
|
||||||
{
|
{
|
||||||
|
if (!getSettings().optimize_if_chain_to_multiif)
|
||||||
|
return;
|
||||||
|
|
||||||
auto * function_node = node->as<FunctionNode>();
|
auto * function_node = node->as<FunctionNode>();
|
||||||
if (!function_node || function_node->getFunctionName() != "if" || function_node->getArguments().getNodes().size() != 3)
|
if (!function_node || function_node->getFunctionName() != "if" || function_node->getArguments().getNodes().size() != 3)
|
||||||
return;
|
return;
|
||||||
@ -68,7 +75,8 @@ private:
|
|||||||
|
|
||||||
void IfChainToMultiIfPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context)
|
void IfChainToMultiIfPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context)
|
||||||
{
|
{
|
||||||
IfChainToMultiIfPassVisitor visitor(FunctionFactory::instance().get("multiIf", context));
|
auto multi_if_function_ptr = FunctionFactory::instance().get("multiIf", context);
|
||||||
|
IfChainToMultiIfPassVisitor visitor(std::move(multi_if_function_ptr), std::move(context));
|
||||||
visitor.visit(query_tree_node);
|
visitor.visit(query_tree_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
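Annotation (not part of this commit): IfChainToMultiIfPass collapses nested if(c1, a, if(c2, b, d)) calls into a single multiIf(c1, a, c2, b, d). A standalone sketch of the flattening step over a toy expression type; Expr and its fields are illustrative, not the ClickHouse node API, and the real pass applies further checks (including the optimize_if_chain_to_multiif setting shown above) before rewriting.

#include <memory>
#include <string>
#include <vector>

struct Expr
{
    std::string name;                         // "if", "multiIf", or a leaf such as "cond1" / "value1"
    std::vector<std::shared_ptr<Expr>> args;  // for "if": condition, then-branch, else-branch
};
using ExprPtr = std::shared_ptr<Expr>;

// Flatten a right-nested chain of if(cond, then, else) into one multiIf call:
// if(c1, a, if(c2, b, d))  ->  multiIf(c1, a, c2, b, d)
ExprPtr ifChainToMultiIf(const ExprPtr & root)
{
    std::vector<ExprPtr> multi_if_args;
    ExprPtr current = root;
    while (current && current->name == "if" && current->args.size() == 3)
    {
        multi_if_args.push_back(current->args[0]); // condition
        multi_if_args.push_back(current->args[1]); // value if the condition is true
        current = current->args[2];                // descend into the else branch
    }
    multi_if_args.push_back(current);              // final "else" value
    return std::make_shared<Expr>(Expr{"multiIf", std::move(multi_if_args)});
}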
@ -107,21 +107,24 @@ void wrapIntoToString(FunctionNode & function_node, QueryTreeNodePtr arg, Contex
|
|||||||
assert(isString(function_node.getResultType()));
|
assert(isString(function_node.getResultType()));
|
||||||
}
|
}
|
||||||
|
|
||||||
class ConvertStringsToEnumVisitor : public InDepthQueryTreeVisitor<ConvertStringsToEnumVisitor>
|
class ConvertStringsToEnumVisitor : public InDepthQueryTreeVisitorWithContext<ConvertStringsToEnumVisitor>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
explicit ConvertStringsToEnumVisitor(ContextPtr context_)
|
using Base = InDepthQueryTreeVisitorWithContext<ConvertStringsToEnumVisitor>;
|
||||||
: context(std::move(context_))
|
using Base::Base;
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
void visitImpl(QueryTreeNodePtr & node)
|
void visitImpl(QueryTreeNodePtr & node)
|
||||||
{
|
{
|
||||||
|
if (!getSettings().optimize_if_transform_strings_to_enum)
|
||||||
|
return;
|
||||||
|
|
||||||
auto * function_node = node->as<FunctionNode>();
|
auto * function_node = node->as<FunctionNode>();
|
||||||
|
|
||||||
if (!function_node)
|
if (!function_node)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
const auto & context = getContext();
|
||||||
|
|
||||||
/// to preserve return type (String) of the current function_node, we wrap the newly
|
/// to preserve return type (String) of the current function_node, we wrap the newly
|
||||||
/// generated function nodes into toString
|
/// generated function nodes into toString
|
||||||
|
|
||||||
@ -198,16 +201,13 @@ public:
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
|
||||||
ContextPtr context;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void IfTransformStringsToEnumPass::run(QueryTreeNodePtr query, ContextPtr context)
|
void IfTransformStringsToEnumPass::run(QueryTreeNodePtr query, ContextPtr context)
|
||||||
{
|
{
|
||||||
ConvertStringsToEnumVisitor visitor(context);
|
ConvertStringsToEnumVisitor visitor(std::move(context));
|
||||||
visitor.visit(query);
|
visitor.visit(query);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
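Annotation (not part of this commit): ConvertStringsToEnumVisitor replaces string literals returned by functions such as if() with enum codes and, per the comment above, wraps the result back into toString so the expression still returns String. A small standalone illustration of that round trip; the std::map objects stand in for the synthesized Enum data type and the literal values are made up.

#include <cassert>
#include <cstdint>
#include <map>
#include <string>

int main()
{
    // Enum mapping the pass would synthesize for the literals 'apple' and 'orange'.
    std::map<std::string, int8_t> to_enum{{"apple", 1}, {"orange", 2}};
    std::map<int8_t, std::string> to_string{{1, "apple"}, {2, "orange"}};

    bool cond = true;
    // Original expression: if(cond, 'apple', 'orange') -> a String
    std::string original = cond ? "apple" : "orange";

    // Rewritten expression: toString(if(cond, Enum('apple'), Enum('orange')))
    int8_t as_enum = cond ? to_enum["apple"] : to_enum["orange"];
    std::string rewritten = to_string[as_enum];

    assert(original == rewritten); // same visible result, but comparisons and grouping run on Int8 codes
}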
@ -10,15 +10,22 @@ namespace DB
|
|||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
|
|
||||||
class MultiIfToIfVisitor : public InDepthQueryTreeVisitor<MultiIfToIfVisitor>
|
class MultiIfToIfVisitor : public InDepthQueryTreeVisitorWithContext<MultiIfToIfVisitor>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
explicit MultiIfToIfVisitor(FunctionOverloadResolverPtr if_function_ptr_)
|
using Base = InDepthQueryTreeVisitorWithContext<MultiIfToIfVisitor>;
|
||||||
: if_function_ptr(if_function_ptr_)
|
using Base::Base;
|
||||||
|
|
||||||
|
explicit MultiIfToIfVisitor(FunctionOverloadResolverPtr if_function_ptr_, ContextPtr context)
|
||||||
|
: Base(std::move(context))
|
||||||
|
, if_function_ptr(std::move(if_function_ptr_))
|
||||||
{}
|
{}
|
||||||
|
|
||||||
void visitImpl(QueryTreeNodePtr & node)
|
void visitImpl(QueryTreeNodePtr & node)
|
||||||
{
|
{
|
||||||
|
if (!getSettings().optimize_multiif_to_if)
|
||||||
|
return;
|
||||||
|
|
||||||
auto * function_node = node->as<FunctionNode>();
|
auto * function_node = node->as<FunctionNode>();
|
||||||
if (!function_node || function_node->getFunctionName() != "multiIf")
|
if (!function_node || function_node->getFunctionName() != "multiIf")
|
||||||
return;
|
return;
|
||||||
@ -38,7 +45,8 @@ private:
|
|||||||
|
|
||||||
void MultiIfToIfPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context)
|
void MultiIfToIfPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context)
|
||||||
{
|
{
|
||||||
MultiIfToIfVisitor visitor(FunctionFactory::instance().get("if", context));
|
auto if_function_ptr = FunctionFactory::instance().get("if", context);
|
||||||
|
MultiIfToIfVisitor visitor(std::move(if_function_ptr), std::move(context));
|
||||||
visitor.visit(query_tree_node);
|
visitor.visit(query_tree_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -14,12 +14,17 @@ namespace DB
|
|||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
|
|
||||||
class NormalizeCountVariantsVisitor : public InDepthQueryTreeVisitor<NormalizeCountVariantsVisitor>
|
class NormalizeCountVariantsVisitor : public InDepthQueryTreeVisitorWithContext<NormalizeCountVariantsVisitor>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
explicit NormalizeCountVariantsVisitor(ContextPtr context_) : context(std::move(context_)) {}
|
using Base = InDepthQueryTreeVisitorWithContext<NormalizeCountVariantsVisitor>;
|
||||||
|
using Base::Base;
|
||||||
|
|
||||||
void visitImpl(QueryTreeNodePtr & node)
|
void visitImpl(QueryTreeNodePtr & node)
|
||||||
{
|
{
|
||||||
|
if (!getSettings().optimize_normalize_count_variants)
|
||||||
|
return;
|
||||||
|
|
||||||
auto * function_node = node->as<FunctionNode>();
|
auto * function_node = node->as<FunctionNode>();
|
||||||
if (!function_node || !function_node->isAggregateFunction() || (function_node->getFunctionName() != "count" && function_node->getFunctionName() != "sum"))
|
if (!function_node || !function_node->isAggregateFunction() || (function_node->getFunctionName() != "count" && function_node->getFunctionName() != "sum"))
|
||||||
return;
|
return;
|
||||||
@ -42,15 +47,13 @@ public:
|
|||||||
else if (function_node->getFunctionName() == "sum" &&
|
else if (function_node->getFunctionName() == "sum" &&
|
||||||
first_argument_constant_literal.getType() == Field::Types::UInt64 &&
|
first_argument_constant_literal.getType() == Field::Types::UInt64 &&
|
||||||
first_argument_constant_literal.get<UInt64>() == 1 &&
|
first_argument_constant_literal.get<UInt64>() == 1 &&
|
||||||
!context->getSettingsRef().aggregate_functions_null_for_empty)
|
!getSettings().aggregate_functions_null_for_empty)
|
||||||
{
|
{
|
||||||
resolveAsCountAggregateFunction(*function_node);
|
resolveAsCountAggregateFunction(*function_node);
|
||||||
function_node->getArguments().getNodes().clear();
|
function_node->getArguments().getNodes().clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
ContextPtr context;
|
|
||||||
|
|
||||||
static inline void resolveAsCountAggregateFunction(FunctionNode & function_node)
|
static inline void resolveAsCountAggregateFunction(FunctionNode & function_node)
|
||||||
{
|
{
|
||||||
AggregateFunctionProperties properties;
|
AggregateFunctionProperties properties;
|
||||||
|
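Annotation (not part of this commit): NormalizeCountVariantsVisitor rewrites count(<non-NULL literal>) to count() and sum(1) to count(), the latter only when aggregate_functions_null_for_empty is off, because on an empty set sum(1) would then return NULL while count() stays 0. A standalone sketch of that decision; the function and parameter names are illustrative.

#include <string>

enum class Rewrite { None, CountWithoutArguments };

// Decide whether an aggregate call over a single constant argument can be normalized to count().
Rewrite normalizeCountVariant(const std::string & function_name,
                              bool has_single_constant_argument,
                              bool literal_is_null,
                              bool literal_is_one,
                              bool aggregate_functions_null_for_empty)
{
    if (!has_single_constant_argument)
        return Rewrite::None;
    if (function_name == "count" && !literal_is_null)
        return Rewrite::CountWithoutArguments;   // count(1), count('x') -> count()
    if (function_name == "sum" && literal_is_one && !aggregate_functions_null_for_empty)
        return Rewrite::CountWithoutArguments;   // sum(1) -> count()
    return Rewrite::None;
}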
@ -1,26 +1,33 @@
|
|||||||
#include <Analyzer/Passes/OptimizeGroupByFunctionKeysPass.h>
|
#include <Analyzer/Passes/OptimizeGroupByFunctionKeysPass.h>
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <queue>
|
||||||
|
|
||||||
#include <Analyzer/FunctionNode.h>
|
#include <Analyzer/FunctionNode.h>
|
||||||
#include <Analyzer/HashUtils.h>
|
#include <Analyzer/HashUtils.h>
|
||||||
#include <Analyzer/IQueryTreeNode.h>
|
#include <Analyzer/IQueryTreeNode.h>
|
||||||
#include <Analyzer/InDepthQueryTreeVisitor.h>
|
#include <Analyzer/InDepthQueryTreeVisitor.h>
|
||||||
#include <Analyzer/QueryNode.h>
|
#include <Analyzer/QueryNode.h>
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
#include <queue>
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
|
|
||||||
class OptimizeGroupByFunctionKeysVisitor : public InDepthQueryTreeVisitor<OptimizeGroupByFunctionKeysVisitor>
|
class OptimizeGroupByFunctionKeysVisitor : public InDepthQueryTreeVisitorWithContext<OptimizeGroupByFunctionKeysVisitor>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
using Base = InDepthQueryTreeVisitorWithContext<OptimizeGroupByFunctionKeysVisitor>;
|
||||||
|
using Base::Base;
|
||||||
|
|
||||||
static bool needChildVisit(QueryTreeNodePtr & /*parent*/, QueryTreeNodePtr & child)
|
static bool needChildVisit(QueryTreeNodePtr & /*parent*/, QueryTreeNodePtr & child)
|
||||||
{
|
{
|
||||||
return !child->as<FunctionNode>();
|
return !child->as<FunctionNode>();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void visitImpl(QueryTreeNodePtr & node)
|
void visitImpl(QueryTreeNodePtr & node)
|
||||||
{
|
{
|
||||||
|
if (!getSettings().optimize_group_by_function_keys)
|
||||||
|
return;
|
||||||
|
|
||||||
auto * query = node->as<QueryNode>();
|
auto * query = node->as<QueryNode>();
|
||||||
if (!query)
|
if (!query)
|
||||||
return;
|
return;
|
||||||
@ -41,6 +48,11 @@ public:
|
|||||||
optimizeGroupingSet(group_by);
|
optimizeGroupingSet(group_by);
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
|
struct NodeWithInfo
|
||||||
|
{
|
||||||
|
QueryTreeNodePtr node;
|
||||||
|
bool parents_are_only_deterministic = false;
|
||||||
|
};
|
||||||
|
|
||||||
static bool canBeEliminated(QueryTreeNodePtr & node, const QueryTreeNodePtrWithHashSet & group_by_keys)
|
static bool canBeEliminated(QueryTreeNodePtr & node, const QueryTreeNodePtrWithHashSet & group_by_keys)
|
||||||
{
|
{
|
||||||
@ -48,16 +60,17 @@ private:
|
|||||||
if (!function || function->getArguments().getNodes().empty())
|
if (!function || function->getArguments().getNodes().empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
QueryTreeNodes candidates;
|
std::vector<NodeWithInfo> candidates;
|
||||||
auto & function_arguments = function->getArguments().getNodes();
|
auto & function_arguments = function->getArguments().getNodes();
|
||||||
|
bool is_deterministic = function->getFunction()->isDeterministicInScopeOfQuery();
|
||||||
for (auto it = function_arguments.rbegin(); it != function_arguments.rend(); ++it)
|
for (auto it = function_arguments.rbegin(); it != function_arguments.rend(); ++it)
|
||||||
candidates.push_back(*it);
|
candidates.push_back({ *it, is_deterministic });
|
||||||
|
|
||||||
// Using DFS we traverse function tree and try to find if it uses other keys as function arguments.
|
// Using DFS we traverse function tree and try to find if it uses other keys as function arguments.
|
||||||
// TODO: Also process CONSTANT here. We can simplify GROUP BY x, x + 1 to GROUP BY x.
|
// TODO: Also process CONSTANT here. We can simplify GROUP BY x, x + 1 to GROUP BY x.
|
||||||
while (!candidates.empty())
|
while (!candidates.empty())
|
||||||
{
|
{
|
||||||
auto candidate = candidates.back();
|
auto [candidate, parents_are_only_deterministic] = candidates.back();
|
||||||
candidates.pop_back();
|
candidates.pop_back();
|
||||||
|
|
||||||
bool found = group_by_keys.contains(candidate);
|
bool found = group_by_keys.contains(candidate);
|
||||||
@ -73,8 +86,9 @@ private:
|
|||||||
|
|
||||||
if (!found)
|
if (!found)
|
||||||
{
|
{
|
||||||
|
bool is_deterministic_function = parents_are_only_deterministic && function->getFunction()->isDeterministicInScopeOfQuery();
|
||||||
for (auto it = arguments.rbegin(); it != arguments.rend(); ++it)
|
for (auto it = arguments.rbegin(); it != arguments.rend(); ++it)
|
||||||
candidates.push_back(*it);
|
candidates.push_back({ *it, is_deterministic_function });
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -82,6 +96,10 @@ private:
|
|||||||
if (!found)
|
if (!found)
|
||||||
return false;
|
return false;
|
||||||
break;
|
break;
|
||||||
|
case QueryTreeNodeType::CONSTANT:
|
||||||
|
if (!parents_are_only_deterministic)
|
||||||
|
return false;
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -105,9 +123,10 @@ private:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
void OptimizeGroupByFunctionKeysPass::run(QueryTreeNodePtr query_tree_node, ContextPtr /*context*/)
|
void OptimizeGroupByFunctionKeysPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context)
|
||||||
{
|
{
|
||||||
OptimizeGroupByFunctionKeysVisitor().visit(query_tree_node);
|
OptimizeGroupByFunctionKeysVisitor visitor(std::move(context));
|
||||||
|
visitor.visit(query_tree_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
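Annotation (not part of this commit): canBeEliminated above walks a GROUP BY key that is a function call and drops it when all of its leaves are other GROUP BY keys; the new NodeWithInfo flag additionally admits constants, but only when every function on the path from the root is deterministic. A standalone sketch over a toy tree follows; Node, Kind and the comparison against plain column names are simplifications (the real pass compares whole subtrees against the GROUP BY key set).

#include <memory>
#include <set>
#include <string>
#include <vector>

struct Node
{
    enum class Kind { Function, Column, Constant } kind;
    std::string name;                              // function or column name
    bool is_deterministic = true;                  // only meaningful for functions
    std::vector<std::shared_ptr<Node>> arguments;  // only for functions
};
using NodePtr = std::shared_ptr<Node>;

// Returns true if `root` (a function used as a GROUP BY key) only combines other GROUP BY keys,
// plus constants that appear under deterministic functions only, and can therefore be dropped.
bool canBeEliminated(const NodePtr & root, const std::set<std::string> & group_by_key_columns)
{
    if (root->kind != Node::Kind::Function || root->arguments.empty())
        return false;

    struct Candidate { NodePtr node; bool parents_are_only_deterministic; };
    std::vector<Candidate> stack;
    for (const auto & arg : root->arguments)
        stack.push_back({arg, root->is_deterministic});

    while (!stack.empty())
    {
        auto [node, parents_are_only_deterministic] = stack.back();
        stack.pop_back();

        switch (node->kind)
        {
            case Node::Kind::Column:
                if (!group_by_key_columns.count(node->name))
                    return false;                  // uses a column that is not another key
                break;
            case Node::Kind::Constant:
                if (!parents_are_only_deterministic)
                    return false;                  // a constant under a non-deterministic call must stay
                break;
            case Node::Kind::Function:
            {
                bool deterministic_path = parents_are_only_deterministic && node->is_deterministic;
                for (const auto & arg : node->arguments)
                    stack.push_back({arg, deterministic_path});
                break;
            }
        }
    }
    return true;
}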
@ -1,11 +1,13 @@
|
|||||||
#include <Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h>
|
#include <Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h>
|
||||||
|
|
||||||
|
#include <Functions/IFunction.h>
|
||||||
|
|
||||||
#include <Analyzer/ColumnNode.h>
|
#include <Analyzer/ColumnNode.h>
|
||||||
#include <Analyzer/FunctionNode.h>
|
#include <Analyzer/FunctionNode.h>
|
||||||
#include <Analyzer/HashUtils.h>
|
#include <Analyzer/HashUtils.h>
|
||||||
#include <Analyzer/InDepthQueryTreeVisitor.h>
|
#include <Analyzer/InDepthQueryTreeVisitor.h>
|
||||||
#include <Analyzer/QueryNode.h>
|
#include <Analyzer/QueryNode.h>
|
||||||
#include <Analyzer/SortNode.h>
|
#include <Analyzer/SortNode.h>
|
||||||
#include <Functions/IFunction.h>
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
@ -13,9 +15,12 @@ namespace DB
|
|||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
|
|
||||||
class OptimizeRedundantFunctionsInOrderByVisitor : public InDepthQueryTreeVisitor<OptimizeRedundantFunctionsInOrderByVisitor>
|
class OptimizeRedundantFunctionsInOrderByVisitor : public InDepthQueryTreeVisitorWithContext<OptimizeRedundantFunctionsInOrderByVisitor>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
using Base = InDepthQueryTreeVisitorWithContext<OptimizeRedundantFunctionsInOrderByVisitor>;
|
||||||
|
using Base::Base;
|
||||||
|
|
||||||
static bool needChildVisit(QueryTreeNodePtr & node, QueryTreeNodePtr & /*parent*/)
|
static bool needChildVisit(QueryTreeNodePtr & node, QueryTreeNodePtr & /*parent*/)
|
||||||
{
|
{
|
||||||
if (node->as<FunctionNode>())
|
if (node->as<FunctionNode>())
|
||||||
@ -25,6 +30,9 @@ public:
|
|||||||
|
|
||||||
void visitImpl(QueryTreeNodePtr & node)
|
void visitImpl(QueryTreeNodePtr & node)
|
||||||
{
|
{
|
||||||
|
if (!getSettings().optimize_redundant_functions_in_order_by)
|
||||||
|
return;
|
||||||
|
|
||||||
auto * query = node->as<QueryNode>();
|
auto * query = node->as<QueryNode>();
|
||||||
if (!query)
|
if (!query)
|
||||||
return;
|
return;
|
||||||
@ -116,9 +124,10 @@ private:
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void OptimizeRedundantFunctionsInOrderByPass::run(QueryTreeNodePtr query_tree_node, ContextPtr /*context*/)
|
void OptimizeRedundantFunctionsInOrderByPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context)
|
||||||
{
|
{
|
||||||
OptimizeRedundantFunctionsInOrderByVisitor().visit(query_tree_node);
|
OptimizeRedundantFunctionsInOrderByVisitor visitor(std::move(context));
|
||||||
|
visitor.visit(query_tree_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -20,15 +20,17 @@ namespace DB
|
|||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
|
|
||||||
class SumIfToCountIfVisitor : public InDepthQueryTreeVisitor<SumIfToCountIfVisitor>
|
class SumIfToCountIfVisitor : public InDepthQueryTreeVisitorWithContext<SumIfToCountIfVisitor>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
explicit SumIfToCountIfVisitor(ContextPtr & context_)
|
using Base = InDepthQueryTreeVisitorWithContext<SumIfToCountIfVisitor>;
|
||||||
: context(context_)
|
using Base::Base;
|
||||||
{}
|
|
||||||
|
|
||||||
void visitImpl(QueryTreeNodePtr & node)
|
void visitImpl(QueryTreeNodePtr & node)
|
||||||
{
|
{
|
||||||
|
if (!getSettings().optimize_rewrite_sum_if_to_count_if)
|
||||||
|
return;
|
||||||
|
|
||||||
auto * function_node = node->as<FunctionNode>();
|
auto * function_node = node->as<FunctionNode>();
|
||||||
if (!function_node || !function_node->isAggregateFunction())
|
if (!function_node || !function_node->isAggregateFunction())
|
||||||
return;
|
return;
|
||||||
@ -56,7 +58,7 @@ public:
|
|||||||
if (!isInt64OrUInt64FieldType(constant_value_literal.getType()))
|
if (!isInt64OrUInt64FieldType(constant_value_literal.getType()))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (constant_value_literal.get<UInt64>() != 1 || context->getSettingsRef().aggregate_functions_null_for_empty)
|
if (constant_value_literal.get<UInt64>() != 1 || getSettings().aggregate_functions_null_for_empty)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
function_node_arguments_nodes[0] = std::move(function_node_arguments_nodes[1]);
|
function_node_arguments_nodes[0] = std::move(function_node_arguments_nodes[1]);
|
||||||
@ -122,7 +124,7 @@ public:
|
|||||||
auto & not_function_arguments = not_function->getArguments().getNodes();
|
auto & not_function_arguments = not_function->getArguments().getNodes();
|
||||||
not_function_arguments.push_back(nested_if_function_arguments_nodes[0]);
|
not_function_arguments.push_back(nested_if_function_arguments_nodes[0]);
|
||||||
|
|
||||||
not_function->resolveAsFunction(FunctionFactory::instance().get("not", context)->build(not_function->getArgumentColumns()));
|
not_function->resolveAsFunction(FunctionFactory::instance().get("not", getContext())->build(not_function->getArgumentColumns()));
|
||||||
|
|
||||||
function_node_arguments_nodes[0] = std::move(not_function);
|
function_node_arguments_nodes[0] = std::move(not_function);
|
||||||
function_node_arguments_nodes.resize(1);
|
function_node_arguments_nodes.resize(1);
|
||||||
@ -143,8 +145,6 @@ private:
|
|||||||
|
|
||||||
function_node.resolveAsAggregateFunction(std::move(aggregate_function));
|
function_node.resolveAsAggregateFunction(std::move(aggregate_function));
|
||||||
}
|
}
|
||||||
|
|
||||||
ContextPtr & context;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
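Annotation (not part of this commit): SumIfToCountIfVisitor rewrites sumIf(1, cond) and sum(if(cond, 1, 0)) into countIf(cond), and sum(if(cond, 0, 1)) into countIf(not(cond)), guarded by optimize_rewrite_sum_if_to_count_if and skipped when aggregate_functions_null_for_empty is set. A small standalone check of the equivalence on plain data.

#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
    std::vector<bool> cond = {true, false, true, true, false};

    uint64_t sum_if = 0, count_if = 0, sum_if_swapped = 0, count_if_not = 0;
    for (bool c : cond)
    {
        sum_if += c ? 1 : 0;          // sumIf(1, cond) / sum(if(cond, 1, 0))
        count_if += c ? 1 : 0;        // countIf(cond)
        sum_if_swapped += c ? 0 : 1;  // sum(if(cond, 0, 1))
        count_if_not += !c ? 1 : 0;   // countIf(not(cond))
    }

    assert(sum_if == count_if);
    assert(sum_if_swapped == count_if_not);
}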
@ -25,11 +25,17 @@ bool isUniqFunction(const String & function_name)
|
|||||||
function_name == "uniqTheta";
|
function_name == "uniqTheta";
|
||||||
}
|
}
|
||||||
|
|
||||||
class UniqInjectiveFunctionsEliminationVisitor : public InDepthQueryTreeVisitor<UniqInjectiveFunctionsEliminationVisitor>
|
class UniqInjectiveFunctionsEliminationVisitor : public InDepthQueryTreeVisitorWithContext<UniqInjectiveFunctionsEliminationVisitor>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
static void visitImpl(QueryTreeNodePtr & node)
|
using Base = InDepthQueryTreeVisitorWithContext<UniqInjectiveFunctionsEliminationVisitor>;
|
||||||
|
using Base::Base;
|
||||||
|
|
||||||
|
void visitImpl(QueryTreeNodePtr & node)
|
||||||
{
|
{
|
||||||
|
if (!getSettings().optimize_injective_functions_inside_uniq)
|
||||||
|
return;
|
||||||
|
|
||||||
auto * function_node = node->as<FunctionNode>();
|
auto * function_node = node->as<FunctionNode>();
|
||||||
if (!function_node || !function_node->isAggregateFunction() || !isUniqFunction(function_node->getFunctionName()))
|
if (!function_node || !function_node->isAggregateFunction() || !isUniqFunction(function_node->getFunctionName()))
|
||||||
return;
|
return;
|
||||||
@ -81,9 +87,9 @@ public:
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void UniqInjectiveFunctionsEliminationPass::run(QueryTreeNodePtr query_tree_node, ContextPtr)
|
void UniqInjectiveFunctionsEliminationPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context)
|
||||||
{
|
{
|
||||||
UniqInjectiveFunctionsEliminationVisitor visitor;
|
UniqInjectiveFunctionsEliminationVisitor visitor(std::move(context));
|
||||||
visitor.visit(query_tree_node);
|
visitor.visit(query_tree_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
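Annotation (not part of this commit): UniqInjectiveFunctionsEliminationVisitor drops injective function wrappers inside the uniq family of aggregates, since an injective mapping cannot merge distinct inputs. A standalone check using std::to_string on integers as the injective function (an assumption chosen for illustration).

#include <cassert>
#include <string>
#include <unordered_set>
#include <vector>

int main()
{
    std::vector<int> values = {1, 2, 2, 3, 3, 3};

    std::unordered_set<int> distinct_raw(values.begin(), values.end());

    std::unordered_set<std::string> distinct_mapped;
    for (int v : values)
        distinct_mapped.insert(std::to_string(v));   // injective on int, so no two distinct inputs collide

    // uniq(toString(x)) == uniq(x): the wrapper can be removed without changing the result.
    assert(distinct_raw.size() == distinct_mapped.size());
}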
@ -1,5 +1,7 @@
|
|||||||
#include <Analyzer/QueryNode.h>
|
#include <Analyzer/QueryNode.h>
|
||||||
|
|
||||||
|
#include <fmt/core.h>
|
||||||
|
|
||||||
#include <Common/SipHash.h>
|
#include <Common/SipHash.h>
|
||||||
#include <Common/FieldVisitorToString.h>
|
#include <Common/FieldVisitorToString.h>
|
||||||
|
|
||||||
@ -17,7 +19,6 @@
|
|||||||
#include <Parsers/ASTSetQuery.h>
|
#include <Parsers/ASTSetQuery.h>
|
||||||
|
|
||||||
#include <Analyzer/Utils.h>
|
#include <Analyzer/Utils.h>
|
||||||
#include <fmt/core.h>
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
@ -36,7 +37,7 @@ QueryNode::QueryNode(ContextMutablePtr context_, SettingsChanges settings_change
|
|||||||
}
|
}
|
||||||
|
|
||||||
QueryNode::QueryNode(ContextMutablePtr context_)
|
QueryNode::QueryNode(ContextMutablePtr context_)
|
||||||
: QueryNode(context_, {} /*settings_changes*/)
|
: QueryNode(std::move(context_), {} /*settings_changes*/)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
void QueryNode::dumpTreeImpl(WriteBuffer & buffer, FormatState & format_state, size_t indent) const
|
void QueryNode::dumpTreeImpl(WriteBuffer & buffer, FormatState & format_state, size_t indent) const
|
||||||
@ -185,10 +186,7 @@ void QueryNode::dumpTreeImpl(WriteBuffer & buffer, FormatState & format_state, s
|
|||||||
{
|
{
|
||||||
buffer << '\n' << std::string(indent + 2, ' ') << "SETTINGS";
|
buffer << '\n' << std::string(indent + 2, ' ') << "SETTINGS";
|
||||||
for (const auto & change : settings_changes)
|
for (const auto & change : settings_changes)
|
||||||
{
|
|
||||||
buffer << fmt::format(" {}={}", change.name, toString(change.value));
|
buffer << fmt::format(" {}={}", change.name, toString(change.value));
|
||||||
}
|
|
||||||
buffer << '\n';
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include <memory>
|
|
||||||
#include <Analyzer/QueryTreePassManager.h>
|
#include <Analyzer/QueryTreePassManager.h>
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
#include <Common/Exception.h>
|
#include <Common/Exception.h>
|
||||||
|
|
||||||
#include <IO/WriteHelpers.h>
|
#include <IO/WriteHelpers.h>
|
||||||
@ -133,7 +134,6 @@ private:
|
|||||||
* TODO: Support setting optimize_aggregators_of_group_by_keys.
|
* TODO: Support setting optimize_aggregators_of_group_by_keys.
|
||||||
* TODO: Support setting optimize_duplicate_order_by_and_distinct.
|
* TODO: Support setting optimize_duplicate_order_by_and_distinct.
|
||||||
* TODO: Support setting optimize_monotonous_functions_in_order_by.
|
* TODO: Support setting optimize_monotonous_functions_in_order_by.
|
||||||
* TODO: Support settings.optimize_or_like_chain.
|
|
||||||
* TODO: Add optimizations based on function semantics. Example: SELECT * FROM test_table WHERE id != id. (id is not nullable column).
|
* TODO: Add optimizations based on function semantics. Example: SELECT * FROM test_table WHERE id != id. (id is not nullable column).
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@ -210,53 +210,31 @@ void QueryTreePassManager::dump(WriteBuffer & buffer, size_t up_to_pass_index)
|
|||||||
|
|
||||||
void addQueryTreePasses(QueryTreePassManager & manager)
|
void addQueryTreePasses(QueryTreePassManager & manager)
|
||||||
{
|
{
|
||||||
auto context = manager.getContext();
|
|
||||||
const auto & settings = context->getSettingsRef();
|
|
||||||
|
|
||||||
manager.addPass(std::make_unique<QueryAnalysisPass>());
|
manager.addPass(std::make_unique<QueryAnalysisPass>());
|
||||||
|
manager.addPass(std::make_unique<FunctionToSubcolumnsPass>());
|
||||||
|
|
||||||
if (settings.optimize_functions_to_subcolumns)
|
manager.addPass(std::make_unique<CountDistinctPass>());
|
||||||
manager.addPass(std::make_unique<FunctionToSubcolumnsPass>());
|
manager.addPass(std::make_unique<SumIfToCountIfPass>());
|
||||||
|
manager.addPass(std::make_unique<NormalizeCountVariantsPass>());
|
||||||
if (settings.count_distinct_optimization)
|
|
||||||
manager.addPass(std::make_unique<CountDistinctPass>());
|
|
||||||
|
|
||||||
if (settings.optimize_rewrite_sum_if_to_count_if)
|
|
||||||
manager.addPass(std::make_unique<SumIfToCountIfPass>());
|
|
||||||
|
|
||||||
if (settings.optimize_normalize_count_variants)
|
|
||||||
manager.addPass(std::make_unique<NormalizeCountVariantsPass>());
|
|
||||||
|
|
||||||
manager.addPass(std::make_unique<CustomizeFunctionsPass>());
|
manager.addPass(std::make_unique<CustomizeFunctionsPass>());
|
||||||
|
|
||||||
if (settings.optimize_arithmetic_operations_in_aggregate_functions)
|
manager.addPass(std::make_unique<AggregateFunctionsArithmericOperationsPass>());
|
||||||
manager.addPass(std::make_unique<AggregateFunctionsArithmericOperationsPass>());
|
manager.addPass(std::make_unique<UniqInjectiveFunctionsEliminationPass>());
|
||||||
|
manager.addPass(std::make_unique<OptimizeGroupByFunctionKeysPass>());
|
||||||
if (settings.optimize_injective_functions_inside_uniq)
|
|
||||||
manager.addPass(std::make_unique<UniqInjectiveFunctionsEliminationPass>());
|
|
||||||
|
|
||||||
if (settings.optimize_group_by_function_keys)
|
|
||||||
manager.addPass(std::make_unique<OptimizeGroupByFunctionKeysPass>());
|
|
||||||
|
|
||||||
if (settings.optimize_multiif_to_if)
|
|
||||||
manager.addPass(std::make_unique<MultiIfToIfPass>());
|
|
||||||
|
|
||||||
|
manager.addPass(std::make_unique<MultiIfToIfPass>());
|
||||||
manager.addPass(std::make_unique<IfConstantConditionPass>());
|
manager.addPass(std::make_unique<IfConstantConditionPass>());
|
||||||
|
manager.addPass(std::make_unique<IfChainToMultiIfPass>());
|
||||||
|
|
||||||
if (settings.optimize_if_chain_to_multiif)
|
manager.addPass(std::make_unique<OptimizeRedundantFunctionsInOrderByPass>());
|
||||||
manager.addPass(std::make_unique<IfChainToMultiIfPass>());
|
|
||||||
|
|
||||||
if (settings.optimize_redundant_functions_in_order_by)
|
|
||||||
manager.addPass(std::make_unique<OptimizeRedundantFunctionsInOrderByPass>());
|
|
||||||
|
|
||||||
manager.addPass(std::make_unique<OrderByTupleEliminationPass>());
|
manager.addPass(std::make_unique<OrderByTupleEliminationPass>());
|
||||||
manager.addPass(std::make_unique<OrderByLimitByDuplicateEliminationPass>());
|
manager.addPass(std::make_unique<OrderByLimitByDuplicateEliminationPass>());
|
||||||
|
|
||||||
if (settings.optimize_syntax_fuse_functions)
|
manager.addPass(std::make_unique<FuseFunctionsPass>());
|
||||||
manager.addPass(std::make_unique<FuseFunctionsPass>());
|
|
||||||
|
|
||||||
if (settings.optimize_if_transform_strings_to_enum)
|
manager.addPass(std::make_unique<IfTransformStringsToEnumPass>());
|
||||||
manager.addPass(std::make_unique<IfTransformStringsToEnumPass>());
|
|
||||||
|
|
||||||
manager.addPass(std::make_unique<ConvertOrLikeChainPass>());
|
manager.addPass(std::make_unique<ConvertOrLikeChainPass>());
|
||||||
|
|
||||||
|
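Annotation (not part of this commit): the addQueryTreePasses hunk above is the other half of the refactoring: the function no longer reads settings, every pass is registered unconditionally, and each pass decides per query, inside its visitor, whether to do anything. A minimal sketch of that structure; IPass, PassManager and the Settings struct are simplified stand-ins for the ClickHouse interfaces.

#include <memory>
#include <vector>

struct Settings { bool optimize_multiif_to_if = true; };
struct QueryTree { /* ... */ };

class IPass
{
public:
    virtual ~IPass() = default;
    virtual void run(QueryTree & tree, const Settings & settings) = 0;
};

class MultiIfToIfPass : public IPass
{
public:
    void run(QueryTree & /*tree*/, const Settings & settings) override
    {
        if (!settings.optimize_multiif_to_if)
            return;           // the pass is always registered; the setting is checked at run time
        // ... rewrite multiIf(c, a, b) into if(c, a, b) here ...
    }
};

class PassManager
{
public:
    void addPass(std::unique_ptr<IPass> pass) { passes.push_back(std::move(pass)); }

    void run(QueryTree & tree, const Settings & settings)
    {
        for (auto & pass : passes)
            pass->run(tree, settings);
    }

private:
    std::vector<std::unique_ptr<IPass>> passes;
};

A likely motivation for this design is that the registered pass list no longer depends on the context available when the manager is built, so per-query settings take effect even if the manager is constructed once and reused.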
@ -8,7 +8,7 @@
|
|||||||
#include <IO/ReadBufferFromS3.h>
|
#include <IO/ReadBufferFromS3.h>
|
||||||
#include <IO/WriteBufferFromS3.h>
|
#include <IO/WriteBufferFromS3.h>
|
||||||
#include <IO/HTTPHeaderEntries.h>
|
#include <IO/HTTPHeaderEntries.h>
|
||||||
#include <IO/S3/copyDataToS3.h>
|
#include <IO/S3/copyS3File.h>
|
||||||
#include <Poco/Util/AbstractConfiguration.h>
|
#include <Poco/Util/AbstractConfiguration.h>
|
||||||
|
|
||||||
#include <aws/core/auth/AWSCredentials.h>
|
#include <aws/core/auth/AWSCredentials.h>
|
||||||
@ -167,16 +167,16 @@ void BackupWriterS3::copyFileNative(DiskPtr src_disk, const String & src_file_na
|
|||||||
auto object_storage = src_disk->getObjectStorage();
|
auto object_storage = src_disk->getObjectStorage();
|
||||||
std::string src_bucket = object_storage->getObjectsNamespace();
|
std::string src_bucket = object_storage->getObjectsNamespace();
|
||||||
auto file_path = fs::path(s3_uri.key) / dest_file_name;
|
auto file_path = fs::path(s3_uri.key) / dest_file_name;
|
||||||
copyFileS3ToS3(client, src_bucket, objects[0].absolute_path, src_offset, src_size, s3_uri.bucket, file_path, request_settings, {},
|
copyS3File(client, src_bucket, objects[0].absolute_path, src_offset, src_size, s3_uri.bucket, file_path, request_settings, {},
|
||||||
threadPoolCallbackRunner<void>(IOThreadPool::get(), "BackupWriterS3"));
|
threadPoolCallbackRunner<void>(IOThreadPool::get(), "BackupWriterS3"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void BackupWriterS3::copyDataToFile(
|
void BackupWriterS3::copyDataToFile(
|
||||||
const CreateReadBufferFunction & create_read_buffer, UInt64 offset, UInt64 size, const String & dest_file_name)
|
const CreateReadBufferFunction & create_read_buffer, UInt64 offset, UInt64 size, const String & dest_file_name)
|
||||||
{
|
{
|
||||||
copyDataToS3(create_read_buffer, offset, size, client, s3_uri.bucket, fs::path(s3_uri.key) / dest_file_name, request_settings, {},
|
copyDataToS3File(create_read_buffer, offset, size, client, s3_uri.bucket, fs::path(s3_uri.key) / dest_file_name, request_settings, {},
|
||||||
threadPoolCallbackRunner<void>(IOThreadPool::get(), "BackupWriterS3"));
|
threadPoolCallbackRunner<void>(IOThreadPool::get(), "BackupWriterS3"));
|
||||||
}
|
}
|
||||||
|
|
||||||
BackupWriterS3::~BackupWriterS3() = default;
|
BackupWriterS3::~BackupWriterS3() = default;
|
||||||
|
@ -160,17 +160,6 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
|
|||||||
else
|
else
|
||||||
backup_id = toString(*backup_settings.backup_uuid);
|
backup_id = toString(*backup_settings.backup_uuid);
|
||||||
|
|
||||||
/// Check if there are no concurrent backups
|
|
||||||
if (num_active_backups && !allow_concurrent_backups)
|
|
||||||
{
|
|
||||||
/// If its an internal backup and we currently have 1 active backup, it could be the original query, validate using backup_uuid
|
|
||||||
if (!(num_active_backups == 1 && backup_settings.internal && getAllActiveBackupInfos().at(0).id == toString(*backup_settings.backup_uuid)))
|
|
||||||
{
|
|
||||||
throw Exception(ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED,
|
|
||||||
"Concurrent backups not supported, turn on setting 'allow_concurrent_backups'");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::shared_ptr<IBackupCoordination> backup_coordination;
|
std::shared_ptr<IBackupCoordination> backup_coordination;
|
||||||
if (backup_settings.internal)
|
if (backup_settings.internal)
|
||||||
{
|
{
|
||||||
@ -184,6 +173,13 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
|
|||||||
String backup_name_for_logging = backup_info.toStringForLogging();
|
String backup_name_for_logging = backup_info.toStringForLogging();
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
|
if (!allow_concurrent_backups && hasConcurrentBackups(backup_settings))
|
||||||
|
{
|
||||||
|
/// addInfo is called here to record the failed backup details
|
||||||
|
addInfo(backup_id, backup_name_for_logging, backup_settings.internal, BackupStatus::BACKUP_FAILED);
|
||||||
|
throw Exception(ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED, "Concurrent backups not supported, turn on setting 'allow_concurrent_backups'");
|
||||||
|
}
|
||||||
|
|
||||||
addInfo(backup_id, backup_name_for_logging, backup_settings.internal, BackupStatus::CREATING_BACKUP);
|
addInfo(backup_id, backup_name_for_logging, backup_settings.internal, BackupStatus::CREATING_BACKUP);
|
||||||
|
|
||||||
/// Prepare context to use.
|
/// Prepare context to use.
|
||||||
@ -384,8 +380,8 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
|
|||||||
auto restore_query = std::static_pointer_cast<ASTBackupQuery>(query->clone());
|
auto restore_query = std::static_pointer_cast<ASTBackupQuery>(query->clone());
|
||||||
auto restore_settings = RestoreSettings::fromRestoreQuery(*restore_query);
|
auto restore_settings = RestoreSettings::fromRestoreQuery(*restore_query);
|
||||||
|
|
||||||
if (!restore_settings.backup_uuid)
|
if (!restore_settings.restore_uuid)
|
||||||
restore_settings.backup_uuid = UUIDHelpers::generateV4();
|
restore_settings.restore_uuid = UUIDHelpers::generateV4();
|
||||||
|
|
||||||
/// `restore_id` will be used as a key to the `infos` map, so it should be unique.
|
/// `restore_id` will be used as a key to the `infos` map, so it should be unique.
|
||||||
OperationID restore_id;
|
OperationID restore_id;
|
||||||
@ -394,18 +390,7 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
|
|||||||
else if (!restore_settings.id.empty())
|
else if (!restore_settings.id.empty())
|
||||||
restore_id = restore_settings.id;
|
restore_id = restore_settings.id;
|
||||||
else
|
else
|
||||||
restore_id = toString(*restore_settings.backup_uuid);
|
restore_id = toString(*restore_settings.restore_uuid);
|
||||||
|
|
||||||
/// Check if there are no concurrent restores
|
|
||||||
if (num_active_restores && !allow_concurrent_restores)
|
|
||||||
{
|
|
||||||
/// If its an internal restore and we currently have 1 active restore, it could be the original query, validate using iz
|
|
||||||
if (!(num_active_restores == 1 && restore_settings.internal && getAllActiveRestoreInfos().at(0).id == toString(*restore_settings.backup_uuid)))
|
|
||||||
{
|
|
||||||
throw Exception(ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED,
|
|
||||||
"Concurrent restores not supported, turn on setting 'allow_concurrent_restores'");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::shared_ptr<IRestoreCoordination> restore_coordination;
|
std::shared_ptr<IRestoreCoordination> restore_coordination;
|
||||||
if (restore_settings.internal)
|
if (restore_settings.internal)
|
||||||
@ -420,6 +405,14 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
|
|||||||
{
|
{
|
||||||
auto backup_info = BackupInfo::fromAST(*restore_query->backup_name);
|
auto backup_info = BackupInfo::fromAST(*restore_query->backup_name);
|
||||||
String backup_name_for_logging = backup_info.toStringForLogging();
|
String backup_name_for_logging = backup_info.toStringForLogging();
|
||||||
|
|
||||||
|
if (!allow_concurrent_restores && hasConcurrentRestores(restore_settings))
|
||||||
|
{
|
||||||
|
/// addInfo is called here to record the failed restore details
|
||||||
|
addInfo(restore_id, backup_name_for_logging, restore_settings.internal, BackupStatus::RESTORING);
|
||||||
|
throw Exception(ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED, "Concurrent restores not supported, turn on setting 'allow_concurrent_restores'");
|
||||||
|
}
|
||||||
|
|
||||||
addInfo(restore_id, backup_name_for_logging, restore_settings.internal, BackupStatus::RESTORING);
|
addInfo(restore_id, backup_name_for_logging, restore_settings.internal, BackupStatus::RESTORING);
|
||||||
|
|
||||||
/// Prepare context to use.
|
/// Prepare context to use.
|
||||||
@ -499,7 +492,7 @@ void BackupsWorker::doRestore(
|
|||||||
backup_open_params.context = context;
|
backup_open_params.context = context;
|
||||||
backup_open_params.backup_info = backup_info;
|
backup_open_params.backup_info = backup_info;
|
||||||
backup_open_params.base_backup_info = restore_settings.base_backup_info;
|
backup_open_params.base_backup_info = restore_settings.base_backup_info;
|
||||||
backup_open_params.backup_uuid = restore_settings.backup_uuid;
|
backup_open_params.backup_uuid = restore_settings.restore_uuid;
|
||||||
backup_open_params.password = restore_settings.password;
|
backup_open_params.password = restore_settings.password;
|
||||||
BackupPtr backup = BackupFactory::instance().createBackup(backup_open_params);
|
BackupPtr backup = BackupFactory::instance().createBackup(backup_open_params);
|
||||||
|
|
||||||
@ -740,6 +733,34 @@ std::vector<BackupsWorker::Info> BackupsWorker::getAllActiveRestoreInfos() const
|
|||||||
return res_infos;
|
return res_infos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool BackupsWorker::hasConcurrentBackups(const BackupSettings & backup_settings) const
|
||||||
|
{
|
||||||
|
/// Check if there are no concurrent backups
|
||||||
|
if (num_active_backups)
|
||||||
|
{
|
||||||
|
/// If it's an internal backup and we currently have 1 active backup, it could be the original query, validate using backup_uuid
|
||||||
|
if (!(num_active_backups == 1 && backup_settings.internal && getAllActiveBackupInfos().at(0).id == toString(*backup_settings.backup_uuid)))
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool BackupsWorker::hasConcurrentRestores(const RestoreSettings & restore_settings) const
|
||||||
|
{
|
||||||
|
/// Check if there are no concurrent restores
|
||||||
|
if (num_active_restores)
|
||||||
|
{
|
||||||
|
/// If it's an internal restore and we currently have 1 active restore, it could be the original query, validate using restore_uuid
|
||||||
|
if (!(num_active_restores == 1 && restore_settings.internal && getAllActiveRestoreInfos().at(0).id == toString(*restore_settings.restore_uuid)))
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
void BackupsWorker::shutdown()
|
void BackupsWorker::shutdown()
|
||||||
{
|
{
|
||||||
bool has_active_backups_and_restores = (num_active_backups || num_active_restores);
|
bool has_active_backups_and_restores = (num_active_backups || num_active_restores);
|
||||||
|
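Annotation (not part of this commit): hasConcurrentBackups and hasConcurrentRestores above treat one case as "not concurrent": a single active entry whose id matches this operation's own UUID, which happens when an internal operation (for example one spawned by the same ON CLUSTER query) re-enters the worker. A standalone sketch of the predicate with simplified names.

#include <string>
#include <vector>

// Returns true if another backup/restore is already running and this one must be rejected
// when allow_concurrent_backups / allow_concurrent_restores is disabled.
bool hasConcurrentOperation(const std::vector<std::string> & active_ids,
                            bool is_internal,
                            const std::string & own_uuid)
{
    if (active_ids.empty())
        return false;
    // The only active entry may be the original query that spawned this internal operation.
    if (active_ids.size() == 1 && is_internal && active_ids.front() == own_uuid)
        return false;
    return true;
}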
@ -105,6 +105,8 @@ private:
|
|||||||
void setNumFilesAndSize(const OperationID & id, size_t num_files, UInt64 uncompressed_size, UInt64 compressed_size);
|
void setNumFilesAndSize(const OperationID & id, size_t num_files, UInt64 uncompressed_size, UInt64 compressed_size);
|
||||||
std::vector<Info> getAllActiveBackupInfos() const;
|
std::vector<Info> getAllActiveBackupInfos() const;
|
||||||
std::vector<Info> getAllActiveRestoreInfos() const;
|
std::vector<Info> getAllActiveRestoreInfos() const;
|
||||||
|
bool hasConcurrentBackups(const BackupSettings & backup_settings) const;
|
||||||
|
bool hasConcurrentRestores(const RestoreSettings & restore_settings) const;
|
||||||
|
|
||||||
ThreadPool backups_thread_pool;
|
ThreadPool backups_thread_pool;
|
||||||
ThreadPool restores_thread_pool;
|
ThreadPool restores_thread_pool;
|
||||||
|
@ -164,7 +164,7 @@ namespace
|
|||||||
M(Bool, internal) \
|
M(Bool, internal) \
|
||||||
M(String, host_id) \
|
M(String, host_id) \
|
||||||
M(String, coordination_zk_path) \
|
M(String, coordination_zk_path) \
|
||||||
M(OptionalUUID, backup_uuid)
|
M(OptionalUUID, restore_uuid)
|
||||||
|
|
||||||
|
|
||||||
RestoreSettings RestoreSettings::fromRestoreQuery(const ASTBackupQuery & query)
|
RestoreSettings RestoreSettings::fromRestoreQuery(const ASTBackupQuery & query)
|
||||||
|
@ -123,9 +123,9 @@ struct RestoreSettings
|
|||||||
String coordination_zk_path;
|
String coordination_zk_path;
|
||||||
|
|
||||||
/// Internal, should not be specified by user.
|
/// Internal, should not be specified by user.
|
||||||
/// UUID of the backup. If it's not set it will be generated randomly.
|
/// UUID of the restore. If it's not set it will be generated randomly.
|
||||||
/// This is used to validate internal restores when allow_concurrent_restores is turned off
|
/// This is used to validate internal restores when allow_concurrent_restores is turned off
|
||||||
std::optional<UUID> backup_uuid;
|
std::optional<UUID> restore_uuid;
|
||||||
|
|
||||||
static RestoreSettings fromRestoreQuery(const ASTBackupQuery & query);
|
static RestoreSettings fromRestoreQuery(const ASTBackupQuery & query);
|
||||||
void copySettingsToQuery(ASTBackupQuery & query) const;
|
void copySettingsToQuery(ASTBackupQuery & query) const;
|
||||||
|
@ -79,7 +79,8 @@ FilterDescription::FilterDescription(const IColumn & column_)
|
|||||||
const NullMap & null_map = nullable_column->getNullMapData();
|
const NullMap & null_map = nullable_column->getNullMapData();
|
||||||
IColumn::Filter & res = concrete_column->getData();
|
IColumn::Filter & res = concrete_column->getData();
|
||||||
|
|
||||||
size_t size = res.size();
|
const auto size = res.size();
|
||||||
|
assert(size == null_map.size());
|
||||||
for (size_t i = 0; i < size; ++i)
|
for (size_t i = 0; i < size; ++i)
|
||||||
res[i] = res[i] && !null_map[i];
|
res[i] = res[i] && !null_map[i];
|
||||||
|
|
||||||
|
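Annotation (not part of this commit): the FilterDescription change above only adds an assert that the filter column and its null map have equal length before they are combined; the combination itself treats NULL as "do not keep the row". A tiny standalone version of the same loop.

#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
    std::vector<uint8_t> filter   = {1, 1, 0, 1};  // raw filter values
    std::vector<uint8_t> null_map = {0, 1, 0, 0};  // 1 means the filter value was NULL

    assert(filter.size() == null_map.size());      // the invariant the commit now checks
    for (size_t i = 0; i < filter.size(); ++i)
        filter[i] = filter[i] && !null_map[i];     // NULL filter values drop the row

    assert((filter == std::vector<uint8_t>{1, 0, 0, 1}));
}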
@ -66,6 +66,9 @@ AsynchronousMetrics::AsynchronousMetrics(
|
|||||||
openFileIfExists("/proc/uptime", uptime);
|
openFileIfExists("/proc/uptime", uptime);
|
||||||
openFileIfExists("/proc/net/dev", net_dev);
|
openFileIfExists("/proc/net/dev", net_dev);
|
||||||
|
|
||||||
|
openFileIfExists("/sys/fs/cgroup/memory/memory.limit_in_bytes", cgroupmem_limit_in_bytes);
|
||||||
|
openFileIfExists("/sys/fs/cgroup/memory/memory.usage_in_bytes", cgroupmem_usage_in_bytes);
|
||||||
|
|
||||||
openSensors();
|
openSensors();
|
||||||
openBlockDevices();
|
openBlockDevices();
|
||||||
openEDAC();
|
openEDAC();
|
||||||
@ -879,6 +882,35 @@ void AsynchronousMetrics::update(TimePoint update_time)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (cgroupmem_limit_in_bytes && cgroupmem_usage_in_bytes)
|
||||||
|
{
|
||||||
|
try {
|
||||||
|
cgroupmem_limit_in_bytes->rewind();
|
||||||
|
cgroupmem_usage_in_bytes->rewind();
|
||||||
|
|
||||||
|
uint64_t cgroup_mem_limit_in_bytes = 0;
|
||||||
|
uint64_t cgroup_mem_usage_in_bytes = 0;
|
||||||
|
|
||||||
|
readText(cgroup_mem_limit_in_bytes, *cgroupmem_limit_in_bytes);
|
||||||
|
readText(cgroup_mem_usage_in_bytes, *cgroupmem_usage_in_bytes);
|
||||||
|
|
||||||
|
if (cgroup_mem_limit_in_bytes && cgroup_mem_usage_in_bytes)
|
||||||
|
{
|
||||||
|
new_values["CgroupMemoryTotal"] = { cgroup_mem_limit_in_bytes, "The total amount of memory in cgroup, in bytes." };
|
||||||
|
new_values["CgroupMemoryUsed"] = { cgroup_mem_usage_in_bytes, "The amount of memory used in cgroup, in bytes." };
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
LOG_DEBUG(log, "Cannot read statistics about the cgroup memory total and used. Total got '{}', Used got '{}'.",
|
||||||
|
cgroup_mem_limit_in_bytes, cgroup_mem_usage_in_bytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (...)
|
||||||
|
{
|
||||||
|
tryLogCurrentException(__PRETTY_FUNCTION__);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (meminfo)
|
if (meminfo)
|
||||||
{
|
{
|
||||||
try
|
try
|
||||||
|
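Annotation (not part of this commit): the new CgroupMemoryTotal and CgroupMemoryUsed metrics read two cgroup v1 files; if either file is missing (for example on cgroup v2 hosts) nothing is reported. A standalone sketch of the same read, with the paths taken from the patch and the error handling simplified.

#include <cstdint>
#include <fstream>
#include <iostream>
#include <optional>
#include <string>

std::optional<uint64_t> readUInt64File(const std::string & path)
{
    std::ifstream in(path);
    uint64_t value = 0;
    if (in >> value)
        return value;
    return std::nullopt;   // file absent or unparsable
}

int main()
{
    auto limit = readUInt64File("/sys/fs/cgroup/memory/memory.limit_in_bytes");
    auto usage = readUInt64File("/sys/fs/cgroup/memory/memory.usage_in_bytes");

    if (limit && usage)
    {
        std::cout << "CgroupMemoryTotal=" << *limit << "\n";
        std::cout << "CgroupMemoryUsed=" << *usage << "\n";
    }
    else
        std::cout << "cgroup v1 memory files not available\n";
}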
@ -108,6 +108,9 @@ private:
|
|||||||
std::optional<ReadBufferFromFilePRead> uptime;
|
std::optional<ReadBufferFromFilePRead> uptime;
|
||||||
std::optional<ReadBufferFromFilePRead> net_dev;
|
std::optional<ReadBufferFromFilePRead> net_dev;
|
||||||
|
|
||||||
|
std::optional<ReadBufferFromFilePRead> cgroupmem_limit_in_bytes;
|
||||||
|
std::optional<ReadBufferFromFilePRead> cgroupmem_usage_in_bytes;
|
||||||
|
|
||||||
std::vector<std::unique_ptr<ReadBufferFromFilePRead>> thermal;
|
std::vector<std::unique_ptr<ReadBufferFromFilePRead>> thermal;
|
||||||
|
|
||||||
std::unordered_map<String /* device name */,
|
std::unordered_map<String /* device name */,
|
||||||
|
@ -98,9 +98,15 @@ void CancelableSharedMutex::lock_shared()
|
|||||||
bool CancelableSharedMutex::try_lock_shared()
|
bool CancelableSharedMutex::try_lock_shared()
|
||||||
{
|
{
|
||||||
UInt64 value = state.load();
|
UInt64 value = state.load();
|
||||||
if (!(value & writers) && state.compare_exchange_strong(value, value + 1)) // overflow is not realistic
|
while (true)
|
||||||
return true;
|
{
|
||||||
return false;
|
if (value & writers)
|
||||||
|
return false;
|
||||||
|
if (state.compare_exchange_strong(value, value + 1)) // overflow is not realistic
|
||||||
|
break;
|
||||||
|
// Concurrent try_lock_shared() should not fail, so we have to retry CAS, but avoid blocking wait
|
||||||
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void CancelableSharedMutex::unlock_shared()
|
void CancelableSharedMutex::unlock_shared()
|
||||||
|
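Annotation (not part of this commit): the try_lock_shared fix above replaces a single compare-and-swap with a retry loop, because a CAS can fail merely when another reader increments the counter in between, and that must not make try_lock_shared return false; only a set writer bit may. A standalone model with std::atomic; the writers mask value is illustrative, not the real layout.

#include <atomic>
#include <cstdint>

class SharedCounterLock
{
public:
    bool try_lock_shared()
    {
        uint64_t value = state.load();
        while (true)
        {
            if (value & writers_mask)
                return false;                                // a writer holds or waits for the lock
            if (state.compare_exchange_strong(value, value + 1))
                return true;                                 // reader count incremented
            // CAS failed because the state changed (e.g. another reader); retry with the refreshed value.
        }
    }

    void unlock_shared() { state.fetch_sub(1); }

private:
    static constexpr uint64_t writers_mask = 0xFFFFFFFF00000000ULL; // upper half tracks writers (illustrative)
    std::atomic<uint64_t> state{0};
};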
@ -39,6 +39,15 @@ enum class WeekModeFlag : UInt8
|
|||||||
};
|
};
|
||||||
using YearWeek = std::pair<UInt16, UInt8>;
|
using YearWeek = std::pair<UInt16, UInt8>;
|
||||||
|
|
||||||
|
/// Modes for toDayOfWeek() function.
|
||||||
|
enum class WeekDayMode
|
||||||
|
{
|
||||||
|
WeekStartsMonday1 = 0,
|
||||||
|
WeekStartsMonday0 = 1,
|
||||||
|
WeekStartsSunday0 = 2,
|
||||||
|
WeekStartsSunday1 = 3
|
||||||
|
};
|
||||||
|
|
||||||
/** Lookup table to conversion of time to date, and to month / year / day of week / day of month and so on.
|
/** Lookup table to conversion of time to date, and to month / year / day of week / day of month and so on.
|
||||||
* First time was implemented for OLAPServer, that needed to do billions of such transformations.
|
* First time was implemented for OLAPServer, that needed to do billions of such transformations.
|
||||||
*/
|
*/
|
||||||
@ -619,9 +628,28 @@ public:
|
|||||||
template <typename DateOrTime>
|
template <typename DateOrTime>
|
||||||
inline Int16 toYear(DateOrTime v) const { return lut[toLUTIndex(v)].year; }
|
inline Int16 toYear(DateOrTime v) const { return lut[toLUTIndex(v)].year; }
|
||||||
|
|
||||||
|
/// 1-based, starts on Monday
|
||||||
template <typename DateOrTime>
|
template <typename DateOrTime>
|
||||||
inline UInt8 toDayOfWeek(DateOrTime v) const { return lut[toLUTIndex(v)].day_of_week; }
|
inline UInt8 toDayOfWeek(DateOrTime v) const { return lut[toLUTIndex(v)].day_of_week; }
|
||||||
|
|
||||||
|
template <typename DateOrTime>
|
||||||
|
inline UInt8 toDayOfWeek(DateOrTime v, UInt8 week_day_mode) const
|
||||||
|
{
|
||||||
|
WeekDayMode mode = check_week_day_mode(week_day_mode);
|
||||||
|
|
||||||
|
UInt8 res = toDayOfWeek(v);
|
||||||
|
using enum WeekDayMode;
|
||||||
|
bool start_from_sunday = (mode == WeekStartsSunday0 || mode == WeekStartsSunday1);
|
||||||
|
bool zero_based = (mode == WeekStartsMonday0 || mode == WeekStartsSunday0);
|
||||||
|
|
||||||
|
if (start_from_sunday)
|
||||||
|
res = res % 7 + 1;
|
||||||
|
if (zero_based)
|
||||||
|
--res;
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
template <typename DateOrTime>
|
template <typename DateOrTime>
|
||||||
inline UInt8 toDayOfMonth(DateOrTime v) const { return lut[toLUTIndex(v)].day_of_month; }
|
inline UInt8 toDayOfMonth(DateOrTime v) const { return lut[toLUTIndex(v)].day_of_month; }
|
||||||
|
|
||||||
@ -844,6 +872,12 @@ public:
|
|||||||
return week_format;
|
return week_format;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Check and change mode to effective.
|
||||||
|
inline WeekDayMode check_week_day_mode(UInt8 mode) const /// NOLINT
|
||||||
|
{
|
||||||
|
return static_cast<WeekDayMode>(mode & 3);
|
||||||
|
}
|
||||||
|
|
||||||
/** Calculate weekday from d.
|
/** Calculate weekday from d.
|
||||||
* Returns 0 for monday, 1 for tuesday...
|
* Returns 0 for monday, 1 for tuesday...
|
||||||
*/
|
*/
|
||||||
|
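Annotation (not part of this commit): the new toDayOfWeek(v, mode) overload post-processes the Monday-1-based weekday according to WeekDayMode, and check_week_day_mode masks the mode with 3 so only the low two bits matter. A standalone version of that arithmetic with the resulting ranges spelled out; adjustDayOfWeek is an illustrative name.

#include <cassert>
#include <cstdint>

enum class WeekDayMode : uint8_t
{
    WeekStartsMonday1 = 0,  // Mon=1 .. Sun=7
    WeekStartsMonday0 = 1,  // Mon=0 .. Sun=6
    WeekStartsSunday0 = 2,  // Sun=0 .. Sat=6
    WeekStartsSunday1 = 3   // Sun=1 .. Sat=7
};

// `iso_day` is 1..7 with Monday = 1, as returned by the unmodified toDayOfWeek().
uint8_t adjustDayOfWeek(uint8_t iso_day, uint8_t raw_mode)
{
    auto mode = static_cast<WeekDayMode>(raw_mode & 3);   // same masking as check_week_day_mode()

    uint8_t res = iso_day;
    bool start_from_sunday = (mode == WeekDayMode::WeekStartsSunday0 || mode == WeekDayMode::WeekStartsSunday1);
    bool zero_based = (mode == WeekDayMode::WeekStartsMonday0 || mode == WeekDayMode::WeekStartsSunday0);

    if (start_from_sunday)
        res = res % 7 + 1;   // Sunday (7) becomes 1, Monday (1) becomes 2, ...
    if (zero_based)
        --res;
    return res;
}

int main()
{
    // Monday: mode 0 -> 1, mode 1 -> 0, mode 2 -> 1, mode 3 -> 2
    assert(adjustDayOfWeek(1, 0) == 1 && adjustDayOfWeek(1, 1) == 0 && adjustDayOfWeek(1, 2) == 1 && adjustDayOfWeek(1, 3) == 2);
    // Sunday: mode 0 -> 7, mode 1 -> 6, mode 2 -> 0, mode 3 -> 1
    assert(adjustDayOfWeek(7, 0) == 7 && adjustDayOfWeek(7, 1) == 6 && adjustDayOfWeek(7, 2) == 0 && adjustDayOfWeek(7, 3) == 1);
}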
@ -12,6 +12,7 @@
|
|||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
|
|
||||||
namespace ErrorCodes
|
namespace ErrorCodes
|
||||||
{
|
{
|
||||||
extern const int BAD_ARGUMENTS;
|
extern const int BAD_ARGUMENTS;
|
||||||
@ -20,7 +21,12 @@ namespace ErrorCodes
|
|||||||
namespace FST
|
namespace FST
|
||||||
{
|
{
|
||||||
|
|
||||||
UInt64 Arc::serialize(WriteBuffer& write_buffer) const
|
Arc::Arc(Output output_, const StatePtr & target_)
|
||||||
|
: output(output_)
|
||||||
|
, target(target_)
|
||||||
|
{}
|
||||||
|
|
||||||
|
UInt64 Arc::serialize(WriteBuffer & write_buffer) const
|
||||||
{
|
{
|
||||||
UInt64 written_bytes = 0;
|
UInt64 written_bytes = 0;
|
||||||
bool has_output = output != 0;
|
bool has_output = output != 0;
|
||||||
@ -55,6 +61,14 @@ void LabelsAsBitmap::addLabel(char label)
|
|||||||
data |= bit_label;
|
data |= bit_label;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool LabelsAsBitmap::hasLabel(char label) const
|
||||||
|
{
|
||||||
|
UInt8 index = label;
|
||||||
|
UInt256 bit_label = 1;
|
||||||
|
bit_label <<= index;
|
||||||
|
return ((data & bit_label) != 0);
|
||||||
|
}
|
||||||
|
|
||||||
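Annotation (not part of this commit): the hasLabel method added above tests one bit of a 256-bit label bitmap, and getIndex (just below) counts the set bits that precede a label, which is how a label is mapped to the position of its serialized arc. A standalone sketch using std::bitset<256> in place of UInt256; the class name and the exact indexing convention are simplifications for illustration.

#include <bitset>
#include <cassert>
#include <cstdint>

class LabelsBitmap
{
public:
    void addLabel(unsigned char label) { data.set(label); }

    bool hasLabel(unsigned char label) const { return data.test(label); }

    // Number of present labels smaller than `label`; used as the arc index.
    uint64_t getIndex(unsigned char label) const
    {
        uint64_t count = 0;
        for (unsigned i = 0; i < label; ++i)
            count += data.test(i);
        return count;
    }

private:
    std::bitset<256> data;
};

int main()
{
    LabelsBitmap bmp;
    bmp.addLabel('a');
    bmp.addLabel('c');
    assert(bmp.hasLabel('a') && !bmp.hasLabel('b'));
    assert(bmp.getIndex('c') == 1);   // only 'a' precedes 'c'
}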
UInt64 LabelsAsBitmap::getIndex(char label) const
|
UInt64 LabelsAsBitmap::getIndex(char label) const
|
||||||
{
|
{
|
||||||
UInt64 bit_count = 0;
|
UInt64 bit_count = 0;
|
||||||
@ -78,7 +92,7 @@ UInt64 LabelsAsBitmap::getIndex(char label) const
|
|||||||
return bit_count;
|
return bit_count;
|
||||||
}
|
}
|
||||||
|
|
||||||
UInt64 LabelsAsBitmap::serialize(WriteBuffer& write_buffer)
|
UInt64 LabelsAsBitmap::serialize(WriteBuffer & write_buffer)
|
||||||
{
|
{
|
||||||
writeVarUInt(data.items[0], write_buffer);
|
writeVarUInt(data.items[0], write_buffer);
|
||||||
writeVarUInt(data.items[1], write_buffer);
|
writeVarUInt(data.items[1], write_buffer);
|
||||||
@ -91,19 +105,28 @@ UInt64 LabelsAsBitmap::serialize(WriteBuffer& write_buffer)
|
|||||||
+ getLengthOfVarUInt(data.items[3]);
|
+ getLengthOfVarUInt(data.items[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool LabelsAsBitmap::hasLabel(char label) const
|
UInt64 State::hash() const
|
||||||
{
|
{
|
||||||
UInt8 index = label;
|
std::vector<char> values;
|
||||||
UInt256 bit_label = 1;
|
values.reserve(arcs.size() * (sizeof(Output) + sizeof(UInt64) + 1));
|
||||||
bit_label <<= index;
|
|
||||||
|
|
||||||
return ((data & bit_label) != 0);
|
for (const auto & [label, arc] : arcs)
|
||||||
|
{
|
||||||
|
values.push_back(label);
|
||||||
|
const auto * ptr = reinterpret_cast<const char *>(&arc.output);
|
||||||
|
std::copy(ptr, ptr + sizeof(Output), std::back_inserter(values));
|
||||||
|
|
||||||
|
ptr = reinterpret_cast<const char *>(&arc.target->id);
|
||||||
|
std::copy(ptr, ptr + sizeof(UInt64), std::back_inserter(values));
|
||||||
|
}
|
||||||
|
|
||||||
|
return CityHash_v1_0_2::CityHash64(values.data(), values.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
Arc* State::getArc(char label) const
|
Arc * State::getArc(char label) const
|
||||||
{
|
{
|
||||||
auto it = arcs.find(label);
|
auto it = arcs.find(label);
|
||||||
if (it == arcs.cend())
|
if (it == arcs.end())
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
|
||||||
return const_cast<Arc *>(&it->second);
|
return const_cast<Arc *>(&it->second);
|
||||||
@ -118,46 +141,11 @@ void State::clear()
|
|||||||
{
|
{
|
||||||
id = 0;
|
id = 0;
|
||||||
state_index = 0;
|
state_index = 0;
|
||||||
flag = 0;
|
|
||||||
|
|
||||||
arcs.clear();
|
arcs.clear();
|
||||||
|
flag = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
UInt64 State::hash() const
|
UInt64 State::serialize(WriteBuffer & write_buffer)
|
||||||
{
|
|
||||||
std::vector<char> values;
|
|
||||||
values.reserve(arcs.size() * (sizeof(Output) + sizeof(UInt64) + 1));
|
|
||||||
for (const auto & [label, arc] : arcs)
|
|
||||||
{
|
|
||||||
values.push_back(label);
|
|
||||||
const auto * ptr = reinterpret_cast<const char*>(&arc.output);
|
|
||||||
std::copy(ptr, ptr + sizeof(Output), std::back_inserter(values));
|
|
||||||
|
|
||||||
ptr = reinterpret_cast<const char*>(&arc.target->id);
|
|
||||||
std::copy(ptr, ptr + sizeof(UInt64), std::back_inserter(values));
|
|
||||||
}
|
|
||||||
|
|
||||||
return CityHash_v1_0_2::CityHash64(values.data(), values.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
bool operator== (const State & state1, const State & state2)
|
|
||||||
{
|
|
||||||
if (state1.arcs.size() != state2.arcs.size())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
for (const auto & [label, arc] : state1.arcs)
|
|
||||||
{
|
|
||||||
const auto it = state2.arcs.find(label);
|
|
||||||
if (it == state2.arcs.cend())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
if (it->second != arc)
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
UInt64 State::serialize(WriteBuffer& write_buffer)
|
|
||||||
{
|
{
|
||||||
UInt64 written_bytes = 0;
|
UInt64 written_bytes = 0;
|
||||||
|
|
||||||
@ -171,10 +159,8 @@ UInt64 State::serialize(WriteBuffer& write_buffer)
|
|||||||
std::vector<char> labels;
|
std::vector<char> labels;
|
||||||
labels.reserve(arcs.size());
|
labels.reserve(arcs.size());
|
||||||
|
|
||||||
for (auto& [label, state] : arcs)
|
for (auto & [label, state] : arcs)
|
||||||
{
|
|
||||||
labels.push_back(label);
|
labels.push_back(label);
|
||||||
}
|
|
||||||
|
|
||||||
UInt8 label_size = labels.size();
|
UInt8 label_size = labels.size();
|
||||||
write_buffer.write(label_size);
|
write_buffer.write(label_size);
|
||||||
@ -186,7 +172,7 @@ UInt64 State::serialize(WriteBuffer& write_buffer)
|
|||||||
/// Serialize all arcs
|
/// Serialize all arcs
|
||||||
for (char label : labels)
|
for (char label : labels)
|
||||||
{
|
{
|
||||||
Arc* arc = getArc(label);
|
Arc * arc = getArc(label);
|
||||||
assert(arc != nullptr);
|
assert(arc != nullptr);
|
||||||
written_bytes += arc->serialize(write_buffer);
|
written_bytes += arc->serialize(write_buffer);
|
||||||
}
|
}
|
||||||
@ -196,15 +182,13 @@ UInt64 State::serialize(WriteBuffer& write_buffer)
|
|||||||
/// Serialize bitmap
|
/// Serialize bitmap
|
||||||
LabelsAsBitmap bmp;
|
LabelsAsBitmap bmp;
|
||||||
for (auto & [label, state] : arcs)
|
for (auto & [label, state] : arcs)
|
||||||
{
|
|
||||||
bmp.addLabel(label);
|
bmp.addLabel(label);
|
||||||
}
|
|
||||||
written_bytes += bmp.serialize(write_buffer);
|
written_bytes += bmp.serialize(write_buffer);
|
||||||
|
|
||||||
/// Serialize all arcs
|
/// Serialize all arcs
|
||||||
for (auto & [label, state] : arcs)
|
for (auto & [label, state] : arcs)
|
||||||
{
|
{
|
||||||
Arc* arc = getArc(label);
|
Arc * arc = getArc(label);
|
||||||
assert(arc != nullptr);
|
assert(arc != nullptr);
|
||||||
written_bytes += arc->serialize(write_buffer);
|
written_bytes += arc->serialize(write_buffer);
|
||||||
}
|
}
|
||||||
@ -213,16 +197,36 @@ UInt64 State::serialize(WriteBuffer& write_buffer)
|
|||||||
return written_bytes;
|
return written_bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
FSTBuilder::FSTBuilder(WriteBuffer& write_buffer_) : write_buffer(write_buffer_)
|
+bool operator==(const State & state1, const State & state2)
+{
+    if (state1.arcs.size() != state2.arcs.size())
+        return false;
+
+    for (const auto & [label, arc] : state1.arcs)
+    {
+        const auto it = state2.arcs.find(label);
+        if (it == state2.arcs.end())
+            return false;
+
+        if (it->second != arc)
+            return false;
+    }
+    return true;
+}
+
+void State::readFlag(ReadBuffer & read_buffer)
+{
+    read_buffer.readStrict(reinterpret_cast<char &>(flag));
+}
+
+FstBuilder::FstBuilder(WriteBuffer & write_buffer_) : write_buffer(write_buffer_)
 {
     for (auto & temp_state : temp_states)
-    {
         temp_state = std::make_shared<State>();
-    }
 }
 
 /// See FindMinimized in the paper pseudo code l11-l21.
-StatePtr FSTBuilder::findMinimized(const State & state, bool & found)
+StatePtr FstBuilder::findMinimized(const State & state, bool & found)
 {
     found = false;
     auto hash = state.hash();
@@ -230,7 +234,7 @@ StatePtr FSTBuilder::findMinimized(const State & state, bool & found)
     /// MEMBER: in the paper pseudo code l15
     auto it = minimized_states.find(hash);
 
-    if (it != minimized_states.cend() && *it->second == state)
+    if (it != minimized_states.end() && *it->second == state)
     {
         found = true;
         return it->second;
@@ -244,8 +248,11 @@ StatePtr FSTBuilder::findMinimized(const State & state, bool & found)
     return p;
 }
 
+namespace
+{
+
 /// See the paper pseudo code l33-34.
-size_t FSTBuilder::getCommonPrefixLength(const String & word1, const String & word2)
+size_t getCommonPrefixLength(std::string_view word1, std::string_view word2)
 {
     size_t i = 0;
     while (i < word1.size() && i < word2.size() && word1[i] == word2[i])
@@ -253,8 +260,10 @@ size_t FSTBuilder::getCommonPrefixLength(const String & word1, const String & wo
     return i;
 }
 
+}
+
 /// See the paper pseudo code l33-39 and l70-72(when down_to is 0).
-void FSTBuilder::minimizePreviousWordSuffix(Int64 down_to)
+void FstBuilder::minimizePreviousWordSuffix(Int64 down_to)
 {
     for (Int64 i = static_cast<Int64>(previous_word.size()); i >= down_to; --i)
     {
@@ -264,7 +273,7 @@ void FSTBuilder::minimizePreviousWordSuffix(Int64 down_to)
         if (i != 0)
         {
             Output output = 0;
-            Arc* arc = temp_states[i - 1]->getArc(previous_word[i - 1]);
+            Arc * arc = temp_states[i - 1]->getArc(previous_word[i - 1]);
            if (arc)
                output = arc->output;
 
@@ -287,7 +296,7 @@ void FSTBuilder::minimizePreviousWordSuffix(Int64 down_to)
     }
 }
 
-void FSTBuilder::add(const std::string & current_word, Output current_output)
+void FstBuilder::add(std::string_view current_word, Output current_output)
 {
     /// We assume word size is no greater than MAX_TERM_LENGTH(256).
     /// FSTs without word size limitation would be inefficient and easy to cause memory bloat
@@ -295,10 +304,10 @@ void FSTBuilder::add(const std::string & current_word, Output current_output)
     /// MAX_TERM_LENGTH, the granule cannot be dropped and will be fully-scanned. It doesn't affect "ngram" tokenizers.
     /// Another limitation is that if the query string has tokens which exceed this length
     /// it will fallback to default searching when using "split" tokenizers.
-    auto current_word_len = current_word.size();
+    size_t current_word_len = current_word.size();
 
     if (current_word_len > MAX_TERM_LENGTH)
-        throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Too long term ({}) passed to FST builder.", current_word_len);
+        throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Cannot build inverted index: The maximum term length is {}, this is exceeded by term {}", MAX_TERM_LENGTH, current_word_len);
 
     size_t prefix_length_plus1 = getCommonPrefixLength(current_word, previous_word) + 1;
 
@@ -333,9 +342,7 @@ void FSTBuilder::add(const std::string & current_word, Output current_output)
         if (word_suffix != 0)
         {
             for (auto & [label, arc] : temp_states[i]->arcs)
-            {
                 arc.output += word_suffix;
-            }
         }
         /// Reduce current_output
         current_output -= common_prefix;
@@ -350,7 +357,7 @@ void FSTBuilder::add(const std::string & current_word, Output current_output)
     previous_word = current_word;
 }
 
-UInt64 FSTBuilder::build()
+UInt64 FstBuilder::build()
 {
     minimizePreviousWordSuffix(0);
 
@@ -364,7 +371,8 @@ UInt64 FSTBuilder::build()
     return previous_state_index + previous_written_bytes + length + 1;
 }
 
-FiniteStateTransducer::FiniteStateTransducer(std::vector<UInt8> data_) : data(std::move(data_))
+FiniteStateTransducer::FiniteStateTransducer(std::vector<UInt8> data_)
+    : data(std::move(data_))
 {
 }
 
@@ -373,28 +381,28 @@ void FiniteStateTransducer::clear()
     data.clear();
 }
 
-std::pair<UInt64, bool> FiniteStateTransducer::getOutput(const String & term)
+std::pair<UInt64, bool> FiniteStateTransducer::getOutput(std::string_view term)
 {
-    std::pair<UInt64, bool> result{ 0, false };
+    std::pair<UInt64, bool> result(0, false);
 
     /// Read index of initial state
     ReadBufferFromMemory read_buffer(data.data(), data.size());
-    read_buffer.seek(data.size()-1, SEEK_SET);
+    read_buffer.seek(data.size() - 1, SEEK_SET);
 
-    UInt8 length{ 0 };
-    read_buffer.readStrict(reinterpret_cast<char&>(length));
+    UInt8 length = 0;
+    read_buffer.readStrict(reinterpret_cast<char &>(length));
 
     /// FST contains no terms
     if (length == 0)
-        return { 0, false };
+        return {0, false};
 
     read_buffer.seek(data.size() - 1 - length, SEEK_SET);
-    UInt64 state_index{ 0 };
+    UInt64 state_index = 0;
     readVarUInt(state_index, read_buffer);
 
     for (size_t i = 0; i <= term.size(); ++i)
     {
-        UInt64 arc_output{ 0 };
+        UInt64 arc_output = 0;
 
         /// Read flag
         State temp_state;
@@ -411,22 +419,22 @@ std::pair<UInt64, bool> FiniteStateTransducer::getOutput(const String & term)
         if (temp_state.getEncodingMethod() == State::EncodingMethod::Sequential)
         {
             /// Read number of labels
-            UInt8 label_num{ 0 };
-            read_buffer.readStrict(reinterpret_cast<char&>(label_num));
+            UInt8 label_num = 0;
+            read_buffer.readStrict(reinterpret_cast<char &>(label_num));
 
             if (label_num == 0)
-                return { 0, false };
+                return {0, false};
 
             auto labels_position = read_buffer.getPosition();
 
             /// Find the index of the label from "labels" bytes
-            auto begin_it{ data.begin() + labels_position };
-            auto end_it{ data.begin() + labels_position + label_num };
+            auto begin_it = data.begin() + labels_position;
+            auto end_it = data.begin() + labels_position + label_num;
 
             auto pos = std::find(begin_it, end_it, label);
 
             if (pos == end_it)
-                return { 0, false };
+                return {0, false};
 
             /// Read the arc for the label
             UInt64 arc_index = (pos - begin_it);
@@ -439,9 +447,7 @@ std::pair<UInt64, bool> FiniteStateTransducer::getOutput(const String & term)
                 arc_output = 0;
                 readVarUInt(state_index, read_buffer);
                 if (state_index & 0x1) // output is followed
-                {
                     readVarUInt(arc_output, read_buffer);
-                }
                 state_index >>= 1;
             }
         }
@@ -455,7 +461,7 @@ std::pair<UInt64, bool> FiniteStateTransducer::getOutput(const String & term)
             readVarUInt(bmp.data.items[3], read_buffer);
 
             if (!bmp.hasLabel(label))
-                return { 0, false };
+                return {0, false};
 
             /// Read the arc for the label
             size_t arc_index = bmp.getIndex(label);
@@ -465,9 +471,7 @@ std::pair<UInt64, bool> FiniteStateTransducer::getOutput(const String & term)
                 arc_output = 0;
                 readVarUInt(state_index, read_buffer);
                 if (state_index & 0x1) // output is followed
-                {
                     readVarUInt(arc_output, read_buffer);
-                }
                 state_index >>= 1;
             }
         }
@@ -476,5 +480,7 @@ std::pair<UInt64, bool> FiniteStateTransducer::getOutput(const String & term)
     }
     return result;
 }
+
 }
+
 }
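The renamed FstBuilder and the std::string_view based getOutput() keep the same build-then-query flow that the unit test later in this commit exercises. The following standalone sketch (not part of the diff) shows that flow; the header paths are assumptions, and terms must be added in sorted order to obtain a minimal FST.

#include <Common/FST.h>
#include <IO/WriteBufferFromVector.h>

/// Minimal sketch: build a small FST and query it again.
std::pair<UInt64, bool> buildAndQuery()
{
    std::vector<UInt8> buffer;
    DB::WriteBufferFromVector<std::vector<UInt8>> wbuf(buffer);
    DB::FST::FstBuilder builder(wbuf);

    /// Words have to be added in sorted order.
    builder.add("moth", 91);
    builder.add("top", 55);

    builder.build();
    wbuf.finalize();

    DB::FST::FiniteStateTransducer fst(buffer);
    return fst.getOutput("moth"); /// expected {91, true}; an unknown term yields {0, false}
}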
@@ -19,18 +19,18 @@ namespace DB
 /// [Direct Construction of Minimal Acyclic Subsequential Transduers] by Stoyan Mihov and Denis Maurel, University of Tours, France
 namespace FST
 {
 
 using Output = UInt64;
 
 class State;
 using StatePtr = std::shared_ptr<State>;
 
-/// Arc represents a transition from one state to another
+/// Arc represents a transition from one state to another.
 /// It includes the target state to which the arc points and the arc's output.
 struct Arc
 {
     Arc() = default;
-    explicit Arc(Output output_, const StatePtr & target_) : output{output_}, target{target_} { }
+    Arc(Output output_, const StatePtr & target_);
 
     /// 0 means the arc has no output
     Output output = 0;
@@ -53,13 +53,15 @@ public:
     /// computes the rank
     UInt64 getIndex(char label) const;
 
-    UInt64 serialize(WriteBuffer& write_buffer);
+    UInt64 serialize(WriteBuffer & write_buffer);
 
 private:
-    friend class State;
-    friend class FiniteStateTransducer;
     /// data holds a 256-bit bitmap for all labels of a state. Its 256 bits correspond to 256
     /// possible label values.
-    UInt256 data{ 0 };
+    UInt256 data = 0;
+
+    friend class State;
+    friend class FiniteStateTransducer;
 };
 
 /// State implements the State in Finite State Transducer
@@ -77,9 +79,9 @@ public:
         /// Note this is NOT enabled for now since it is experimental
         Bitmap,
     };
-    State() = default;
 
-    State(const State & state) = default;
+    State() = default;
+    State(const State & State) = default;
 
     UInt64 hash() const;
 
@@ -91,22 +93,12 @@ public:
 
     UInt64 serialize(WriteBuffer & write_buffer);
 
-    bool isFinal() const
-    {
-        return flag_values.is_final == 1;
-    }
-    void setFinal(bool value)
-    {
-        flag_values.is_final = value;
-    }
-    EncodingMethod getEncodingMethod() const
-    {
-        return flag_values.encoding_method;
-    }
-    void readFlag(ReadBuffer & read_buffer)
-    {
-        read_buffer.readStrict(reinterpret_cast<char&>(flag));
-    }
+    bool isFinal() const { return flag_values.is_final == 1; }
+    void setFinal(bool value) { flag_values.is_final = value; }
+
+    EncodingMethod getEncodingMethod() const { return flag_values.encoding_method; }
+
+    void readFlag(ReadBuffer & read_buffer);
 
     /// Transient ID of the state which is used for building FST. It won't be serialized
     UInt64 id = 0;
@@ -116,6 +108,7 @@ public:
 
     /// Arcs which are started from state, the 'char' is the label on the arc
     std::unordered_map<char, Arc> arcs;
 
 private:
     struct FlagValues
     {
@@ -132,22 +125,21 @@ private:
 
 bool operator==(const State & state1, const State & state2);
 
-inline constexpr size_t MAX_TERM_LENGTH = 256;
+static constexpr size_t MAX_TERM_LENGTH = 256;
 
-/// FSTBuilder is used to build Finite State Transducer by adding words incrementally.
+/// FstBuilder is used to build Finite State Transducer by adding words incrementally.
 /// Note that all the words have to be added in sorted order in order to achieve minimized result.
-/// In the end, the caller should call build() to serialize minimized FST to WriteBuffer
-class FSTBuilder
+/// In the end, the caller should call build() to serialize minimized FST to WriteBuffer.
+class FstBuilder
 {
 public:
-    explicit FSTBuilder(WriteBuffer & write_buffer_);
+    explicit FstBuilder(WriteBuffer & write_buffer_);
 
-    void add(const std::string & word, Output output);
+    void add(std::string_view word, Output output);
     UInt64 build();
 private:
     StatePtr findMinimized(const State & s, bool & found);
     void minimizePreviousWordSuffix(Int64 down_to);
-    static size_t getCommonPrefixLength(const String & word1, const String & word2);
 
     std::array<StatePtr, MAX_TERM_LENGTH + 1> temp_states;
     String previous_word;
@@ -171,8 +163,8 @@ class FiniteStateTransducer
 public:
     FiniteStateTransducer() = default;
     explicit FiniteStateTransducer(std::vector<UInt8> data_);
-    std::pair<UInt64, bool> getOutput(const String & term);
     void clear();
+    std::pair<UInt64, bool> getOutput(std::string_view term);
     std::vector<UInt8> & getData() { return data; }
 
 private:
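The "computes the rank" comment on getIndex() refers to counting how many labels precede a given label in the 256-bit bitmap. The implementation of LabelsAsBitmap is not part of this diff, so the sketch below is only an illustration of that rank idea, with four 64-bit words standing in for UInt256.

#include <array>
#include <bit>
#include <cstdint>

/// Illustration only: derive an arc index ("rank") from a 256-bit label bitmap.
uint64_t rankOfLabel(const std::array<uint64_t, 4> & bits, uint8_t label)
{
    uint64_t rank = 0;
    size_t word = label / 64;
    size_t bit = label % 64;

    for (size_t i = 0; i < word; ++i)
        rank += std::popcount(bits[i]);                               // whole words before the label's word

    rank += std::popcount(bits[word] & ((uint64_t(1) << bit) - 1));   // set bits below the label inside its word
    return rank;                                                      // number of set labels preceding `label`
}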
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <shared_mutex>
+#include <Common/SharedMutex.h>
 #include <Common/ProfileEvents.h>
 #include <Common/Stopwatch.h>
 
@@ -12,7 +12,7 @@ class ProfilingScopedWriteRWLock
 {
 public:
 
-    ProfilingScopedWriteRWLock(std::shared_mutex & rwl_, ProfileEvents::Event event) :
+    ProfilingScopedWriteRWLock(SharedMutex & rwl_, ProfileEvents::Event event) :
         scoped_write_lock(rwl_)
     {
         ProfileEvents::increment(event, watch.elapsed());
@@ -20,14 +20,14 @@ public:
 
 private:
     Stopwatch watch;
-    std::unique_lock<std::shared_mutex> scoped_write_lock;
+    std::unique_lock<SharedMutex> scoped_write_lock;
 };
 
 
 class ProfilingScopedReadRWLock
 {
 public:
-    ProfilingScopedReadRWLock(std::shared_mutex & rwl, ProfileEvents::Event event) :
+    ProfilingScopedReadRWLock(SharedMutex & rwl, ProfileEvents::Event event) :
         scoped_read_lock(rwl)
     {
         ProfileEvents::increment(event, watch.elapsed());
@@ -35,7 +35,7 @@ public:
 
 private:
     Stopwatch watch;
-    std::shared_lock<std::shared_mutex> scoped_read_lock;
+    std::shared_lock<SharedMutex> scoped_read_lock;
 };
 
 }
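Switching the lock type from std::shared_mutex to DB::SharedMutex only changes the member types; call sites keep the same RAII shape. A hedged usage sketch follows; the ProfileEvents counter name is a placeholder, not an event defined by this commit.

#include <Common/ProfilingScopedRWLock.h>
#include <Common/SharedMutex.h>

namespace ProfileEvents
{
    extern const Event ExampleLockWaitMicroseconds; /// hypothetical counter, for illustration only
}

struct ExampleCache
{
    DB::SharedMutex mutex;
    int value = 0;

    int read()
    {
        DB::ProfilingScopedReadRWLock lock(mutex, ProfileEvents::ExampleLockWaitMicroseconds);
        return value;   /// held via std::shared_lock<SharedMutex>
    }

    void write(int v)
    {
        DB::ProfilingScopedWriteRWLock lock(mutex, ProfileEvents::ExampleLockWaitMicroseconds);
        value = v;      /// held via std::unique_lock<SharedMutex>
    }
};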
@@ -37,9 +37,7 @@ void SharedMutex::lock()
 bool SharedMutex::try_lock()
 {
     UInt64 value = 0;
-    if (state.compare_exchange_strong(value, writers))
-        return true;
-    return false;
+    return state.compare_exchange_strong(value, writers);
 }
 
 void SharedMutex::unlock()
@@ -68,9 +66,15 @@ void SharedMutex::lock_shared()
 bool SharedMutex::try_lock_shared()
 {
     UInt64 value = state.load();
-    if (!(value & writers) && state.compare_exchange_strong(value, value + 1))
-        return true;
-    return false;
+    while (true)
+    {
+        if (value & writers)
+            return false;
+        if (state.compare_exchange_strong(value, value + 1))
+            break;
+        // Concurrent try_lock_shared() should not fail, so we have to retry CAS, but avoid blocking wait
+    }
+    return true;
 }
 
 void SharedMutex::unlock_shared()
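The change above makes try_lock_shared() retry the compare-and-swap instead of failing when another reader wins the race; only an active writer causes failure. A self-contained sketch of the same retry pattern on a plain atomic counter follows; it is not the actual SharedMutex internals, and the writer-bit layout is an assumption made for the example.

#include <atomic>
#include <cstdint>

/// Standalone illustration: a CAS that loses to another *reader* is retried,
/// while an active writer makes the try-lock give up immediately.
class TinySharedLock
{
public:
    bool try_lock_shared()
    {
        uint64_t value = state.load();
        while (true)
        {
            if (value & writer_bit)
                return false;                                     // a writer is active: fail
            if (state.compare_exchange_strong(value, value + 1))
                return true;                                      // reader count bumped
            // CAS failed because the state changed concurrently (e.g. another reader);
            // compare_exchange_strong reloaded `value`, so simply retry.
        }
    }

    void unlock_shared() { state.fetch_sub(1); }

private:
    static constexpr uint64_t writer_bit = uint64_t(1) << 63;     // assumption for this sketch
    std::atomic<uint64_t> state{0};
};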
@@ -13,7 +13,6 @@
 #include <map>
 #include <memory>
 #include <mutex>
-#include <shared_mutex>
 #include <unordered_set>
 
 
@@ -7,7 +7,7 @@
 
 TEST(FST, SimpleTest)
 {
-    std::vector<std::pair<std::string, DB::FST::Output>> indexed_data
+    std::vector<std::pair<String, DB::FST::Output>> indexed_data
     {
         {"mop", 100},
         {"moth", 91},
@@ -17,7 +17,7 @@ TEST(FST, SimpleTest)
         {"top", 55},
     };
 
-    std::vector<std::pair<std::string, DB::FST::Output>> not_indexed_data
+    std::vector<std::pair<String, DB::FST::Output>> not_indexed_data
     {
         {"mo", 100},
         {"moth1", 91},
@@ -29,42 +29,40 @@ TEST(FST, SimpleTest)
 
     std::vector<UInt8> buffer;
     DB::WriteBufferFromVector<std::vector<UInt8>> wbuf(buffer);
-    DB::FST::FSTBuilder builder(wbuf);
+    DB::FST::FstBuilder builder(wbuf);
 
-    for (auto& [term, output] : indexed_data)
-    {
+    for (auto & [term, output] : indexed_data)
         builder.add(term, output);
-    }
     builder.build();
     wbuf.finalize();
 
     DB::FST::FiniteStateTransducer fst(buffer);
-    for (auto& [term, output] : indexed_data)
+    for (auto & [term, output] : indexed_data)
     {
         auto [result, found] = fst.getOutput(term);
-        ASSERT_EQ(found, true);
+        ASSERT_TRUE(found);
         ASSERT_EQ(result, output);
     }
 
-    for (auto& [term, output] : not_indexed_data)
+    for (auto & [term, output] : not_indexed_data)
     {
         auto [result, found] = fst.getOutput(term);
-        ASSERT_EQ(found, false);
+        ASSERT_FALSE(found);
     }
 }
 
 TEST(FST, TestForLongTerms)
 {
     /// Test long terms within limitation
-    std::string term1(DB::FST::MAX_TERM_LENGTH - 1, 'A');
-    std::string term2(DB::FST::MAX_TERM_LENGTH, 'B');
+    String term1(DB::FST::MAX_TERM_LENGTH - 1, 'A');
+    String term2(DB::FST::MAX_TERM_LENGTH, 'B');
 
     DB::FST::Output output1 = 100;
     DB::FST::Output output2 = 200;
 
     std::vector<UInt8> buffer;
     DB::WriteBufferFromVector<std::vector<UInt8>> wbuf(buffer);
-    DB::FST::FSTBuilder builder(wbuf);
+    DB::FST::FstBuilder builder(wbuf);
 
     builder.add(term1, output1);
     builder.add(term2, output2);
@@ -75,20 +73,20 @@ TEST(FST, TestForLongTerms)
     DB::FST::FiniteStateTransducer fst(buffer);
 
     auto [result1, found1] = fst.getOutput(term1);
-    ASSERT_EQ(found1, true);
+    ASSERT_TRUE(found1);
     ASSERT_EQ(result1, output1);
 
     auto [result2, found2] = fst.getOutput(term2);
-    ASSERT_EQ(found2, true);
+    ASSERT_TRUE(found2);
     ASSERT_EQ(result2, output2);
 
     /// Test exception case when term length exceeds limitation
-    std::string term3(DB::FST::MAX_TERM_LENGTH + 1, 'C');
+    String term3(DB::FST::MAX_TERM_LENGTH + 1, 'C');
     DB::FST::Output output3 = 300;
 
     std::vector<UInt8> buffer3;
     DB::WriteBufferFromVector<std::vector<UInt8>> wbuf3(buffer3);
-    DB::FST::FSTBuilder builder3(wbuf3);
+    DB::FST::FstBuilder builder3(wbuf3);
 
     EXPECT_THROW(builder3.add(term3, output3), DB::Exception);
 }
@@ -27,7 +27,7 @@ namespace DB
 struct NoCancel {};
 
 // for all PerfTests
-static constexpr int requests = 512 * 1024;
+static constexpr int requests = 128 * 1024;
 static constexpr int max_threads = 16;
 
 template <class T, class Status = NoCancel>
@@ -91,6 +91,49 @@ void TestSharedMutex()
 
         ASSERT_EQ(test, writers);
     }
 
+    // Test multiple readers can acquire lock simultaneously using try_shared_lock
+    for (int readers = 1; readers <= 128; readers *= 2)
+    {
+        T sm;
+        std::atomic<int> test(0);
+        std::barrier sync(readers + 1);
+
+        std::vector<std::thread> threads;
+        threads.reserve(readers);
+        auto reader = [&]
+        {
+            [[maybe_unused]] Status status;
+            bool acquired = sm.try_lock_shared();
+            ASSERT_TRUE(acquired);
+            if (!acquired) return; // Just to make TSA happy
+            sync.arrive_and_wait(); // (A) sync with writer
+            test++;
+            sync.arrive_and_wait(); // (B) wait for writer to call try_lock() while shared_lock is held
+            sm.unlock_shared();
+            sync.arrive_and_wait(); // (C) wait for writer to release lock, to ensure try_lock_shared() will see no writer
+        };
+
+        for (int i = 0; i < readers; i++)
+            threads.emplace_back(reader);
+
+        { // writer
+            [[maybe_unused]] Status status;
+            sync.arrive_and_wait(); // (A) wait for all reader to acquire lock to avoid blocking them
+            ASSERT_FALSE(sm.try_lock());
+            sync.arrive_and_wait(); // (B) sync with readers
+            {
+                std::unique_lock lock(sm);
+                test++;
+            }
+            sync.arrive_and_wait(); // (C) sync with readers
+        }
+
+        for (auto & thread : threads)
+            thread.join();
+
+        ASSERT_EQ(test, readers + 1);
+    }
 }
 
 template <class T, class Status = NoCancel>
@@ -30,7 +30,7 @@ protected:
     bool isGenericCompression() const override { return false; }
 
 private:
-    UInt8 delta_bytes_size;
+    const UInt8 delta_bytes_size;
 };
 
 
@@ -68,8 +68,8 @@ void compressDataForType(const char * source, UInt32 source_size, char * dest)
     if (source_size % sizeof(T) != 0)
         throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot delta compress, data size {} is not aligned to {}", source_size, sizeof(T));
 
-    T prev_src{};
-    const char * source_end = source + source_size;
+    T prev_src = 0;
+    const char * const source_end = source + source_size;
     while (source < source_end)
     {
         T curr_src = unalignedLoad<T>(source);
@@ -84,17 +84,17 @@ void compressDataForType(const char * source, UInt32 source_size, char * dest)
 template <typename T>
 void decompressDataForType(const char * source, UInt32 source_size, char * dest, UInt32 output_size)
 {
-    const char * output_end = dest + output_size;
+    const char * const output_end = dest + output_size;
 
     if (source_size % sizeof(T) != 0)
         throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot delta decompress, data size {} is not aligned to {}", source_size, sizeof(T));
 
     T accumulator{};
-    const char * source_end = source + source_size;
+    const char * const source_end = source + source_size;
     while (source < source_end)
     {
         accumulator += unalignedLoad<T>(source);
-        if (dest + sizeof(accumulator) > output_end)
+        if (dest + sizeof(accumulator) > output_end) [[unlikely]]
             throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress the data");
         unalignedStore<T>(dest, accumulator);
 
@@ -140,7 +140,7 @@ void CompressionCodecDelta::doDecompressData(const char * source, UInt32 source_
 
     UInt8 bytes_size = source[0];
 
-    if (bytes_size == 0)
+    if (!(bytes_size == 1 || bytes_size == 2 || bytes_size == 4 || bytes_size == 8))
         throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress. File has wrong header");
 
     UInt8 bytes_to_skip = uncompressed_size % bytes_size;
@@ -190,7 +190,7 @@ UInt8 getDeltaBytesSize(const IDataType * column_type)
 void registerCodecDelta(CompressionCodecFactory & factory)
 {
     UInt8 method_code = static_cast<UInt8>(CompressionMethodByte::Delta);
-    factory.registerCompressionCodecWithType("Delta", method_code, [&](const ASTPtr & arguments, const IDataType * column_type) -> CompressionCodecPtr
+    auto codec_builder = [&](const ASTPtr & arguments, const IDataType * column_type) -> CompressionCodecPtr
     {
         UInt8 delta_bytes_size = 0;
 
@@ -215,7 +215,8 @@ void registerCodecDelta(CompressionCodecFactory & factory)
         }
 
         return std::make_shared<CompressionCodecDelta>(delta_bytes_size);
-    });
+    };
+    factory.registerCompressionCodecWithType("Delta", method_code, codec_builder);
 }
 
 CompressionCodecPtr getCompressionCodecDelta(UInt8 delta_bytes_size)
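The Delta codec stores the first value and then per-element differences; decompression is a running sum, which is why the hardened header check above only accepts element widths of 1, 2, 4 or 8 bytes. A minimal sketch of the transform for one fixed width follows; it illustrates the scheme only and ignores the codec's actual buffer layout and header.

#include <cstdint>
#include <vector>

/// Delta transform for a single element width (illustration of the per-type scheme).
std::vector<uint32_t> deltaEncode(const std::vector<uint32_t> & src)
{
    std::vector<uint32_t> dst;
    dst.reserve(src.size());
    uint32_t prev = 0;
    for (uint32_t value : src)
    {
        dst.push_back(value - prev); // store the difference to the previous element (wraps on unsigned overflow)
        prev = value;
    }
    return dst;
}

std::vector<uint32_t> deltaDecode(const std::vector<uint32_t> & deltas)
{
    std::vector<uint32_t> dst;
    dst.reserve(deltas.size());
    uint32_t accumulator = 0;
    for (uint32_t delta : deltas)
    {
        accumulator += delta;        // running sum restores the original values
        dst.push_back(accumulator);
    }
    return dst;
}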
@@ -11,19 +11,18 @@
 #include <IO/ReadBufferFromMemory.h>
 #include <IO/BitHelpers.h>
 
+#include <bitset>
 #include <cstring>
 #include <algorithm>
 #include <type_traits>
 
-#include <bitset>
-
 
 namespace DB
 {
 
 /** Gorilla column codec implementation.
  *
- * Based on Gorilla paper: http://www.vldb.org/pvldb/vol8/p1816-teller.pdf
+ * Based on Gorilla paper: https://dl.acm.org/doi/10.14778/2824032.2824078
  *
  * This codec is best used against monotonic floating sequences, like CPU usage percentage
  * or any other gauge.
@@ -125,7 +124,7 @@ protected:
     bool isGenericCompression() const override { return false; }
 
 private:
-    UInt8 data_bytes_size;
+    const UInt8 data_bytes_size;
 };
 
 
@@ -139,7 +138,7 @@ namespace ErrorCodes
 namespace
 {
 
-constexpr inline UInt8 getBitLengthOfLength(UInt8 data_bytes_size)
+constexpr UInt8 getBitLengthOfLength(UInt8 data_bytes_size)
 {
     // 1-byte value is 8 bits, and we need 4 bits to represent 8 : 1000,
     // 2-byte 16 bits => 5
@@ -147,21 +146,20 @@ constexpr inline UInt8 getBitLengthOfLength(UInt8 data_bytes_size)
     // 8-byte 64 bits => 7
     const UInt8 bit_lengths[] = {0, 4, 5, 0, 6, 0, 0, 0, 7};
     assert(data_bytes_size >= 1 && data_bytes_size < sizeof(bit_lengths) && bit_lengths[data_bytes_size] != 0);
 
     return bit_lengths[data_bytes_size];
 }
 
-
 UInt32 getCompressedHeaderSize(UInt8 data_bytes_size)
 {
-    const UInt8 items_count_size = 4;
+    constexpr UInt8 items_count_size = 4;
 
     return items_count_size + data_bytes_size;
 }
 
 UInt32 getCompressedDataSize(UInt8 data_bytes_size, UInt32 uncompressed_size)
 {
     const UInt32 items_count = uncompressed_size / data_bytes_size;
 
     static const auto DATA_BIT_LENGTH = getBitLengthOfLength(data_bytes_size);
     // -1 since there must be at least 1 non-zero bit.
     static const auto LEADING_ZEROES_BIT_LENGTH = DATA_BIT_LENGTH - 1;
@@ -182,7 +180,7 @@ struct BinaryValueInfo
 };
 
 template <typename T>
-BinaryValueInfo getLeadingAndTrailingBits(const T & value)
+BinaryValueInfo getBinaryValueInfo(const T & value)
 {
     constexpr UInt8 bit_size = sizeof(T) * 8;
 
@@ -190,28 +188,25 @@ BinaryValueInfo getLeadingAndTrailingBits(const T & value)
     const UInt8 tz = getTrailingZeroBits(value);
     const UInt8 data_size = value == 0 ? 0 : static_cast<UInt8>(bit_size - lz - tz);
 
-    return BinaryValueInfo{lz, data_size, tz};
+    return {lz, data_size, tz};
 }
 
 template <typename T>
 UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest, UInt32 dest_size)
 {
-    static const auto DATA_BIT_LENGTH = getBitLengthOfLength(sizeof(T));
-    // -1 since there must be at least 1 non-zero bit.
-    static const auto LEADING_ZEROES_BIT_LENGTH = DATA_BIT_LENGTH - 1;
-
     if (source_size % sizeof(T) != 0)
         throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress, data size {} is not aligned to {}", source_size, sizeof(T));
-    const char * source_end = source + source_size;
-    const char * dest_start = dest;
-    const char * dest_end = dest + dest_size;
+
+    const char * const source_end = source + source_size;
+    const char * const dest_start = dest;
+    const char * const dest_end = dest + dest_size;
 
     const UInt32 items_count = source_size / sizeof(T);
 
     unalignedStoreLE<UInt32>(dest, items_count);
     dest += sizeof(items_count);
 
-    T prev_value{};
+    T prev_value = 0;
     // That would cause first XORed value to be written in-full.
     BinaryValueInfo prev_xored_info{0, 0, 0};
 
@@ -226,13 +221,17 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest,
 
     BitWriter writer(dest, dest_end - dest);
 
+    static const auto DATA_BIT_LENGTH = getBitLengthOfLength(sizeof(T));
+    // -1 since there must be at least 1 non-zero bit.
+    static const auto LEADING_ZEROES_BIT_LENGTH = DATA_BIT_LENGTH - 1;
+
     while (source < source_end)
     {
         const T curr_value = unalignedLoadLE<T>(source);
         source += sizeof(curr_value);
 
         const auto xored_data = curr_value ^ prev_value;
-        const BinaryValueInfo curr_xored_info = getLeadingAndTrailingBits(xored_data);
+        const BinaryValueInfo curr_xored_info = getBinaryValueInfo(xored_data);
 
         if (xored_data == 0)
         {
@@ -265,11 +264,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest,
 template <typename T>
 void decompressDataForType(const char * source, UInt32 source_size, char * dest)
 {
-    static const auto DATA_BIT_LENGTH = getBitLengthOfLength(sizeof(T));
-    // -1 since there must be at least 1 non-zero bit.
-    static const auto LEADING_ZEROES_BIT_LENGTH = DATA_BIT_LENGTH - 1;
-
-    const char * source_end = source + source_size;
+    const char * const source_end = source + source_size;
 
     if (source + sizeof(UInt32) > source_end)
         return;
@@ -277,7 +272,7 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest)
     const UInt32 items_count = unalignedLoadLE<UInt32>(source);
     source += sizeof(items_count);
 
-    T prev_value{};
+    T prev_value = 0;
 
     // decoding first item
     if (source + sizeof(T) > source_end || items_count < 1)
@@ -293,13 +288,17 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest)
 
     BinaryValueInfo prev_xored_info{0, 0, 0};
 
+    static const auto DATA_BIT_LENGTH = getBitLengthOfLength(sizeof(T));
+    // -1 since there must be at least 1 non-zero bit.
+    static const auto LEADING_ZEROES_BIT_LENGTH = DATA_BIT_LENGTH - 1;
+
     // since data is tightly packed, up to 1 bit per value, and last byte is padded with zeroes,
     // we have to keep track of items to avoid reading more that there is.
     for (UInt32 items_read = 1; items_read < items_count && !reader.eof(); ++items_read)
     {
         T curr_value = prev_value;
         BinaryValueInfo curr_xored_info = prev_xored_info;
-        T xored_data{};
+        T xored_data = 0;
 
         if (reader.readBit() == 1)
         {
@@ -314,7 +313,7 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest)
 
         if (curr_xored_info.leading_zero_bits == 0
             && curr_xored_info.data_bits == 0
-            && curr_xored_info.trailing_zero_bits == 0)
+            && curr_xored_info.trailing_zero_bits == 0) [[unlikely]]
         {
             throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress gorilla-encoded data: corrupted input data.");
         }
@@ -403,7 +402,7 @@ UInt32 CompressionCodecGorilla::doCompressData(const char * source, UInt32 sourc
         break;
     }
 
-    return 1 + 1 + result_size;
+    return 2 + bytes_to_skip + result_size;
 }
 
 void CompressionCodecGorilla::doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const
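Gorilla compression XORs each value with its predecessor and then stores only the non-zero "middle" bits, described by the counts of leading and trailing zero bits; getBinaryValueInfo() above computes exactly that triple. The standalone sketch below reproduces the bookkeeping for 64-bit values as an illustration, without the codec's actual bit-stream format.

#include <bit>
#include <cstdint>

struct XorInfo
{
    uint8_t leading_zero_bits;
    uint8_t data_bits;
    uint8_t trailing_zero_bits;
};

/// Same idea as getBinaryValueInfo(): describe where the meaningful bits of a XORed value live.
XorInfo describeXor(uint64_t prev, uint64_t curr)
{
    uint64_t xored = prev ^ curr;
    if (xored == 0)
        return {0, 0, 0};                                  // identical values need almost no bits

    uint8_t lz = static_cast<uint8_t>(std::countl_zero(xored));
    uint8_t tz = static_cast<uint8_t>(std::countr_zero(xored));
    uint8_t data = static_cast<uint8_t>(64 - lz - tz);     // bits that actually have to be written
    return {lz, data, tz};
}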
@@ -11,13 +11,6 @@
 namespace DB
 {
 
-class ICompressionCodec;
-
-using CompressionCodecPtr = std::shared_ptr<ICompressionCodec>;
-using Codecs = std::vector<CompressionCodecPtr>;
-
-class IDataType;
-
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size);
 
 /**
@@ -120,7 +113,7 @@ protected:
     /// Return size of compressed data without header
     virtual UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const { return uncompressed_size; }
 
-    /// Actually compress data, without header
+    /// Actually compress data without header
    virtual UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const = 0;
 
     /// Actually decompress data without header
@@ -134,4 +127,7 @@ private:
     CodecMode decompressMode{CodecMode::Synchronous};
 };
 
+using CompressionCodecPtr = std::shared_ptr<ICompressionCodec>;
+using Codecs = std::vector<CompressionCodecPtr>;
+
 }
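The header keeps declaring LLVMFuzzerTestOneInput so that a libFuzzer harness can be granted access to codec internals. For orientation, a minimal sketch of such an entry point follows; the call inside it is a placeholder, not the project's actual fuzzing harness.

#include <cstddef>
#include <cstdint>

/// libFuzzer entry point: called repeatedly with mutated byte strings.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size)
{
    (void)data;
    (void)size;
    try
    {
        // feedToCodecUnderTest(data, size); // placeholder: hand the bytes to the codec being fuzzed
    }
    catch (...)
    {
        /// Throwing on malformed input is expected behaviour, not something the fuzzer should report.
    }
    return 0;
}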
@@ -528,8 +528,7 @@ class IColumn;
     M(Bool, allow_non_metadata_alters, true, "Allow to execute alters which affects not only tables metadata, but also data on disk", 0) \
     M(Bool, enable_global_with_statement, true, "Propagate WITH statements to UNION queries and all subqueries", 0) \
     M(Bool, aggregate_functions_null_for_empty, false, "Rewrite all aggregate functions in a query, adding -OrNull suffix to them", 0) \
-    M(Bool, optimize_syntax_fuse_functions, false, "Not ready for production, do not use. Allow apply syntax optimisation: fuse aggregate functions", 0) \
-    M(Bool, optimize_fuse_sum_count_avg, false, "Replace calls of functions `sum`, `avg`, `count` with identical arguments into one `sumCount`", 0) \
+    M(Bool, optimize_syntax_fuse_functions, false, "Allow apply fuse aggregating function. Available only with `allow_experimental_analyzer`", 0) \
     M(Bool, flatten_nested, true, "If true, columns of type Nested will be flatten to separate array columns instead of one array of tuples", 0) \
     M(Bool, asterisk_include_materialized_columns, false, "Include MATERIALIZED columns for wildcard query", 0) \
     M(Bool, asterisk_include_alias_columns, false, "Include ALIAS columns for wildcard query", 0) \
@@ -585,6 +584,7 @@ class IColumn;
     M(Bool, query_plan_optimize_primary_key, true, "Analyze primary key using query plan (instead of AST)", 0) \
     M(Bool, query_plan_read_in_order, true, "Use query plan for read-in-order optimisation", 0) \
     M(Bool, query_plan_aggregation_in_order, true, "Use query plan for aggregation-in-order optimisation", 0) \
+    M(Bool, query_plan_remove_redundant_sorting, false, "Remove redundant sorting in query plan. For example, sorting steps related to ORDER BY clauses in subqueries", 0) \
     M(UInt64, regexp_max_matches_per_row, 1000, "Max matches of any single regexp per row, used to safeguard 'extractAllGroupsHorizontal' against consuming too much memory with greedy RE.", 0) \
     \
     M(UInt64, limit, 0, "Limit on read rows from the most 'end' result for select query, default 0 means no limit length", 0) \
@@ -619,7 +619,7 @@ class IColumn;
     M(Bool, read_from_filesystem_cache_if_exists_otherwise_bypass_cache, false, "Allow to use the filesystem cache in passive mode - benefit from the existing cache entries, but don't put more entries into the cache. If you set this setting for heavy ad-hoc queries and leave it disabled for short real-time queries, this will allows to avoid cache threshing by too heavy queries and to improve the overall system efficiency.", 0) \
     M(Bool, enable_filesystem_cache_on_lower_level, true, "If read buffer supports caching inside threadpool, allow it to do it, otherwise cache outside ot threadpool. Do not use this setting, it is needed for testing", 0) \
     M(Bool, skip_download_if_exceeds_query_cache, true, "Skip download from remote filesystem if exceeds query cache size", 0) \
-    M(UInt64, max_query_cache_size, (128UL * 1024 * 1024 * 1024), "Max remote filesystem cache size that can be used by a single query", 0) \
+    M(UInt64, filesystem_cache_max_download_size, (128UL * 1024 * 1024 * 1024), "Max remote filesystem cache size that can be downloaded by a single query", 0) \
     M(Bool, throw_on_error_from_cache_on_write_operations, false, "Ignore error from cache when caching on write operations (INSERT, merges)", 0) \
     \
     M(Bool, load_marks_asynchronously, false, "Load MergeTree marks asynchronously", 0) \
@@ -726,6 +726,8 @@ class IColumn;
     MAKE_OBSOLETE(M, UInt64, max_pipeline_depth, 0) \
     MAKE_OBSOLETE(M, Seconds, temporary_live_view_timeout, 1) \
     MAKE_OBSOLETE(M, Milliseconds, async_insert_cleanup_timeout_ms, 1000) \
+    MAKE_OBSOLETE(M, Bool, optimize_fuse_sum_count_avg, 0) \
 
 
     /** The section above is for obsolete settings. Do not add anything there. */
 
@@ -765,6 +767,9 @@ class IColumn;
     M(UInt64, input_format_max_rows_to_read_for_schema_inference, 25000, "The maximum rows of data to read for automatic schema inference", 0) \
     M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \
     M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \
+    M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \
+    M(Bool, input_format_tsv_detect_header, true, "Automatically detect header with names and types in TSV format", 0) \
+    M(Bool, input_format_custom_detect_header, true, "Automatically detect header with names and types in CustomSeparated format", 0) \
     M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Parquet", 0) \
     M(Bool, input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip fields with unsupported types while schema inference for format Protobuf", 0) \
     M(Bool, input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format CapnProto", 0) \
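Each `M(type, name, default, description, flags)` line above is one row of the settings list macro, which is expanded in several places to generate members, defaults and metadata. A much-reduced, hypothetical sketch of that list-macro ("X macro") pattern follows; it is illustration only and not the actual Settings machinery.

#include <cstdint>
#include <string>

// Hypothetical, simplified settings list: one macro row per setting.
#define APPLY_FOR_EXAMPLE_SETTINGS(M) \
    M(bool, input_format_csv_detect_header, true) \
    M(uint64_t, regexp_max_matches_per_row, 1000)

struct ExampleSettings
{
// Expand the list once to declare one member per setting with its default value.
#define DECLARE_SETTING(TYPE, NAME, DEFAULT) TYPE NAME = DEFAULT;
    APPLY_FOR_EXAMPLE_SETTINGS(DECLARE_SETTING)
#undef DECLARE_SETTING
};

// The same list can be expanded again elsewhere, e.g. to enumerate every setting name.
inline std::string listSettingNames()
{
    std::string names;
#define APPEND_NAME(TYPE, NAME, DEFAULT) names += #NAME; names += '\n';
    APPLY_FOR_EXAMPLE_SETTINGS(APPEND_NAME)
#undef APPEND_NAME
    return names;
}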
@@ -81,6 +81,9 @@ namespace SettingsChangesHistory
 static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> settings_changes_history =
 {
     {"23.1", {{"input_format_json_read_objects_as_strings", 0, 1, "Enable reading nested json objects as strings while object type is experimental"},
+              {"input_format_csv_detect_header", false, true, "Detect header in CSV format by default"},
+              {"input_format_tsv_detect_header", false, true, "Detect header in TSV format by default"},
+              {"input_format_custom_detect_header", false, true, "Detect header in CustomSeparated format by default"},
               {"input_format_json_defaults_for_missing_elements_in_named_tuple", false, true, "Allow missing elements in JSON objects while reading named tuples by default"}}},
     {"22.12", {{"max_size_to_preallocate_for_aggregation", 10'000'000, 100'000'000, "This optimizes performance"},
                {"query_plan_aggregation_in_order", 0, 1, "Enable some refactoring around query plan"},
@@ -26,8 +26,18 @@ namespace ErrorCodes
     extern const int DATA_TYPE_CANNOT_HAVE_ARGUMENTS;
 }
 
 
 DataTypePtr DataTypeFactory::get(const String & full_name) const
+{
+    return getImpl<false>(full_name);
+}
+
+DataTypePtr DataTypeFactory::tryGet(const String & full_name) const
+{
+    return getImpl<true>(full_name);
+}
+
+template <bool nullptr_on_error>
+DataTypePtr DataTypeFactory::getImpl(const String & full_name) const
 {
     /// Data type parser can be invoked from coroutines with small stack.
     /// Value 315 is known to cause stack overflow in some test configurations (debug build, sanitizers)
@@ -41,34 +51,75 @@ DataTypePtr DataTypeFactory::get(const String & full_name) const
 #endif
 
     ParserDataType parser;
-    ASTPtr ast = parseQuery(parser, full_name.data(), full_name.data() + full_name.size(), "data type", 0, data_type_max_parse_depth);
-    return get(ast);
+    ASTPtr ast;
+    if constexpr (nullptr_on_error)
+    {
+        String out_err;
+        const char * start = full_name.data();
+        ast = tryParseQuery(parser, start, start + full_name.size(), out_err, false, "data type", false, DBMS_DEFAULT_MAX_QUERY_SIZE, data_type_max_parse_depth);
+        if (!ast)
+            return nullptr;
+    }
+    else
+    {
+        ast = parseQuery(parser, full_name.data(), full_name.data() + full_name.size(), "data type", false, data_type_max_parse_depth);
+    }
+
+    return getImpl<nullptr_on_error>(ast);
 }
 
 DataTypePtr DataTypeFactory::get(const ASTPtr & ast) const
+{
+    return getImpl<false>(ast);
+}
+
+DataTypePtr DataTypeFactory::tryGet(const ASTPtr & ast) const
+{
+    return getImpl<true>(ast);
+}
+
+template <bool nullptr_on_error>
+DataTypePtr DataTypeFactory::getImpl(const ASTPtr & ast) const
 {
     if (const auto * func = ast->as<ASTFunction>())
     {
         if (func->parameters)
+        {
+            if constexpr (nullptr_on_error)
+                return nullptr;
             throw Exception(ErrorCodes::ILLEGAL_SYNTAX_FOR_DATA_TYPE, "Data type cannot have multiple parenthesized parameters.");
-        return get(func->name, func->arguments);
+        }
+        return getImpl<nullptr_on_error>(func->name, func->arguments);
     }
 
     if (const auto * ident = ast->as<ASTIdentifier>())
     {
-        return get(ident->name(), {});
+        return getImpl<nullptr_on_error>(ident->name(), {});
     }
 
     if (const auto * lit = ast->as<ASTLiteral>())
     {
         if (lit->value.isNull())
-            return get("Null", {});
+            return getImpl<nullptr_on_error>("Null", {});
     }
 
+    if constexpr (nullptr_on_error)
+        return nullptr;
     throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, "Unexpected AST element for data type.");
 }
 
 DataTypePtr DataTypeFactory::get(const String & family_name_param, const ASTPtr & parameters) const
+{
+    return getImpl<false>(family_name_param, parameters);
+}
+
+DataTypePtr DataTypeFactory::tryGet(const String & family_name_param, const ASTPtr & parameters) const
+{
+    return getImpl<true>(family_name_param, parameters);
+}
+
+template <bool nullptr_on_error>
+DataTypePtr DataTypeFactory::getImpl(const String & family_name_param, const ASTPtr & parameters) const
 {
     String family_name = getAliasToOrName(family_name_param);
 
@@ -86,10 +137,29 @@ DataTypePtr DataTypeFactory::get(const String & family_name_param, const ASTPtr
             else
                 low_cardinality_params->children.push_back(std::make_shared<ASTIdentifier>(param_name));
 
-        return get("LowCardinality", low_cardinality_params);
+        return getImpl<nullptr_on_error>("LowCardinality", low_cardinality_params);
     }
 
-    return findCreatorByName(family_name)(parameters);
+    const auto * creator = findCreatorByName<nullptr_on_error>(family_name);
+    if constexpr (nullptr_on_error)
+    {
+        if (!creator)
+            return nullptr;
+
+        try
+        {
+            return (*creator)(parameters);
+        }
+        catch (...)
+        {
+            return nullptr;
+        }
+    }
+    else
+    {
+        assert(creator);
+        return (*creator)(parameters);
+    }
 }
 
 DataTypePtr DataTypeFactory::getCustom(DataTypeCustomDescPtr customization) const
@@ -155,19 +225,19 @@ void DataTypeFactory::registerSimpleDataTypeCustom(const String &name, SimpleCre
     }, case_sensitiveness);
 }
 
-const DataTypeFactory::Value & DataTypeFactory::findCreatorByName(const String & family_name) const
+template <bool nullptr_on_error>
+const DataTypeFactory::Value * DataTypeFactory::findCreatorByName(const String & family_name) const
 {
     ContextPtr query_context;
     if (CurrentThread::isInitialized())
         query_context = CurrentThread::get().getQueryContext();
 
     {
         DataTypesDictionary::const_iterator it = data_types.find(family_name);
|
DataTypesDictionary::const_iterator it = data_types.find(family_name);
|
||||||
if (data_types.end() != it)
|
if (data_types.end() != it)
|
||||||
{
|
{
|
||||||
if (query_context && query_context->getSettingsRef().log_queries)
|
if (query_context && query_context->getSettingsRef().log_queries)
|
||||||
query_context->addQueryFactoriesInfo(Context::QueryLogFactories::DataType, family_name);
|
query_context->addQueryFactoriesInfo(Context::QueryLogFactories::DataType, family_name);
|
||||||
return it->second;
|
return &it->second;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -179,10 +249,13 @@ const DataTypeFactory::Value & DataTypeFactory::findCreatorByName(const String &
|
|||||||
{
|
{
|
||||||
if (query_context && query_context->getSettingsRef().log_queries)
|
if (query_context && query_context->getSettingsRef().log_queries)
|
||||||
query_context->addQueryFactoriesInfo(Context::QueryLogFactories::DataType, family_name_lowercase);
|
query_context->addQueryFactoriesInfo(Context::QueryLogFactories::DataType, family_name_lowercase);
|
||||||
return it->second;
|
return &it->second;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if constexpr (nullptr_on_error)
|
||||||
|
return nullptr;
|
||||||
|
|
||||||
auto hints = this->getHints(family_name);
|
auto hints = this->getHints(family_name);
|
||||||
if (!hints.empty())
|
if (!hints.empty())
|
||||||
throw Exception(ErrorCodes::UNKNOWN_TYPE, "Unknown data type family: {}. Maybe you meant: {}", family_name, toString(hints));
|
throw Exception(ErrorCodes::UNKNOWN_TYPE, "Unknown data type family: {}. Maybe you meant: {}", family_name, toString(hints));
|
||||||
|
@ -36,6 +36,11 @@ public:
|
|||||||
DataTypePtr get(const ASTPtr & ast) const;
|
DataTypePtr get(const ASTPtr & ast) const;
|
||||||
DataTypePtr getCustom(DataTypeCustomDescPtr customization) const;
|
DataTypePtr getCustom(DataTypeCustomDescPtr customization) const;
|
||||||
|
|
||||||
|
/// Return nullptr in case of error.
|
||||||
|
DataTypePtr tryGet(const String & full_name) const;
|
||||||
|
DataTypePtr tryGet(const String & family_name, const ASTPtr & parameters) const;
|
||||||
|
DataTypePtr tryGet(const ASTPtr & ast) const;
|
||||||
|
|
||||||
/// Register a type family by its name.
|
/// Register a type family by its name.
|
||||||
void registerDataType(const String & family_name, Value creator, CaseSensitiveness case_sensitiveness = CaseSensitive);
|
void registerDataType(const String & family_name, Value creator, CaseSensitiveness case_sensitiveness = CaseSensitive);
|
||||||
|
|
||||||
@ -49,7 +54,14 @@ public:
|
|||||||
void registerSimpleDataTypeCustom(const String & name, SimpleCreatorWithCustom creator, CaseSensitiveness case_sensitiveness = CaseSensitive);
|
void registerSimpleDataTypeCustom(const String & name, SimpleCreatorWithCustom creator, CaseSensitiveness case_sensitiveness = CaseSensitive);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const Value & findCreatorByName(const String & family_name) const;
|
template <bool nullptr_on_error>
|
||||||
|
DataTypePtr getImpl(const String & full_name) const;
|
||||||
|
template <bool nullptr_on_error>
|
||||||
|
DataTypePtr getImpl(const String & family_name, const ASTPtr & parameters) const;
|
||||||
|
template <bool nullptr_on_error>
|
||||||
|
DataTypePtr getImpl(const ASTPtr & ast) const;
|
||||||
|
template <bool nullptr_on_error>
|
||||||
|
const Value * findCreatorByName(const String & family_name) const;
|
||||||
|
|
||||||
DataTypesDictionary data_types;
|
DataTypesDictionary data_types;
|
||||||
|
|
||||||
|
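Illustrative note (not part of the commit): the factory changes above follow a common pattern where a throwing get and a non-throwing tryGet share one getImpl<bool nullptr_on_error> template, and the error path is selected at compile time with if constexpr. A minimal self-contained sketch of that pattern, using a hypothetical ThingFactory rather than ClickHouse's DataTypeFactory:

#include <map>
#include <memory>
#include <stdexcept>
#include <string>

struct Thing { std::string name; };
using ThingPtr = std::shared_ptr<Thing>;

class ThingFactory
{
public:
    /// Throws on unknown names, like DataTypeFactory::get.
    ThingPtr get(const std::string & name) const { return getImpl<false>(name); }
    /// Returns nullptr on unknown names, like the new tryGet.
    ThingPtr tryGet(const std::string & name) const { return getImpl<true>(name); }

    void registerThing(const std::string & name) { known.emplace(name, std::make_shared<Thing>(Thing{name})); }

private:
    template <bool nullptr_on_error>
    ThingPtr getImpl(const std::string & name) const
    {
        auto it = known.find(name);
        if (it == known.end())
        {
            if constexpr (nullptr_on_error)
                return nullptr;           /// tryGet path: report failure without exceptions
            throw std::runtime_error("Unknown thing: " + name);
        }
        return it->second;
    }

    std::map<std::string, ThingPtr> known;
};

int main()
{
    ThingFactory factory;
    factory.registerThing("UInt64");
    auto ok = factory.get("UInt64");          /// found, no exception
    auto missing = factory.tryGet("FooBar");  /// nullptr instead of a throw
    return (ok && !missing) ? 0 : 1;
}

Because the branch is resolved with if constexpr, the throwing code is not even instantiated for the tryGet path, which is presumably why the commit routes both entry points through a single template instead of duplicating the parsing and lookup logic.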
@@ -379,10 +379,14 @@ void DatabaseOnDisk::renameTable(
     if (dictionary && table && !table->isDictionary())
         throw Exception(ErrorCodes::INCORRECT_QUERY, "Use RENAME/EXCHANGE TABLE (instead of RENAME/EXCHANGE DICTIONARY) for tables");
 
-    table_lock = table->lockExclusively(
-        local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout);
+    /// We have to lock the table before detaching, because otherwise lockExclusively will throw. But the table may not exist.
+    bool need_lock = table != nullptr;
+    if (need_lock)
+        table_lock = table->lockExclusively(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout);
 
     detachTable(local_context, table_name);
+    if (!need_lock)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Table was detached without locking, it's a bug");
 
     UUID prev_uuid = UUIDHelpers::Nil;
     try
@@ -14,6 +14,7 @@ namespace ErrorCodes
     extern const int PATH_ACCESS_DENIED;
 }
 
+static std::mutex init_sqlite_db_mutex;
 
 void processSQLiteError(const String & message, bool throw_on_error)
 {
@@ -52,7 +53,11 @@ SQLitePtr openSQLiteDB(const String & path, ContextPtr context, bool throw_on_er
         LOG_DEBUG(&Poco::Logger::get("SQLite"), "SQLite database path {} does not exist, will create an empty SQLite database", database_path);
 
     sqlite3 * tmp_sqlite_db = nullptr;
-    int status = sqlite3_open(database_path.c_str(), &tmp_sqlite_db);
+    int status;
+    {
+        std::lock_guard lock(init_sqlite_db_mutex);
+        status = sqlite3_open(database_path.c_str(), &tmp_sqlite_db);
+    }
 
     if (status != SQLITE_OK)
     {
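Illustrative note (not part of the commit): the hunk above serializes sqlite3_open behind a process-wide mutex, presumably to guard concurrent first-time initialization of the SQLite library. A minimal standalone sketch of the same pattern against the real SQLite C API; openSerialized is a hypothetical helper name:

#include <mutex>
#include <sqlite3.h>

/// Process-wide mutex: only the open call is serialized, the returned handle
/// is still per-connection and used without this lock.
static std::mutex init_sqlite_db_mutex;

sqlite3 * openSerialized(const char * path)
{
    sqlite3 * db = nullptr;
    int status;
    {
        std::lock_guard<std::mutex> lock(init_sqlite_db_mutex);
        status = sqlite3_open(path, &db);
    }
    if (status != SQLITE_OK)
    {
        sqlite3_close(db);  /// SQLite may allocate a handle even on a failed open
        return nullptr;
    }
    return db;
}

int main()
{
    sqlite3 * db = openSerialized(":memory:");
    if (!db)
        return 1;
    sqlite3_close(db);
    return 0;
}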
@@ -4,7 +4,6 @@
 #include <chrono>
 #include <cmath>
 #include <mutex>
-#include <shared_mutex>
 #include <utility>
 #include <vector>
 
@@ -14,6 +13,7 @@
 
 #include <Common/randomSeed.h>
 #include <Common/ThreadPool.h>
+#include <Common/SharedMutex.h>
 #include <Common/CurrentMetrics.h>
 
 #include <Dictionaries/IDictionary.h>
@@ -206,7 +206,7 @@ private:
     /// This lock is used for the inner cache state update function lock it for
     /// write, when it need to update cache state all other functions just
     /// readers. Surprisingly this lock is also used for last_exception pointer.
-    mutable std::shared_mutex rw_lock;
+    mutable SharedMutex rw_lock;
 
     mutable std::exception_ptr last_exception;
     mutable std::atomic<size_t> error_count {0};
@@ -2,7 +2,6 @@
 
 #include <atomic>
 #include <mutex>
-#include <shared_mutex>
 #include <utility>
 #include <vector>
 #include <functional>

File diff suppressed because it is too large
13 src/Dictionaries/RangeHashedDictionaryComplex.cpp Normal file
@@ -0,0 +1,13 @@
+#include <Dictionaries/RangeHashedDictionary.h>
+
+/// RangeHashedDictionary is instantiated from two files
+/// RangeHashedDictionarySimple.cpp and RangeHashedDictionaryComplex.cpp
+/// to better parallelize the build procedure and avoid MSan build failure
+/// due to excessive resource consumption.
+
+namespace DB
+{
+
+template class RangeHashedDictionary<DictionaryKeyType::Complex>;
+
+}

13 src/Dictionaries/RangeHashedDictionarySimple.cpp Normal file
@@ -0,0 +1,13 @@
+#include <Dictionaries/RangeHashedDictionary.h>
+
+/// RangeHashedDictionary is instantiated from two files
+/// RangeHashedDictionarySimple.cpp and RangeHashedDictionaryComplex.cpp
+/// to better parallelize the build procedure and avoid MSan build failure
+/// due to excessive resource consumption.
+
+namespace DB
+{
+
+template class RangeHashedDictionary<DictionaryKeyType::Simple>;
+
+}
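Illustrative note (not part of the commit): the comment in these new files describes a common build-time technique, splitting heavy explicit template instantiations across translation units so the compiler (and sanitizer) work is spread over parallel compile jobs. A hypothetical single-file sketch of the layout, with the real file boundaries marked as comments; the class and file names here are invented for illustration:

// file: range_dictionary.h -- shared template definition; instantiations are
// declared extern so translation units that merely use the class do not
// re-instantiate it.
template <typename Key>
class RangeDictionary
{
public:
    void add(Key key) { ++count; (void)key; }
    int size() const { return count; }
private:
    int count = 0;
};

extern template class RangeDictionary<int>;
extern template class RangeDictionary<long>;

// file: RangeDictionarySimple.cpp -- one heavy instantiation per .cpp,
// compiled as its own job.
template class RangeDictionary<int>;

// file: RangeDictionaryComplex.cpp -- the other instantiation lives here.
template class RangeDictionary<long>;

// file: main.cpp -- ordinary users just include the header and link.
int main()
{
    RangeDictionary<int> dict;
    dict.add(42);
    return dict.size() == 1 ? 0 : 1;
}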
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <Common/SharedMutex.h>
 #include <Disks/IDisk.h>
 #include <Disks/ObjectStorages/IMetadataStorage.h>
 #include <Disks/ObjectStorages/MetadataFromDiskTransactionState.h>
@@ -15,7 +16,7 @@ class FakeMetadataStorageFromDisk final : public IMetadataStorage
 private:
     friend class FakeMetadataStorageFromDiskTransaction;
 
-    mutable std::shared_mutex metadata_mutex;
+    mutable SharedMutex metadata_mutex;
 
     DiskPtr disk;
     ObjectStoragePtr object_storage;
@@ -83,7 +83,7 @@ std::string MetadataStorageFromDisk::readInlineDataToString(const std::string &
     return readMetadata(path)->getInlineData();
 }
 
-DiskObjectStorageMetadataPtr MetadataStorageFromDisk::readMetadataUnlocked(const std::string & path, std::shared_lock<std::shared_mutex> &) const
+DiskObjectStorageMetadataPtr MetadataStorageFromDisk::readMetadataUnlocked(const std::string & path, std::shared_lock<SharedMutex> &) const
 {
     auto metadata = std::make_unique<DiskObjectStorageMetadata>(disk->getPath(), object_storage_root_path, path);
     auto str = readFileToString(path);
@@ -91,7 +91,7 @@ DiskObjectStorageMetadataPtr MetadataStorageFromDisk::readMetadataUnlocked(const
     return metadata;
 }
 
-DiskObjectStorageMetadataPtr MetadataStorageFromDisk::readMetadataUnlocked(const std::string & path, std::unique_lock<std::shared_mutex> &) const
+DiskObjectStorageMetadataPtr MetadataStorageFromDisk::readMetadataUnlocked(const std::string & path, std::unique_lock<SharedMutex> &) const
 {
     auto metadata = std::make_unique<DiskObjectStorageMetadata>(disk->getPath(), object_storage_root_path, path);
     auto str = readFileToString(path);
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <Common/SharedMutex.h>
 #include <Disks/ObjectStorages/IMetadataStorage.h>
 
 #include <Disks/IDisk.h>
@@ -17,7 +18,7 @@ class MetadataStorageFromDisk final : public IMetadataStorage
 private:
     friend class MetadataStorageFromDiskTransaction;
 
-    mutable std::shared_mutex metadata_mutex;
+    mutable SharedMutex metadata_mutex;
 
     DiskPtr disk;
     std::string object_storage_root_path;
@@ -67,8 +68,8 @@ public:
 
     DiskObjectStorageMetadataPtr readMetadata(const std::string & path) const;
 
-    DiskObjectStorageMetadataPtr readMetadataUnlocked(const std::string & path, std::unique_lock<std::shared_mutex> & lock) const;
-    DiskObjectStorageMetadataPtr readMetadataUnlocked(const std::string & path, std::shared_lock<std::shared_mutex> & lock) const;
+    DiskObjectStorageMetadataPtr readMetadataUnlocked(const std::string & path, std::unique_lock<SharedMutex> & lock) const;
+    DiskObjectStorageMetadataPtr readMetadataUnlocked(const std::string & path, std::shared_lock<SharedMutex> & lock) const;
 };
 
 class MetadataStorageFromDiskTransaction final : public IMetadataTransaction
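Illustrative note (not part of the commit): these hunks swap std::shared_mutex for ClickHouse's SharedMutex while keeping the standard std::unique_lock/std::shared_lock adaptors, which only require the lock()/unlock()/lock_shared()/unlock_shared() interface; that SharedMutex provides exactly this interface is an assumption inferred from how it is used here. A generic sketch of the reader/writer pattern, shown with std::shared_mutex so it compiles standalone:

#include <map>
#include <mutex>
#include <shared_mutex>
#include <string>

/// Generic over the mutex type: std::shared_mutex here, DB::SharedMutex in ClickHouse
/// (assumed drop-in), so the lock adaptors below keep working unchanged after the swap.
template <typename SharedMutexT>
class MetadataCache
{
public:
    std::string read(const std::string & key) const
    {
        std::shared_lock<SharedMutexT> lock(mutex);  /// many concurrent readers
        auto it = data.find(key);
        return it == data.end() ? std::string{} : it->second;
    }

    void write(const std::string & key, std::string value)
    {
        std::unique_lock<SharedMutexT> lock(mutex);  /// exclusive writer
        data[key] = std::move(value);
    }

private:
    mutable SharedMutexT mutex;
    std::map<std::string, std::string> data;
};

int main()
{
    MetadataCache<std::shared_mutex> cache;
    cache.write("a", "1");
    return cache.read("a").empty() ? 1 : 0;
}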
@@ -26,7 +26,7 @@ SetLastModifiedOperation::SetLastModifiedOperation(const std::string & path_, Po
 {
 }
 
-void SetLastModifiedOperation::execute(std::unique_lock<std::shared_mutex> &)
+void SetLastModifiedOperation::execute(std::unique_lock<SharedMutex> &)
 {
     old_timestamp = disk.getLastModified(path);
     disk.setLastModified(path, new_timestamp);
@@ -44,7 +44,7 @@ ChmodOperation::ChmodOperation(const std::string & path_, mode_t mode_, IDisk &
 {
 }
 
-void ChmodOperation::execute(std::unique_lock<std::shared_mutex> &)
+void ChmodOperation::execute(std::unique_lock<SharedMutex> &)
 {
     old_mode = disk.stat(path).st_mode;
     disk.chmod(path, mode);
@@ -61,7 +61,7 @@ UnlinkFileOperation::UnlinkFileOperation(const std::string & path_, IDisk & disk
 {
 }
 
-void UnlinkFileOperation::execute(std::unique_lock<std::shared_mutex> &)
+void UnlinkFileOperation::execute(std::unique_lock<SharedMutex> &)
 {
     auto buf = disk.readFile(path, ReadSettings{}, std::nullopt, disk.getFileSize(path));
     readStringUntilEOF(prev_data, *buf);
@@ -81,7 +81,7 @@ CreateDirectoryOperation::CreateDirectoryOperation(const std::string & path_, ID
 {
 }
 
-void CreateDirectoryOperation::execute(std::unique_lock<std::shared_mutex> &)
+void CreateDirectoryOperation::execute(std::unique_lock<SharedMutex> &)
 {
     disk.createDirectory(path);
 }
@@ -97,7 +97,7 @@ CreateDirectoryRecursiveOperation::CreateDirectoryRecursiveOperation(const std::
 {
 }
 
-void CreateDirectoryRecursiveOperation::execute(std::unique_lock<std::shared_mutex> &)
+void CreateDirectoryRecursiveOperation::execute(std::unique_lock<SharedMutex> &)
 {
     namespace fs = std::filesystem;
     fs::path p(path);
@@ -124,7 +124,7 @@ RemoveDirectoryOperation::RemoveDirectoryOperation(const std::string & path_, ID
 {
 }
 
-void RemoveDirectoryOperation::execute(std::unique_lock<std::shared_mutex> &)
+void RemoveDirectoryOperation::execute(std::unique_lock<SharedMutex> &)
 {
     disk.removeDirectory(path);
 }
@@ -141,7 +141,7 @@ RemoveRecursiveOperation::RemoveRecursiveOperation(const std::string & path_, ID
 {
 }
 
-void RemoveRecursiveOperation::execute(std::unique_lock<std::shared_mutex> &)
+void RemoveRecursiveOperation::execute(std::unique_lock<SharedMutex> &)
 {
     if (disk.isFile(path))
         disk.moveFile(path, temp_path);
@@ -174,7 +174,7 @@ CreateHardlinkOperation::CreateHardlinkOperation(const std::string & path_from_,
 {
 }
 
-void CreateHardlinkOperation::execute(std::unique_lock<std::shared_mutex> & lock)
+void CreateHardlinkOperation::execute(std::unique_lock<SharedMutex> & lock)
 {
     auto metadata = metadata_storage.readMetadataUnlocked(path_from, lock);
 
@@ -201,7 +201,7 @@ MoveFileOperation::MoveFileOperation(const std::string & path_from_, const std::
 {
 }
 
-void MoveFileOperation::execute(std::unique_lock<std::shared_mutex> &)
+void MoveFileOperation::execute(std::unique_lock<SharedMutex> &)
 {
     disk.moveFile(path_from, path_to);
 }
@@ -218,7 +218,7 @@ MoveDirectoryOperation::MoveDirectoryOperation(const std::string & path_from_, c
 {
 }
 
-void MoveDirectoryOperation::execute(std::unique_lock<std::shared_mutex> &)
+void MoveDirectoryOperation::execute(std::unique_lock<SharedMutex> &)
 {
     disk.moveDirectory(path_from, path_to);
 }
@@ -236,7 +236,7 @@ ReplaceFileOperation::ReplaceFileOperation(const std::string & path_from_, const
 {
 }
 
-void ReplaceFileOperation::execute(std::unique_lock<std::shared_mutex> &)
+void ReplaceFileOperation::execute(std::unique_lock<SharedMutex> &)
 {
     if (disk.exists(path_to))
         disk.moveFile(path_to, temp_path_to);
@@ -262,7 +262,7 @@ WriteFileOperation::WriteFileOperation(const std::string & path_, IDisk & disk_,
 {
 }
 
-void WriteFileOperation::execute(std::unique_lock<std::shared_mutex> &)
+void WriteFileOperation::execute(std::unique_lock<SharedMutex> &)
 {
     if (disk.exists(path))
     {
@@ -288,7 +288,7 @@ void WriteFileOperation::undo()
     }
 }
 
-void AddBlobOperation::execute(std::unique_lock<std::shared_mutex> & metadata_lock)
+void AddBlobOperation::execute(std::unique_lock<SharedMutex> & metadata_lock)
 {
     DiskObjectStorageMetadataPtr metadata;
     if (metadata_storage.exists(path))
@@ -309,7 +309,7 @@ void AddBlobOperation::undo()
     write_operation->undo();
 }
 
-void UnlinkMetadataFileOperation::execute(std::unique_lock<std::shared_mutex> & metadata_lock)
+void UnlinkMetadataFileOperation::execute(std::unique_lock<SharedMutex> & metadata_lock)
 {
     auto metadata = metadata_storage.readMetadataUnlocked(path, metadata_lock);
     uint32_t ref_count = metadata->getRefCount();
@@ -336,7 +336,7 @@ void UnlinkMetadataFileOperation::undo()
     write_operation->undo();
 }
 
-void SetReadonlyFileOperation::execute(std::unique_lock<std::shared_mutex> & metadata_lock)
+void SetReadonlyFileOperation::execute(std::unique_lock<SharedMutex> & metadata_lock)
 {
     auto metadata = metadata_storage.readMetadataUnlocked(path, metadata_lock);
     metadata->setReadOnly();
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <Common/SharedMutex.h>
 #include <Disks/ObjectStorages/IMetadataStorage.h>
 
 namespace DB
@@ -13,7 +14,7 @@ class IDisk;
 
 struct IMetadataOperation
 {
-    virtual void execute(std::unique_lock<std::shared_mutex> & metadata_lock) = 0;
+    virtual void execute(std::unique_lock<SharedMutex> & metadata_lock) = 0;
     virtual void undo() = 0;
     virtual void finalize() {}
     virtual ~IMetadataOperation() = default;
@@ -26,7 +27,7 @@ struct SetLastModifiedOperation final : public IMetadataOperation
 {
     SetLastModifiedOperation(const std::string & path_, Poco::Timestamp new_timestamp_, IDisk & disk_);
 
-    void execute(std::unique_lock<std::shared_mutex> & metadata_lock) override;
+    void execute(std::unique_lock<SharedMutex> & metadata_lock) override;
 
     void undo() override;
 
@@ -41,7 +42,7 @@ struct ChmodOperation final : public IMetadataOperation
 {
     ChmodOperation(const std::string & path_, mode_t mode_, IDisk & disk_);
 
-    void execute(std::unique_lock<std::shared_mutex> & metadata_lock) override;
+    void execute(std::unique_lock<SharedMutex> & metadata_lock) override;
 
     void undo() override;
 
@@ -57,7 +58,7 @@ struct UnlinkFileOperation final : public IMetadataOperation
 {
     UnlinkFileOperation(const std::string & path_, IDisk & disk_);
 
-    void execute(std::unique_lock<std::shared_mutex> & metadata_lock) override;
+    void execute(std::unique_lock<SharedMutex> & metadata_lock) override;
 
     void undo() override;
 
@@ -72,7 +73,7 @@ struct CreateDirectoryOperation final : public IMetadataOperation
 {
     CreateDirectoryOperation(const std::string & path_, IDisk & disk_);
 
-    void execute(std::unique_lock<std::shared_mutex> & metadata_lock) override;
+    void execute(std::unique_lock<SharedMutex> & metadata_lock) override;
 
     void undo() override;
 
@@ -86,7 +87,7 @@ struct CreateDirectoryRecursiveOperation final : public IMetadataOperation
 {
     CreateDirectoryRecursiveOperation(const std::string & path_, IDisk & disk_);
 
-    void execute(std::unique_lock<std::shared_mutex> & metadata_lock) override;
+    void execute(std::unique_lock<SharedMutex> & metadata_lock) override;
 
     void undo() override;
 
@@ -101,7 +102,7 @@ struct RemoveDirectoryOperation final : public IMetadataOperation
 {
     RemoveDirectoryOperation(const std::string & path_, IDisk & disk_);
 
-    void execute(std::unique_lock<std::shared_mutex> & metadata_lock) override;
+    void execute(std::unique_lock<SharedMutex> & metadata_lock) override;
 
     void undo() override;
 
@@ -114,7 +115,7 @@ struct RemoveRecursiveOperation final : public IMetadataOperation
 {
     RemoveRecursiveOperation(const std::string & path_, IDisk & disk_);
 
-    void execute(std::unique_lock<std::shared_mutex> & metadata_lock) override;
+    void execute(std::unique_lock<SharedMutex> & metadata_lock) override;
 
     void undo() override;
 
@@ -130,7 +131,7 @@ struct WriteFileOperation final : public IMetadataOperation
 {
     WriteFileOperation(const std::string & path_, IDisk & disk_, const std::string & data_);
 
-    void execute(std::unique_lock<std::shared_mutex> & metadata_lock) override;
+    void execute(std::unique_lock<SharedMutex> & metadata_lock) override;
 
     void undo() override;
 private:
@@ -149,7 +150,7 @@ struct CreateHardlinkOperation final : public IMetadataOperation
         IDisk & disk_,
         const MetadataStorageFromDisk & metadata_storage_);
 
-    void execute(std::unique_lock<std::shared_mutex> & metadata_lock) override;
+    void execute(std::unique_lock<SharedMutex> & metadata_lock) override;
 
     void undo() override;
 
@@ -166,7 +167,7 @@ struct MoveFileOperation final : public IMetadataOperation
 {
     MoveFileOperation(const std::string & path_from_, const std::string & path_to_, IDisk & disk_);
 
-    void execute(std::unique_lock<std::shared_mutex> & metadata_lock) override;
+    void execute(std::unique_lock<SharedMutex> & metadata_lock) override;
 
     void undo() override;
 
@@ -181,7 +182,7 @@ struct MoveDirectoryOperation final : public IMetadataOperation
 {
     MoveDirectoryOperation(const std::string & path_from_, const std::string & path_to_, IDisk & disk_);
 
-    void execute(std::unique_lock<std::shared_mutex> & metadata_lock) override;
+    void execute(std::unique_lock<SharedMutex> & metadata_lock) override;
 
     void undo() override;
 
@@ -196,7 +197,7 @@ struct ReplaceFileOperation final : public IMetadataOperation
 {
     ReplaceFileOperation(const std::string & path_from_, const std::string & path_to_, IDisk & disk_);
 
-    void execute(std::unique_lock<std::shared_mutex> & metadata_lock) override;
+    void execute(std::unique_lock<SharedMutex> & metadata_lock) override;
 
     void undo() override;
 
@@ -226,7 +227,7 @@ struct AddBlobOperation final : public IMetadataOperation
         , metadata_storage(metadata_storage_)
     {}
 
-    void execute(std::unique_lock<std::shared_mutex> & metadata_lock) override;
+    void execute(std::unique_lock<SharedMutex> & metadata_lock) override;
 
     void undo() override;
 
@@ -254,7 +255,7 @@ struct UnlinkMetadataFileOperation final : public IMetadataOperation
     {
     }
 
-    void execute(std::unique_lock<std::shared_mutex> & metadata_lock) override;
+    void execute(std::unique_lock<SharedMutex> & metadata_lock) override;
 
     void undo() override;
 
@@ -279,7 +280,7 @@ struct SetReadonlyFileOperation final : public IMetadataOperation
    {
    }
 
-    void execute(std::unique_lock<std::shared_mutex> & metadata_lock) override;
+    void execute(std::unique_lock<SharedMutex> & metadata_lock) override;
 
     void undo() override;
 
@@ -16,18 +16,13 @@
 #include <IO/WriteBufferFromS3.h>
 #include <IO/ReadBufferFromS3.h>
 #include <IO/SeekAvoidingReadBuffer.h>
+#include <IO/S3/copyS3File.h>
 #include <Interpreters/threadPoolCallbackRunner.h>
 #include <Disks/ObjectStorages/S3/diskSettings.h>
 
-#include <aws/s3/model/CopyObjectRequest.h>
 #include <aws/s3/model/ListObjectsV2Request.h>
-#include <aws/s3/model/HeadObjectRequest.h>
 #include <aws/s3/model/DeleteObjectRequest.h>
 #include <aws/s3/model/DeleteObjectsRequest.h>
-#include <aws/s3/model/CreateMultipartUploadRequest.h>
-#include <aws/s3/model/CompleteMultipartUploadRequest.h>
-#include <aws/s3/model/UploadPartCopyRequest.h>
-#include <aws/s3/model/AbortMultipartUploadRequest.h>
 
 #include <Common/getRandomASCIIString.h>
 #include <Common/StringUtils/StringUtils.h>
@@ -39,22 +34,9 @@
 namespace ProfileEvents
 {
     extern const Event S3DeleteObjects;
-    extern const Event S3HeadObject;
     extern const Event S3ListObjects;
-    extern const Event S3CopyObject;
-    extern const Event S3CreateMultipartUpload;
-    extern const Event S3UploadPartCopy;
-    extern const Event S3AbortMultipartUpload;
-    extern const Event S3CompleteMultipartUpload;
 
     extern const Event DiskS3DeleteObjects;
-    extern const Event DiskS3HeadObject;
     extern const Event DiskS3ListObjects;
-    extern const Event DiskS3CopyObject;
-    extern const Event DiskS3CreateMultipartUpload;
-    extern const Event DiskS3UploadPartCopy;
-    extern const Event DiskS3AbortMultipartUpload;
-    extern const Event DiskS3CompleteMultipartUpload;
 }
 
 namespace DB
@@ -125,21 +107,11 @@ std::string S3ObjectStorage::generateBlobNameForPath(const std::string & /* path
         getRandomASCIIString(key_name_total_size - key_name_prefix_size));
 }
 
-size_t S3ObjectStorage::getObjectSize(const std::string & bucket_from, const std::string & key) const
-{
-    return S3::getObjectSize(*client.get(), bucket_from, key, {}, /* for_disk_s3= */ true);
-}
-
 bool S3ObjectStorage::exists(const StoredObject & object) const
 {
     return S3::objectExists(*client.get(), bucket, object.absolute_path, {}, /* for_disk_s3= */ true);
 }
 
-void S3ObjectStorage::checkObjectExists(const std::string & bucket_from, const std::string & key, std::string_view description) const
-{
-    return S3::checkObjectExists(*client.get(), bucket_from, key, {}, /* for_disk_s3= */ true, description);
-}
-
 std::unique_ptr<ReadBufferFromFileBase> S3ObjectStorage::readObjects( /// NOLINT
     const StoredObjects & objects,
     const ReadSettings & read_settings,
@@ -431,7 +403,12 @@ void S3ObjectStorage::copyObjectToAnotherObjectStorage( // NOLINT
     /// Shortcut for S3
     if (auto * dest_s3 = dynamic_cast<S3ObjectStorage * >(&object_storage_to); dest_s3 != nullptr)
     {
-        copyObjectImpl(bucket, object_from.absolute_path, dest_s3->bucket, object_to.absolute_path, {}, object_to_attributes);
+        auto client_ptr = client.get();
+        auto size = S3::getObjectSize(*client_ptr, bucket, object_from.absolute_path, {}, /* for_disk_s3= */ true);
+        auto settings_ptr = s3_settings.get();
+        auto scheduler = threadPoolCallbackRunner<void>(getThreadPoolWriter(), "S3ObjStor_copy");
+        copyS3File(client_ptr, bucket, object_from.absolute_path, 0, size, dest_s3->bucket, object_to.absolute_path,
+            settings_ptr->request_settings, object_to_attributes, scheduler, /* for_disk_s3= */ true);
     }
     else
     {
@@ -439,148 +416,15 @@ void S3ObjectStorage::copyObjectToAnotherObjectStorage( // NOLINT
     }
 }
 
-void S3ObjectStorage::copyObjectImpl(
-    const String & src_bucket,
-    const String & src_key,
-    const String & dst_bucket,
-    const String & dst_key,
-    size_t size,
-    std::optional<ObjectAttributes> metadata) const
-{
-    auto client_ptr = client.get();
-
-    ProfileEvents::increment(ProfileEvents::S3CopyObject);
-    ProfileEvents::increment(ProfileEvents::DiskS3CopyObject);
-    Aws::S3::Model::CopyObjectRequest request;
-    request.SetCopySource(src_bucket + "/" + src_key);
-    request.SetBucket(dst_bucket);
-    request.SetKey(dst_key);
-    if (metadata)
-    {
-        request.SetMetadata(*metadata);
-        request.SetMetadataDirective(Aws::S3::Model::MetadataDirective::REPLACE);
-    }
-
-    auto outcome = client_ptr->CopyObject(request);
-
-    if (!outcome.IsSuccess() && (outcome.GetError().GetExceptionName() == "EntityTooLarge"
-            || outcome.GetError().GetExceptionName() == "InvalidRequest"))
-    { // Can't come here with MinIO, MinIO allows single part upload for large objects.
-        copyObjectMultipartImpl(src_bucket, src_key, dst_bucket, dst_key, size, metadata);
-        return;
-    }
-
-    throwIfError(outcome);
-
-    auto settings_ptr = s3_settings.get();
-    if (settings_ptr->request_settings.check_objects_after_upload)
-        checkObjectExists(dst_bucket, dst_key, "Immediately after upload");
-}
-
-void S3ObjectStorage::copyObjectMultipartImpl(
-    const String & src_bucket,
-    const String & src_key,
-    const String & dst_bucket,
-    const String & dst_key,
-    size_t size,
-    std::optional<ObjectAttributes> metadata) const
-{
-    auto settings_ptr = s3_settings.get();
-    auto client_ptr = client.get();
-
-    String multipart_upload_id;
-
-    {
-        ProfileEvents::increment(ProfileEvents::S3CreateMultipartUpload);
-        ProfileEvents::increment(ProfileEvents::DiskS3CreateMultipartUpload);
-        Aws::S3::Model::CreateMultipartUploadRequest request;
-        request.SetBucket(dst_bucket);
-        request.SetKey(dst_key);
-        if (metadata)
-            request.SetMetadata(*metadata);
-
-        auto outcome = client_ptr->CreateMultipartUpload(request);
-
-        throwIfError(outcome);
-
-        multipart_upload_id = outcome.GetResult().GetUploadId();
-    }
-
-    std::vector<String> part_tags;
-
-    size_t upload_part_size = settings_ptr->request_settings.getUploadSettings().min_upload_part_size;
-    for (size_t position = 0, part_number = 1; position < size; ++part_number, position += upload_part_size)
-    {
-        ProfileEvents::increment(ProfileEvents::S3UploadPartCopy);
-        ProfileEvents::increment(ProfileEvents::DiskS3UploadPartCopy);
-        Aws::S3::Model::UploadPartCopyRequest part_request;
-        part_request.SetCopySource(src_bucket + "/" + src_key);
-        part_request.SetBucket(dst_bucket);
-        part_request.SetKey(dst_key);
-        part_request.SetUploadId(multipart_upload_id);
-        part_request.SetPartNumber(static_cast<int>(part_number));
-        part_request.SetCopySourceRange(fmt::format("bytes={}-{}", position, std::min(size, position + upload_part_size) - 1));
-
-        auto outcome = client_ptr->UploadPartCopy(part_request);
-        if (!outcome.IsSuccess())
-        {
-            ProfileEvents::increment(ProfileEvents::S3AbortMultipartUpload);
-            ProfileEvents::increment(ProfileEvents::DiskS3AbortMultipartUpload);
-            Aws::S3::Model::AbortMultipartUploadRequest abort_request;
-            abort_request.SetBucket(dst_bucket);
-            abort_request.SetKey(dst_key);
-            abort_request.SetUploadId(multipart_upload_id);
-            client_ptr->AbortMultipartUpload(abort_request);
-            // In error case we throw exception later with first error from UploadPartCopy
-        }
-        throwIfError(outcome);
-
-        auto etag = outcome.GetResult().GetCopyPartResult().GetETag();
-        part_tags.push_back(etag);
-    }
-
-    {
-        ProfileEvents::increment(ProfileEvents::S3CompleteMultipartUpload);
-        ProfileEvents::increment(ProfileEvents::DiskS3CompleteMultipartUpload);
-        Aws::S3::Model::CompleteMultipartUploadRequest req;
-        req.SetBucket(dst_bucket);
-        req.SetKey(dst_key);
-        req.SetUploadId(multipart_upload_id);
-
-        Aws::S3::Model::CompletedMultipartUpload multipart_upload;
-        for (size_t i = 0; i < part_tags.size(); ++i)
-        {
-            Aws::S3::Model::CompletedPart part;
-            multipart_upload.AddParts(part.WithETag(part_tags[i]).WithPartNumber(static_cast<int>(i) + 1));
-        }
-
-        req.SetMultipartUpload(multipart_upload);
-
-        auto outcome = client_ptr->CompleteMultipartUpload(req);
-
-        throwIfError(outcome);
-    }
-
-    if (settings_ptr->request_settings.check_objects_after_upload)
-        checkObjectExists(dst_bucket, dst_key, "Immediately after upload");
-}
-
 void S3ObjectStorage::copyObject( // NOLINT
     const StoredObject & object_from, const StoredObject & object_to, std::optional<ObjectAttributes> object_to_attributes)
 {
-    auto size = getObjectSize(bucket, object_from.absolute_path);
-    static constexpr int64_t multipart_upload_threashold = 5UL * 1024 * 1024 * 1024;
-
-    if (size >= multipart_upload_threashold)
-    {
-        copyObjectMultipartImpl(
-            bucket, object_from.absolute_path, bucket, object_to.absolute_path, size, object_to_attributes);
-    }
-    else
-    {
-        copyObjectImpl(
-            bucket, object_from.absolute_path, bucket, object_to.absolute_path, size, object_to_attributes);
-    }
+    auto client_ptr = client.get();
+    auto size = S3::getObjectSize(*client_ptr, bucket, object_from.absolute_path, {}, /* for_disk_s3= */ true);
+    auto settings_ptr = s3_settings.get();
+    auto scheduler = threadPoolCallbackRunner<void>(getThreadPoolWriter(), "S3ObjStor_copy");
+    copyS3File(client_ptr, bucket, object_from.absolute_path, 0, size, bucket, object_to.absolute_path,
+        settings_ptr->request_settings, object_to_attributes, scheduler, /* for_disk_s3= */ true);
 }
 
 void S3ObjectStorage::setNewSettings(std::unique_ptr<S3ObjectStorageSettings> && s3_settings_)
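Illustrative note (not part of the commit): the removed copyObjectMultipartImpl split a server-side copy into UploadPartCopy requests, each with an inclusive range of the form bytes=position-(min(size, position + part_size) - 1). A small sketch of just that arithmetic, independent of the AWS SDK:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

/// Inclusive byte ranges "bytes=first-last", one per part, covering [0, size),
/// mirroring the range computation in the removed multipart copy loop.
std::vector<std::string> makeCopyRanges(size_t size, size_t part_size)
{
    std::vector<std::string> ranges;
    for (size_t position = 0; position < size; position += part_size)
    {
        size_t last = std::min(size, position + part_size) - 1;
        ranges.push_back("bytes=" + std::to_string(position) + "-" + std::to_string(last));
    }
    return ranges;
}

int main()
{
    for (const auto & range : makeCopyRanges(/* size = */ 2500, /* part_size = */ 1000))
        std::puts(range.c_str());   /// bytes=0-999, bytes=1000-1999, bytes=2000-2499
    return 0;
}

In the new code this bookkeeping, the abort-on-error handling, and the ETag collection are delegated to copyS3File, so the disk layer no longer carries its own multipart state machine.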
@@ -8,8 +8,6 @@
 #include <Disks/ObjectStorages/S3/S3Capabilities.h>
 #include <memory>
 #include <aws/s3/S3Client.h>
-#include <aws/s3/model/HeadObjectResult.h>
-#include <aws/s3/model/ListObjectsV2Result.h>
 #include <Storages/StorageS3Settings.h>
 #include <Common/MultiVersion.h>
 #include <Common/logger_useful.h>
@@ -167,28 +165,9 @@ private:
 
     void setNewClient(std::unique_ptr<Aws::S3::S3Client> && client_);
 
-    void copyObjectImpl(
-        const String & src_bucket,
-        const String & src_key,
-        const String & dst_bucket,
-        const String & dst_key,
-        size_t size,
-        std::optional<ObjectAttributes> metadata = std::nullopt) const;
-
-    void copyObjectMultipartImpl(
-        const String & src_bucket,
-        const String & src_key,
-        const String & dst_bucket,
-        const String & dst_key,
-        size_t size,
-        std::optional<ObjectAttributes> metadata = std::nullopt) const;
-
     void removeObjectImpl(const StoredObject & object, bool if_exists);
     void removeObjectsImpl(const StoredObjects & objects, bool if_exists);
 
-    size_t getObjectSize(const std::string & bucket_from, const std::string & key) const;
-    void checkObjectExists(const std::string & bucket_from, const std::string & key, std::string_view description) const;
-
     std::string bucket;
 
     MultiVersion<Aws::S3::S3Client> client;
@@ -233,7 +233,10 @@ String readByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escapin
             readCSVField(result, buf, format_settings.csv);
             break;
         case FormatSettings::EscapingRule::Escaped:
-            readEscapedString(result, buf);
+            if constexpr (read_string)
+                readEscapedString(result, buf);
+            else
+                readTSVField(result, buf);
             break;
         default:
             throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot read value with {} escaping rule", escapingRuleToString(escaping_rule));
@@ -251,6 +254,21 @@ String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule e
     return readByEscapingRule<true>(buf, escaping_rule, format_settings);
 }
 
+String readStringOrFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings)
+{
+    /// For Quoted escaping rule we can read value as string only if it starts with `'`.
+    /// If there is no `'` it can be any other field number/array/etc.
+    if (escaping_rule == FormatSettings::EscapingRule::Quoted && !buf.eof() && *buf.position() != '\'')
+        return readFieldByEscapingRule(buf, escaping_rule, format_settings);
+
+    /// For JSON it's the same as for Quoted, but we check `"`.
+    if (escaping_rule == FormatSettings::EscapingRule::JSON && !buf.eof() && *buf.position() != '"')
+        return readFieldByEscapingRule(buf, escaping_rule, format_settings);
+
+    /// For other escaping rules we can read any field as string value.
+    return readStringByEscapingRule(buf, escaping_rule, format_settings);
+}
+
 DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSettings & format_settings, FormatSettings::EscapingRule escaping_rule, JSONInferenceInfo * json_info)
 {
     switch (escaping_rule)
@@ -292,13 +310,14 @@ DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSet
             return type;
         }
 
-        /// Case when CSV value is not in quotes. Check if it's a number, and if not, determine it's as a string.
-        auto type = tryInferNumberFromString(field, format_settings);
+        /// Case when CSV value is not in quotes. Check if it's a number or date/datetime, and if not, determine it as a string.
+        if (auto number_type = tryInferNumberFromString(field, format_settings))
+            return number_type;
 
-        if (!type)
-            return std::make_shared<DataTypeString>();
+        if (auto date_type = tryInferDateOrDateTimeFromString(field, format_settings))
+            return date_type;
 
-        return type;
+        return std::make_shared<DataTypeString>();
     }
     case FormatSettings::EscapingRule::Raw: [[fallthrough]];
     case FormatSettings::EscapingRule::Escaped:
@@ -34,8 +34,14 @@ void serializeFieldByEscapingRule(
 
 void writeStringByEscapingRule(const String & value, WriteBuffer & out, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings);
 
+/// Read String serialized in specified escaping rule.
 String readStringByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings);
+/// Read any field serialized in specified escaping rule. It can be any field like number/array/etc.
+/// This function should return value exactly as it was in the data without changes
+/// (for example without parsing escaped sequences)
 String readFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings);
+/// In case if we don't know if we have String value or not, but need to read String values as String (with correct escaped sequences parsing).
+String readStringOrFieldByEscapingRule(ReadBuffer & buf, FormatSettings::EscapingRule escaping_rule, const FormatSettings & format_settings);
 
 /// Try to determine the type of the field written by a specific escaping rule.
 /// If cannot, return nullptr.
@@ -68,6 +68,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
     format_settings.csv.arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv;
     format_settings.csv.use_best_effort_in_schema_inference = settings.input_format_csv_use_best_effort_in_schema_inference;
     format_settings.csv.skip_first_lines = settings.input_format_csv_skip_first_lines;
+    format_settings.csv.try_detect_header = settings.input_format_csv_detect_header;
     format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
     format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
     format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;
@@ -78,6 +79,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
     format_settings.custom.row_after_delimiter = settings.format_custom_row_after_delimiter;
     format_settings.custom.row_before_delimiter = settings.format_custom_row_before_delimiter;
     format_settings.custom.row_between_delimiter = settings.format_custom_row_between_delimiter;
+    format_settings.custom.try_detect_header = settings.input_format_custom_detect_header;
     format_settings.date_time_input_format = settings.date_time_input_format;
     format_settings.date_time_output_format = settings.date_time_output_format;
     format_settings.input_format_ipv4_default_on_conversion_error = settings.input_format_ipv4_default_on_conversion_error;
@@ -138,6 +140,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
     format_settings.tsv.null_representation = settings.format_tsv_null_representation;
     format_settings.tsv.use_best_effort_in_schema_inference = settings.input_format_tsv_use_best_effort_in_schema_inference;
     format_settings.tsv.skip_first_lines = settings.input_format_tsv_skip_first_lines;
+    format_settings.tsv.try_detect_header = settings.input_format_tsv_detect_header;
     format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals;
     format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions;
     format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions;
@@ -124,6 +124,7 @@ struct FormatSettings
         bool use_best_effort_in_schema_inference = true;
         UInt64 skip_first_lines = 0;
         String custom_delimiter;
+        bool try_detect_header = true;
     } csv;
 
     struct HiveText
@@ -143,6 +144,7 @@ struct FormatSettings
         std::string row_between_delimiter;
        std::string field_delimiter;
         EscapingRule escaping_rule = EscapingRule::Escaped;
+        bool try_detect_header = true;
     } custom;
 
     struct
@@ -251,6 +253,7 @@ struct FormatSettings
         bool enum_as_number = false;
         bool use_best_effort_in_schema_inference = true;
         UInt64 skip_first_lines = 0;
+        bool try_detect_header = true;
     } tsv;
 
     struct
Some files were not shown because too many files have changed in this diff.