Merge remote-tracking branch 'origin/master' into clickhouse-help

This commit is contained in:
Yatsishin Ilya 2023-02-07 15:46:59 +00:00
commit 1baa15d603
192 changed files with 9781 additions and 943 deletions

View File

@ -512,6 +512,75 @@ jobs:
docker ps --quiet | xargs --no-run-if-empty docker kill ||:
docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
sudo rm -fr "$TEMP_PATH"
############################################################################################
#################################### INSTALL PACKAGES ######################################
############################################################################################
InstallPackagesTestRelease:
needs: [BuilderDebRelease]
runs-on: [self-hosted, style-checker]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/test_install
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Install packages (amd64)
REPO_COPY=${{runner.temp}}/test_install/ClickHouse
EOF
- name: Download json reports
uses: actions/download-artifact@v3
with:
path: ${{ env.REPORTS_PATH }}
- name: Check out repository code
uses: ClickHouse/checkout@v1
with:
clear-repository: true
- name: Test packages installation
run: |
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
cd "$REPO_COPY/tests/ci"
python3 install_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |
docker ps --quiet | xargs --no-run-if-empty docker kill ||:
docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
sudo rm -fr "$TEMP_PATH"
InstallPackagesTestAarch64:
needs: [BuilderDebRelease]
runs-on: [self-hosted, style-checker-aarch64]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/test_install
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Install packages (arm64)
REPO_COPY=${{runner.temp}}/test_install/ClickHouse
EOF
- name: Download json reports
uses: actions/download-artifact@v3
with:
path: ${{ env.REPORTS_PATH }}
- name: Check out repository code
uses: ClickHouse/checkout@v1
with:
clear-repository: true
- name: Test packages installation
run: |
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
cd "$REPO_COPY/tests/ci"
python3 install_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |
docker ps --quiet | xargs --no-run-if-empty docker kill ||:
docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
sudo rm -fr "$TEMP_PATH"
##############################################################################################
########################### FUNCTIONAL STATELESS TESTS #######################################
##############################################################################################

View File

@ -946,6 +946,75 @@ jobs:
run: |
cd "$GITHUB_WORKSPACE/tests/ci"
python3 mark_release_ready.py
############################################################################################
#################################### INSTALL PACKAGES ######################################
############################################################################################
InstallPackagesTestRelease:
needs: [BuilderDebRelease]
runs-on: [self-hosted, style-checker]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/test_install
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Install packages (amd64)
REPO_COPY=${{runner.temp}}/test_install/ClickHouse
EOF
- name: Download json reports
uses: actions/download-artifact@v3
with:
path: ${{ env.REPORTS_PATH }}
- name: Check out repository code
uses: ClickHouse/checkout@v1
with:
clear-repository: true
- name: Test packages installation
run: |
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
cd "$REPO_COPY/tests/ci"
python3 install_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |
docker ps --quiet | xargs --no-run-if-empty docker kill ||:
docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
sudo rm -fr "$TEMP_PATH"
InstallPackagesTestAarch64:
needs: [BuilderDebRelease]
runs-on: [self-hosted, style-checker-aarch64]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/test_install
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Install packages (arm64)
REPO_COPY=${{runner.temp}}/test_install/ClickHouse
EOF
- name: Download json reports
uses: actions/download-artifact@v3
with:
path: ${{ env.REPORTS_PATH }}
- name: Check out repository code
uses: ClickHouse/checkout@v1
with:
clear-repository: true
- name: Test packages installation
run: |
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
cd "$REPO_COPY/tests/ci"
python3 install_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |
docker ps --quiet | xargs --no-run-if-empty docker kill ||:
docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
sudo rm -fr "$TEMP_PATH"
##############################################################################################
########################### FUNCTIONAL STATELESS TESTS #######################################
##############################################################################################

View File

@ -984,6 +984,75 @@ jobs:
docker ps --quiet | xargs --no-run-if-empty docker kill ||:
docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
sudo rm -fr "$TEMP_PATH"
############################################################################################
#################################### INSTALL PACKAGES ######################################
############################################################################################
InstallPackagesTestRelease:
needs: [BuilderDebRelease]
runs-on: [self-hosted, style-checker]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/test_install
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Install packages (amd64)
REPO_COPY=${{runner.temp}}/test_install/ClickHouse
EOF
- name: Download json reports
uses: actions/download-artifact@v3
with:
path: ${{ env.REPORTS_PATH }}
- name: Check out repository code
uses: ClickHouse/checkout@v1
with:
clear-repository: true
- name: Test packages installation
run: |
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
cd "$REPO_COPY/tests/ci"
python3 install_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |
docker ps --quiet | xargs --no-run-if-empty docker kill ||:
docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
sudo rm -fr "$TEMP_PATH"
InstallPackagesTestAarch64:
needs: [BuilderDebRelease]
runs-on: [self-hosted, style-checker-aarch64]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/test_install
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Install packages (arm64)
REPO_COPY=${{runner.temp}}/test_install/ClickHouse
EOF
- name: Download json reports
uses: actions/download-artifact@v3
with:
path: ${{ env.REPORTS_PATH }}
- name: Check out repository code
uses: ClickHouse/checkout@v1
with:
clear-repository: true
- name: Test packages installation
run: |
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
cd "$REPO_COPY/tests/ci"
python3 install_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |
docker ps --quiet | xargs --no-run-if-empty docker kill ||:
docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
sudo rm -fr "$TEMP_PATH"
##############################################################################################
########################### FUNCTIONAL STATELESS TESTS #######################################
##############################################################################################

View File

@ -604,6 +604,75 @@ jobs:
run: |
cd "$GITHUB_WORKSPACE/tests/ci"
python3 mark_release_ready.py
############################################################################################
#################################### INSTALL PACKAGES ######################################
############################################################################################
InstallPackagesTestRelease:
needs: [BuilderDebRelease]
runs-on: [self-hosted, style-checker]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/test_install
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Install packages (amd64)
REPO_COPY=${{runner.temp}}/test_install/ClickHouse
EOF
- name: Download json reports
uses: actions/download-artifact@v3
with:
path: ${{ env.REPORTS_PATH }}
- name: Check out repository code
uses: ClickHouse/checkout@v1
with:
clear-repository: true
- name: Test packages installation
run: |
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
cd "$REPO_COPY/tests/ci"
python3 install_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |
docker ps --quiet | xargs --no-run-if-empty docker kill ||:
docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
sudo rm -fr "$TEMP_PATH"
InstallPackagesTestAarch64:
needs: [BuilderDebRelease]
runs-on: [self-hosted, style-checker-aarch64]
steps:
- name: Set envs
run: |
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/test_install
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=Install packages (arm64)
REPO_COPY=${{runner.temp}}/test_install/ClickHouse
EOF
- name: Download json reports
uses: actions/download-artifact@v3
with:
path: ${{ env.REPORTS_PATH }}
- name: Check out repository code
uses: ClickHouse/checkout@v1
with:
clear-repository: true
- name: Test packages installation
run: |
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
cd "$REPO_COPY/tests/ci"
python3 install_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |
docker ps --quiet | xargs --no-run-if-empty docker kill ||:
docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
sudo rm -fr "$TEMP_PATH"
##############################################################################################
########################### FUNCTIONAL STATELESS TESTS #######################################
##############################################################################################

2
contrib/azure vendored

@ -1 +1 @@
Subproject commit e4fcdfc81e337e589ce231a452dcc280fcbb3f99
Subproject commit 096049bf24fffafcaccc132b9367694532716731

View File

@ -21,5 +21,3 @@ RUN yarn config set registry https://registry.npmjs.org \
COPY run.sh /run.sh
ENTRYPOINT ["/run.sh"]
CMD ["yarn", "build"]

View File

@ -25,7 +25,8 @@ done
sed -i '/onBrokenMarkdownLinks:/ s/ignore/error/g' docusaurus.config.js
if [[ $# -lt 1 ]] || [[ "$1" == "--"* ]]; then
export CI=true
yarn install
exec yarn build "$@"
fi

View File

@ -134,6 +134,14 @@
"name": "clickhouse/keeper-jepsen-test",
"dependent": []
},
"docker/test/install/deb": {
"name": "clickhouse/install-deb-test",
"dependent": []
},
"docker/test/install/rpm": {
"name": "clickhouse/install-rpm-test",
"dependent": []
},
"docker/docs/builder": {
"name": "clickhouse/docs-builder",
"dependent": [

View File

@ -231,6 +231,7 @@ function run_tests
--hung-check
--fast-tests-only
--no-random-settings
--no-random-merge-tree-settings
--no-long
--testname
--shard

View File

@ -0,0 +1,64 @@
FROM ubuntu:22.04
# The Dockerfile is nicely borrowed from
# https://github.com/lionelnicolas/docker-ubuntu-systemd/blob/83aa3249146f5df264fe45353f79fc76eb1e42d7/Dockerfile
ENV \
DEBIAN_FRONTEND=noninteractive \
LANG=C.UTF-8 \
container=docker \
init=/lib/systemd/systemd
# install systemd packages
RUN apt-get update && \
apt-get install -y --no-install-recommends \
systemd \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists
# configure systemd
# remove systemd 'wants' triggers
# remove everything except tmpfiles setup in sysinit target
# remove UTMP updater service
# disable /tmp mount
# fix missing BPF firewall support warning
# just for cosmetics, fix "not-found" entries while using "systemctl --all"
RUN \
find \
/etc/systemd/system/*.wants/* \
/lib/systemd/system/multi-user.target.wants/* \
/lib/systemd/system/sockets.target.wants/*initctl* \
! -type d \
-delete && \
find \
/lib/systemd/system/sysinit.target.wants \
! -type d \
! -name '*systemd-tmpfiles-setup*' \
-delete && \
find \
/lib/systemd \
-name systemd-update-utmp-runlevel.service \
-delete && \
rm -vf /usr/share/systemd/tmp.mount && \
sed -ri '/^IPAddressDeny/d' /lib/systemd/system/systemd-journald.service && \
for MATCH in \
plymouth-start.service \
plymouth-quit-wait.service \
syslog.socket \
syslog.service \
display-manager.service \
systemd-sysusers.service \
tmp.mount \
systemd-udevd.service \
; do \
grep -rn --binary-files=without-match ${MATCH} /lib/systemd/ | cut -d: -f1 | xargs sed -ri 's/(.*=.*)'${MATCH}'(.*)/\1\2/'; \
done && \
systemctl disable ondemand.service && \
systemctl set-default multi-user.target
VOLUME ["/run", "/run/lock"]
STOPSIGNAL SIGRTMIN+3
ENTRYPOINT ["/lib/systemd/systemd"]

View File

@ -0,0 +1,55 @@
FROM centos:8
# The Dockerfile is nicely borrowed from
# https://github.com/lionelnicolas/docker-ubuntu-systemd/blob/83aa3249146f5df264fe45353f79fc76eb1e42d7/Dockerfile
ENV \
LANG=C.UTF-8 \
container=docker \
init=/lib/systemd/systemd
# configure systemd
# remove systemd 'wants' triggers
# remove everything except tmpfiles setup in sysinit target
# remove UTMP updater service
# disable /tmp mount
# fix missing BPF firewall support warning
# just for cosmetics, fix "not-found" entries while using "systemctl --all"
RUN \
find \
/etc/systemd/system/*.wants/ \
/lib/systemd/system/multi-user.target.wants/ \
/lib/systemd/system/local-fs.target.wants/ \
/lib/systemd/system/sockets.target.wants/*initctl* \
! -type d \
-delete && \
find \
/lib/systemd/system/sysinit.target.wants \
! -type d \
! -name '*systemd-tmpfiles-setup*' \
-delete && \
find \
/lib/systemd \
-name systemd-update-utmp-runlevel.service \
-delete && \
rm -vf /usr/share/systemd/tmp.mount && \
sed -ri '/^IPAddressDeny/d' /lib/systemd/system/systemd-journald.service && \
for MATCH in \
plymouth-start.service \
plymouth-quit-wait.service \
syslog.socket \
syslog.service \
display-manager.service \
systemd-sysusers.service \
tmp.mount \
systemd-udevd.service \
; do \
grep -rn --binary-files=without-match ${MATCH} /lib/systemd/ | cut -d: -f1 | xargs sed -ri 's/(.*=.*)'${MATCH}'(.*)/\1\2/'; \
done && \
systemctl set-default multi-user.target
VOLUME ["/run", "/run/lock"]
STOPSIGNAL SIGRTMIN+3
ENTRYPOINT ["/lib/systemd/systemd"]

View File

@ -0,0 +1,226 @@
---
slug: /en/engines/table-engines/special/executable
sidebar_position: 40
sidebar_label: Executable
---
# Executable and ExecutablePool Table Engines
The `Executable` and `ExecutablePool` table engines allow you to define a table whose rows are generated from a script that you define (by writing rows to **stdout**). The executable script is stored in the `user_scripts` directory and can read data from any source.
- `Executable` tables: the script is run on every query
- `ExecutablePool` tables: a pool of persistent processes is maintained, and processes are taken from the pool for reads
You can optionally include one or more input queries that stream their results to **stdin** for the script to read.
## Creating an Executable Table
The `Executable` table engine requires two parameters: the name of the script and the format of the incoming data. You can optionally pass in one or more input queries:
```sql
Executable(script_name, format, [input_query...])
```
Here are the relevant settings for an `Executable` table:
- `send_chunk_header`
- Description: Send the number of rows in each chunk before sending the chunk to the process. This setting can help you write your script more efficiently by preallocating some resources
- Default value: false
- `command_termination_timeout`
- Description: Command termination timeout in seconds
- Default value: 10
- `command_read_timeout`
- Description: Timeout for reading data from command stdout in milliseconds
- Default value: 10000
- `command_write_timeout`
- Description: Timeout for writing data to command stdin in milliseconds
- Default value: 10000
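These settings are applied with a `SETTINGS` clause on the `CREATE TABLE` statement, the same way `pool_size` is set in the `ExecutablePool` example further down this page. A minimal sketch, reusing the table and script from the example that follows (the specific values are only illustrative):
```sql
CREATE TABLE my_executable_table (
    x UInt32,
    y String
)
ENGINE = Executable('my_script.py', TabSeparated, (SELECT 10))
SETTINGS send_chunk_header = 1, command_read_timeout = 20000;
```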
Let's look at an example. The following Python script is named `my_script.py` and is saved in the `user_scripts` folder. It reads in a number `i` and prints `i` random strings, with each string preceded by a number that is separated by a tab:
```python
#!/usr/bin/python3
import sys
import string
import random
def main():
# Read input value
for number in sys.stdin:
i = int(number)
# Generate some random rows
for id in range(0, i):
letters = string.ascii_letters
random_string = ''.join(random.choices(letters, k=10))
print(str(id) + '\t' + random_string + '\n', end='')
# Flush results to stdout
sys.stdout.flush()
if __name__ == "__main__":
main()
```
The following `my_executable_table` is built from the output of `my_script.py`, which will generate 10 random strings every time you run a `SELECT` from `my_executable_table`:
```sql
CREATE TABLE my_executable_table (
x UInt32,
y String
)
ENGINE = Executable('my_script.py', TabSeparated, (SELECT 10))
```
Creating the table returns immediately and does not invoke the script. Querying `my_executable_table` causes the script to be invoked:
```sql
SELECT * FROM my_executable_table
```
```response
┌─x─┬─y──────────┐
│ 0 │ BsnKBsNGNH │
│ 1 │ mgHfBCUrWM │
│ 2 │ iDQAVhlygr │
│ 3 │ uNGwDuXyCk │
│ 4 │ GcFdQWvoLB │
│ 5 │ UkciuuOTVO │
│ 6 │ HoKeCdHkbs │
│ 7 │ xRvySxqAcR │
│ 8 │ LKbXPHpyDI │
│ 9 │ zxogHTzEVV │
└───┴────────────┘
```
## Passing Query Results to a Script
Users of the Hacker News website leave comments. The Python natural language processing toolkit (`nltk`) provides a `SentimentIntensityAnalyzer` for determining whether comments are positive, negative, or neutral, assigning a score between -1 (a very negative comment) and 1 (a very positive comment). Let's create an `Executable` table that computes the sentiment of Hacker News comments using `nltk`.
This example uses the `hackernews` table described [here](https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/invertedindexes/#full-text-search-of-the-hacker-news-dataset). The `hackernews` table includes an `id` column of type `UInt64` and a `String` column named `comment`. Let's start by defining the `Executable` table:
```sql
CREATE TABLE sentiment (
id UInt64,
sentiment Float32
)
ENGINE = Executable(
'sentiment.py',
TabSeparated,
(SELECT id, comment FROM hackernews WHERE id > 0 AND comment != '' LIMIT 20)
);
```
Some comments about the `sentiment` table:
- The file `sentiment.py` is saved in the `user_scripts` folder (the default folder of the `user_scripts_path` setting)
- The `TabSeparated` format means our Python script needs to generate rows of raw data that contain tab-separated values
- The query selects two columns from `hackernews`. The Python script will need to parse out those column values from the incoming rows
Here is the definition of `sentiment.py`:
```python
#!/usr/local/bin/python3.9
import sys
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
def main():
sentiment_analyzer = SentimentIntensityAnalyzer()
while True:
try:
row = sys.stdin.readline()
if row == '':
break
split_line = row.split("\t")
id = str(split_line[0])
comment = split_line[1]
score = sentiment_analyzer.polarity_scores(comment)['compound']
print(id + '\t' + str(score) + '\n', end='')
sys.stdout.flush()
except BaseException as x:
break
if __name__ == "__main__":
main()
```
Some comments about our Python script:
- For this to work, you will need to run `nltk.downloader.download('vader_lexicon')`. This could have been placed in the script, but then it would have been downloaded every time a query was executed on the `sentiment` table - which is not efficient
- Each value of `row` is going to be a row in the result set of `SELECT id, comment FROM hackernews WHERE id > 0 AND comment != '' LIMIT 20`
- The incoming row is tab-separated, so we parse out the `id` and `comment` using the Python `split` function
- The result of `polarity_scores` is a JSON object with a handful of values. We decided to just grab the `compound` value of this JSON object
- Recall that the `sentiment` table in ClickHouse uses the `TabSeparated` format and contains two columns, so our `print` function separates those columns with a tab
Every time you write a query that selects rows from the `sentiment` table, the `SELECT id, comment FROM hackernews WHERE id > 0 AND comment != '' LIMIT 20` query is executed and the result is passed to `sentiment.py`. Let's test it out:
```sql
SELECT *
FROM sentiment
```
The response looks like:
```response
┌───────id─┬─sentiment─┐
│ 7398199 │ 0.4404 │
│ 21640317 │ 0.1779 │
│ 21462000 │ 0 │
│ 25168863 │ 0 │
│ 25168978 │ -0.1531 │
│ 25169359 │ 0 │
│ 25169394 │ -0.9231 │
│ 25169766 │ 0.4137 │
│ 25172570 │ 0.7469 │
│ 25173687 │ 0.6249 │
│ 28291534 │ 0 │
│ 28291669 │ -0.4767 │
│ 28291731 │ 0 │
│ 28291949 │ -0.4767 │
│ 28292004 │ 0.3612 │
│ 28292050 │ -0.296 │
│ 28292322 │ 0 │
│ 28295172 │ 0.7717 │
│ 28295288 │ 0.4404 │
│ 21465723 │ -0.6956 │
└──────────┴───────────┘
```
## Creating an ExecutablePool Table
The syntax for `ExecutablePool` is similar to `Executable`, but there are a couple of relevant settings unique to an `ExecutablePool` table:
- `pool_size`
- Description: Size of the process pool. If the size is 0, then there are no size restrictions
- Default value: 16
- `max_command_execution_time`
- Description: Max command execution time in seconds
- Default value: 10
We can easily convert the `sentiment` table above to use `ExecutablePool` instead of `Executable`:
```sql
CREATE TABLE sentiment_pooled (
id UInt64,
sentiment Float32
)
ENGINE = ExecutablePool(
'sentiment.py',
TabSeparated,
(SELECT id, comment FROM hackernews WHERE id > 0 AND comment != '' LIMIT 20000)
)
SETTINGS
pool_size = 4;
```
ClickHouse maintains 4 processes on demand when your client queries the `sentiment_pooled` table.
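Querying the pooled table looks the same as querying the `Executable` version; the difference is that repeated queries reuse the already-running script processes instead of launching the script again for every query. A sketch of a query against the table defined above:
```sql
SELECT *
FROM sentiment_pooled
LIMIT 10;
```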

View File

@ -1279,7 +1279,9 @@ The following settings are available:
- `size`: The maximum cache size in bytes. 0 means the query cache is disabled. Default value: `1073741824` (1 GiB).
- `max_entries`: The maximum number of `SELECT` query results stored in the cache. Default value: `1024`.
- `max_entry_size`: The maximum size in bytes `SELECT` query results may have to be saved in the cache. Default value: `1048576` (1 MiB).
- `max_entry_records`: The maximum number of records `SELECT` query results may have to be saved in the cache. Default value: `30000000` (30 mil).
- `max_entry_rows`: The maximum number of rows `SELECT` query results may have to be saved in the cache. Default value: `30000000` (30 mil).
Changed settings take effect immediately.
:::warning
Data for the query cache is allocated in DRAM. If memory is scarce, make sure to set a small value for `size` or disable the query cache altogether.
@ -1292,7 +1294,7 @@ Data for the query cache is allocated in DRAM. If memory is scarce, make sure to
<size>1073741824</size>
<max_entries>1024</max_entries>
<max_entry_size>1048576</max_entry_size>
<max_entry_records>30000000</max_entry_records>
<max_entry_rows>30000000</max_entry_rows>
</query_cache>
```

View File

@ -1,5 +1,5 @@
---
slug: /en/sql-reference/aggregate-functions/reference/sparkbar
sidebar_position: 311
sidebar_label: sparkbar
---
@ -7,9 +7,11 @@ sidebar_label: sparkbar
# sparkbar
The function plots a frequency histogram for values `x` and the repetition rate `y` of these values over the interval `[min_x, max_x]`.
Repetitions for all `x` falling into the same bucket are averaged, so data should be pre-aggregated.
Negative repetitions are ignored.
If no interval is specified, then the minimum `x` is used as the interval start, and the maximum `x` as the interval end.
Otherwise, values outside the interval are ignored.
**Syntax**
@ -37,29 +39,24 @@ sparkbar(width[, min_x, max_x])(x, y)
Query:
``` sql
CREATE TABLE spark_bar_data (`cnt` UInt64,`event_date` Date) ENGINE = MergeTree ORDER BY event_date SETTINGS index_granularity = 8192;
INSERT INTO spark_bar_data VALUES(1,'2020-01-01'),(4,'2020-01-02'),(5,'2020-01-03'),(2,'2020-01-04'),(3,'2020-01-05'),(7,'2020-01-06'),(6,'2020-01-07'),(8,'2020-01-08'),(2,'2020-01-11');
CREATE TABLE spark_bar_data (`value` Int64, `event_date` Date) ENGINE = MergeTree ORDER BY event_date;
SELECT sparkbar(9)(event_date,cnt) FROM spark_bar_data;
INSERT INTO spark_bar_data VALUES (1,'2020-01-01'), (3,'2020-01-02'), (4,'2020-01-02'), (-3,'2020-01-02'), (5,'2020-01-03'), (2,'2020-01-04'), (3,'2020-01-05'), (7,'2020-01-06'), (6,'2020-01-07'), (8,'2020-01-08'), (2,'2020-01-11');
SELECT sparkbar(9,toDate('2020-01-01'),toDate('2020-01-10'))(event_date,cnt) FROM spark_bar_data;
SELECT sparkbar(9)(event_date,cnt) FROM (SELECT sum(value) as cnt, event_date FROM spark_bar_data GROUP BY event_date);
SELECT sparkbar(9, toDate('2020-01-01'), toDate('2020-01-10'))(event_date,cnt) FROM (SELECT sum(value) as cnt, event_date FROM spark_bar_data GROUP BY event_date);
```
Result:
``` text
┌─sparkbar(9)(event_date, cnt)─┐
│ │
│ ▁▅▄▃██▅ ▁ │
│ │
│ ▂▅▂▃▆█ ▂ │
└──────────────────────────────┘
┌─sparkbar(9, toDate('2020-01-01'), toDate('2020-01-10'))(event_date, cnt)─┐
│ │
│▁▄▄▂▅▇█▁ │
│ │
│ ▂▅▂▃▇▆█ │
└──────────────────────────────────────────────────────────────────────────┘
```

View File

@ -0,0 +1,97 @@
---
slug: /en/engines/table-functions/executable
sidebar_position: 55
sidebar_label: executable
keywords: [udf, user defined function, clickhouse, executable, table, function]
---
# executable Table Function for UDFs
The `executable` table function creates a table based on the output of a user-defined function (UDF): a script that you write, which outputs rows to **stdout**. The executable script is stored in the `user_scripts` directory and can read data from any source.
You can optionally include one or more input queries that stream their results to **stdin** for the script to read.
:::note
A key difference between ordinary UDF functions and the `executable` table function (and the `Executable` table engine) is that ordinary UDF functions cannot change the row count. For example, if the input is 100 rows, then the result must return 100 rows. When using the `executable` table function or the `Executable` table engine, your script can make any data transformations you want, including complex aggregations.
:::
## Syntax
The `executable` table function requires three parameters and accepts an optional list of input queries:
```sql
executable(script_name, format, structure, [input_query...])
```
- `script_name`: the file name of the script, saved in the `user_scripts` folder (the default folder of the `user_scripts_path` setting)
- `format`: the format of the generated table
- `structure`: the table schema of the generated table
- `input_query`: an optional query (or collection of queries) whose results are passed to the script via **stdin**
:::note
If you are going to invoke the same script repeatedly with the same input queries, consider using the [`Executable` table engine](../../engines/table-engines/special/executable.md).
:::
The following Python script is named `generate_random.py` and is saved in the `user_scripts` folder. It reads in a number `i` and prints `i` random strings, with each string preceded by a number that is separated by a tab:
```python
#!/usr/local/bin/python3.9
import sys
import string
import random
def main():
# Read input value
for number in sys.stdin:
i = int(number)
# Generate some random rows
for id in range(0, i):
letters = string.ascii_letters
random_string = ''.join(random.choices(letters, k=10))
print(str(id) + '\t' + random_string + '\n', end='')
# Flush results to stdout
sys.stdout.flush()
if __name__ == "__main__":
main()
```
Let's invoke the script and have it generate 10 random strings:
```sql
SELECT * FROM executable('generate_random.py', TabSeparated, 'id UInt32, random String', (SELECT 10))
```
The response looks like:
```response
┌─id─┬─random─────┐
│ 0 │ xheXXCiSkH │
│ 1 │ AqxvHAoTrl │
│ 2 │ JYvPCEbIkY │
│ 3 │ sWgnqJwGRm │
│ 4 │ fTZGrjcLon │
│ 5 │ ZQINGktPnd │
│ 6 │ YFSvGGoezb │
│ 7 │ QyMJJZOOia │
│ 8 │ NfiyDDhmcI │
│ 9 │ REJRdJpWrg │
└────┴────────────┘
```
## Passing Query Results to a Script
Be sure to check out the example in the `Executable` table engine documentation on [how to pass query results to a script](../../engines/table-engines/special/executable#passing-query-results-to-a-script). Here is how you execute the same script from that example using the `executable` table function:
```sql
SELECT * FROM executable(
'sentiment.py',
TabSeparated,
'id UInt64, sentiment Float32',
(SELECT id, comment FROM hackernews WHERE id > 0 AND comment != '' LIMIT 20)
);
```

View File

@ -1,14 +1,15 @@
---
slug: /ru/sql-reference/aggregate-functions/reference/sparkbar
sidebar_position: 311
sidebar_label: sparkbar
---
# sparkbar {#sparkbar}
The function plots a frequency histogram for the given values `x` and the repetition rate `y` of these values over the interval `[min_x, max_x]`.
The function plots a frequency histogram for the given values `x` and the repetition rate `y` of these values over the interval `[min_x, max_x]`. Repetitions for all `x` that fall into the same bucket are averaged, so the data should be pre-aggregated. Negative repetitions are ignored.
If no interval is specified, the minimum value of `x` is used as the lower bound of the interval and the maximum value of `x` as the upper bound.
Values of `x` outside the specified interval are ignored.
**Syntax**
@ -39,29 +40,23 @@ sparkbar(width[, min_x, max_x])(x, y)
Query:
``` sql
CREATE TABLE spark_bar_data (`cnt` UInt64,`event_date` Date) ENGINE = MergeTree ORDER BY event_date SETTINGS index_granularity = 8192;
INSERT INTO spark_bar_data VALUES(1,'2020-01-01'),(4,'2020-01-02'),(5,'2020-01-03'),(2,'2020-01-04'),(3,'2020-01-05'),(7,'2020-01-06'),(6,'2020-01-07'),(8,'2020-01-08'),(2,'2020-01-11');
CREATE TABLE spark_bar_data (`value` Int64, `event_date` Date) ENGINE = MergeTree ORDER BY event_date;
SELECT sparkbar(9)(event_date,cnt) FROM spark_bar_data;
INSERT INTO spark_bar_data VALUES (1,'2020-01-01'), (3,'2020-01-02'), (4,'2020-01-02'), (-3,'2020-01-02'), (5,'2020-01-03'), (2,'2020-01-04'), (3,'2020-01-05'), (7,'2020-01-06'), (6,'2020-01-07'), (8,'2020-01-08'), (2,'2020-01-11');
SELECT sparkbar(9,toDate('2020-01-01'),toDate('2020-01-10'))(event_date,cnt) FROM spark_bar_data;
SELECT sparkbar(9)(event_date,cnt) FROM (SELECT sum(value) as cnt, event_date FROM spark_bar_data GROUP BY event_date);
SELECT sparkbar(9, toDate('2020-01-01'), toDate('2020-01-10'))(event_date,cnt) FROM (SELECT sum(value) as cnt, event_date FROM spark_bar_data GROUP BY event_date);
```
Result:
``` text
┌─sparkbar(9)(event_date, cnt)─┐
│ │
│ ▁▅▄▃██▅ ▁ │
│ │
│ ▂▅▂▃▆█ ▂ │
└──────────────────────────────┘
┌─sparkbar(9, toDate('2020-01-01'), toDate('2020-01-10'))(event_date, cnt)─┐
│ │
│▁▄▄▂▅▇█▁ │
│ │
│ ▂▅▂▃▇▆█ │
└──────────────────────────────────────────────────────────────────────────┘
```

View File

@ -102,7 +102,8 @@ done
EOF
chmod +x "$PKG_PATH/install/doinst.sh"
if [ -f "$PKG_PATH/DEBIAN/postinst" ]; then
tail +2 "$PKG_PATH/DEBIAN/postinst" >> "$PKG_PATH/install/doinst.sh"
# we don't need debconf source in doinst in any case
tail +2 "$PKG_PATH/DEBIAN/postinst" | grep -v debconf/confmodule >> "$PKG_PATH/install/doinst.sh"
fi
rm -rf "$PKG_PATH/DEBIAN"
if [ -f "/usr/bin/pigz" ]; then

View File

@ -0,0 +1,46 @@
#!/bin/sh
set -e
# set -x
PROGRAM=clickhouse-keeper
KEEPER_USER=${KEEPER_USER:=clickhouse}
KEEPER_GROUP=${KEEPER_GROUP:=clickhouse}
# Please note that we don't support paths with whitespaces. This is rather ignorant.
KEEPER_CONFDIR=${KEEPER_CONFDIR:=/etc/$PROGRAM}
KEEPER_DATADIR=${KEEPER_DATADIR:=/var/lib/clickhouse}
KEEPER_LOGDIR=${KEEPER_LOGDIR:=/var/log/$PROGRAM}
[ -f /usr/share/debconf/confmodule ] && . /usr/share/debconf/confmodule
[ -f /etc/default/clickhouse-keeper ] && . /etc/default/clickhouse-keeper
if [ ! -f "/etc/debian_version" ]; then
not_deb_os=1
fi
if [ "$1" = configure ] || [ -n "$not_deb_os" ]; then
if ! getent group "${KEEPER_GROUP}" > /dev/null 2>&1 ; then
groupadd --system "${KEEPER_GROUP}"
fi
GID=$(getent group "${KEEPER_GROUP}" | cut -d: -f 3)
if ! id "${KEEPER_USER}" > /dev/null 2>&1 ; then
adduser --system --home /dev/null --no-create-home \
--gid "${GID}" --shell /bin/false \
"${KEEPER_USER}"
fi
chown -R "${KEEPER_USER}:${KEEPER_GROUP}" "${KEEPER_CONFDIR}"
chmod 0755 "${KEEPER_CONFDIR}"
if ! [ -d "${KEEPER_DATADIR}" ]; then
mkdir -p "${KEEPER_DATADIR}"
chown -R "${KEEPER_USER}:${KEEPER_GROUP}" "${KEEPER_DATADIR}"
chmod 0700 "${KEEPER_DATADIR}"
fi
if ! [ -d "${KEEPER_LOGDIR}" ]; then
mkdir -p "${KEEPER_LOGDIR}"
chown -R "${KEEPER_USER}:${KEEPER_GROUP}" "${KEEPER_LOGDIR}"
chmod 0770 "${KEEPER_LOGDIR}"
fi
fi
# vim: ts=4: sw=4: sts=4: expandtab

View File

@ -0,0 +1,27 @@
[Unit]
Description=ClickHouse Keeper - zookeeper compatible distributed coordination server
Requires=network-online.target
# NOTE: After/Wants=time-sync.target is not enough; you need to ensure
# that the time has already been adjusted. If you use systemd-timesyncd you are
# safe, but if you use ntp or some other daemon, you should configure it
# additionally.
After=time-sync.target network-online.target
Wants=time-sync.target
[Service]
Type=simple
User=clickhouse
Group=clickhouse
Restart=always
RestartSec=30
# %p is resolved to the systemd unit name
RuntimeDirectory=%p
ExecStart=/usr/bin/clickhouse-keeper --config=/etc/clickhouse-keeper/keeper_config.xml --pid-file=%t/%p/%p.pid
# Minus means that this file is optional.
EnvironmentFile=-/etc/default/%p
LimitCORE=infinity
LimitNOFILE=500000
CapabilityBoundingSet=CAP_NET_ADMIN CAP_IPC_LOCK CAP_SYS_NICE CAP_NET_BIND_SERVICE
[Install]
# ClickHouse should not start from the rescue shell (rescue.target).
WantedBy=multi-user.target

View File

@ -30,6 +30,8 @@ contents:
type: config|noreplace
- src: root/usr/bin/clickhouse-keeper
dst: /usr/bin/clickhouse-keeper
- src: clickhouse-keeper.service
dst: /lib/systemd/system/clickhouse-keeper.service
# docs
- src: ../AUTHORS
dst: /usr/share/doc/clickhouse-keeper/AUTHORS
@ -39,3 +41,6 @@ contents:
dst: /usr/share/doc/clickhouse-keeper/LICENSE
- src: ../README.md
dst: /usr/share/doc/clickhouse-keeper/README.md
scripts:
postinstall: ./clickhouse-keeper.postinstall

View File

@ -11,8 +11,6 @@ CLICKHOUSE_DATADIR=${CLICKHOUSE_DATADIR:=/var/lib/clickhouse}
CLICKHOUSE_LOGDIR=${CLICKHOUSE_LOGDIR:=/var/log/clickhouse-server}
CLICKHOUSE_BINDIR=${CLICKHOUSE_BINDIR:=/usr/bin}
CLICKHOUSE_GENERIC_PROGRAM=${CLICKHOUSE_GENERIC_PROGRAM:=clickhouse}
EXTRACT_FROM_CONFIG=${CLICKHOUSE_GENERIC_PROGRAM}-extract-from-config
CLICKHOUSE_CONFIG=$CLICKHOUSE_CONFDIR/config.xml
CLICKHOUSE_PIDDIR=/var/run/$PROGRAM
[ -f /usr/share/debconf/confmodule ] && . /usr/share/debconf/confmodule

View File

@ -17,10 +17,10 @@ User=clickhouse
Group=clickhouse
Restart=always
RestartSec=30
RuntimeDirectory=clickhouse-server
ExecStart=/usr/bin/clickhouse-server --config=/etc/clickhouse-server/config.xml --pid-file=/run/clickhouse-server/clickhouse-server.pid
# %p is resolved to the systemd unit name
RuntimeDirectory=%p
ExecStart=/usr/bin/clickhouse-server --config=/etc/clickhouse-server/config.xml --pid-file=%t/%p/%p.pid
# Minus means that this file is optional.
EnvironmentFile=-/etc/default/clickhouse
EnvironmentFile=-/etc/default/%p
LimitCORE=infinity
LimitNOFILE=500000
CapabilityBoundingSet=CAP_NET_ADMIN CAP_IPC_LOCK CAP_SYS_NICE CAP_NET_BIND_SERVICE

View File

@ -1,3 +1,4 @@
#include <boost/algorithm/string/join.hpp>
#include <cstdlib>
#include <fcntl.h>
#include <map>
@ -538,24 +539,28 @@ void Client::connect()
// Prints changed settings to stderr. Useful for debugging fuzzing failures.
void Client::printChangedSettings() const
{
const auto & changes = global_context->getSettingsRef().changes();
if (!changes.empty())
auto print_changes = [](const auto & changes, std::string_view settings_name)
{
fmt::print(stderr, "Changed settings: ");
for (size_t i = 0; i < changes.size(); ++i)
if (!changes.empty())
{
if (i)
fmt::print(stderr, "Changed {}: ", settings_name);
for (size_t i = 0; i < changes.size(); ++i)
{
fmt::print(stderr, ", ");
if (i)
fmt::print(stderr, ", ");
fmt::print(stderr, "{} = '{}'", changes[i].name, toString(changes[i].value));
}
fmt::print(stderr, "{} = '{}'", changes[i].name, toString(changes[i].value));
fmt::print(stderr, "\n");
}
fmt::print(stderr, "\n");
}
else
{
fmt::print(stderr, "No changed settings.\n");
}
else
{
fmt::print(stderr, "No changed {}.\n", settings_name);
}
};
print_changes(global_context->getSettingsRef().changes(), "settings");
print_changes(cmd_merge_tree_settings.changes(), "MergeTree settings");
}
@ -1387,6 +1392,8 @@ void Client::readArguments(
}
else if (arg == "--allow_repeated_settings")
allow_repeated_settings = true;
else if (arg == "--allow_merge_tree_settings")
allow_merge_tree_settings = true;
else
common_arguments.emplace_back(arg);
}

View File

@ -362,6 +362,7 @@ try
else
path = std::filesystem::path{KEEPER_DEFAULT_PATH};
std::filesystem::create_directories(path);
/// Check that the process user id matches the owner of the data.
const auto effective_user_id = geteuid();

View File

@ -82,9 +82,7 @@
#include <Common/ThreadFuzzer.h>
#include <Common/getHashOfLoadedBinary.h>
#include <Common/filesystemHelpers.h>
#if USE_BORINGSSL
#include <Compression/CompressionCodecEncrypted.h>
#endif
#include <Server/HTTP/HTTPServerConnectionFactory.h>
#include <Server/MySQLHandlerFactory.h>
#include <Server/PostgreSQLHandlerFactory.h>
@ -1348,9 +1346,8 @@ try
global_context->updateStorageConfiguration(*config);
global_context->updateInterserverCredentials(*config);
#if USE_BORINGSSL
global_context->updateQueryCacheConfiguration(*config);
CompressionCodecEncrypted::Configuration::instance().tryLoad(*config, "encryption_codecs");
#endif
#if USE_SSL
CertificateReloader::instance().tryLoad(*config);
#endif
@ -1534,13 +1531,7 @@ try
global_context->setMMappedFileCache(mmap_cache_size);
/// A cache for query results.
size_t query_cache_size = config().getUInt64("query_cache.size", 1_GiB);
if (query_cache_size)
global_context->setQueryCache(
query_cache_size,
config().getUInt64("query_cache.max_entries", 1024),
config().getUInt64("query_cache.max_entry_size", 1_MiB),
config().getUInt64("query_cache.max_entry_records", 30'000'000));
global_context->setQueryCache(config());
#if USE_EMBEDDED_COMPILER
/// 128 MB
@ -1564,10 +1555,8 @@ try
global_context->getMergeTreeSettings().sanityCheck(background_pool_tasks);
global_context->getReplicatedMergeTreeSettings().sanityCheck(background_pool_tasks);
}
#if USE_BORINGSSL
/// Try to set up encryption. If there are errors in the config, an error will be printed and the server won't start.
CompressionCodecEncrypted::Configuration::instance().load(config(), "encryption_codecs");
#endif
SCOPE_EXIT({
async_metrics.stop();

View File

@ -1516,7 +1516,7 @@
<!-- <size>1073741824</size> -->
<!-- <max_entries>1024</max_entries> -->
<!-- <max_entry_size>1048576</max_entry_size> -->
<!-- <max_entry_records>30000000</max_entry_records> -->
<!-- <max_entry_rows>30000000</max_entry_rows> -->
<!-- </query_cache> -->
<!-- Uncomment if enable merge tree metadata cache -->

View File

@ -10,22 +10,25 @@ mod ffi {
}
struct Item {
text: String,
text_no_newlines: String,
orig_text: String,
}
impl Item {
fn new(text: String) -> Self {
return Self{
// Text that will be printed by skim, and will be used for matching.
//
// The text that will be shown should not contain newlines, since in that case skim may
// leave some symbols on the screen, which looks odd.
text: text.replace("\n", " "),
text_no_newlines: text.replace("\n", " "),
// This will be used when the match had been selected.
orig_text: text,
};
}
}
impl SkimItem for Item {
fn text(&self) -> Cow<str> {
return Cow::Borrowed(&self.text);
return Cow::Borrowed(&self.text_no_newlines);
}
fn output(&self) -> Cow<str> {
@ -44,6 +47,24 @@ fn skim(prefix: &CxxString, words: &CxxVector<CxxString>) -> Result<String, Stri
.query(Some(prefix.to_str().unwrap()))
.tac(true)
.tiebreak(Some("-score".to_string()))
// Exact mode performs better for SQL.
//
// The default fuzzy search is too smart for SQL; it even takes case into account, which
// should not matter (you don't want to have to type "SELECT" instead of "select" to find
// the query).
//
// Exact matching seems to be a better algorithm for SQL. It is not 100% exact: it splits
// the input by spaces and applies a separate matcher for each word.
// Note that if you think space alone is not enough as the delimiter, keep in mind that it
// is the delimiter only for the input query, so to match "system.query_log" you can type
// "sy qu log".
// Exact mode should also be more familiar to users who do not know how to use fuzzy search
// (and you can disable it by prepending the "'" char).
//
// It also ignores case correctly, i.e. it does not have a penalty for case mismatch,
// unlike fuzzy algorithms (take a look at SkimScoreConfig::penalty_case_mismatch).
.exact(true)
.case(CaseMatching::Ignore)
.build()
.unwrap();

View File

@ -247,15 +247,8 @@ void Adam::merge(const IWeightsUpdater & rhs, Float64 frac, Float64 rhs_frac)
if (adam_rhs.average_gradient.empty())
return;
if (average_gradient.empty())
{
if (!average_squared_gradient.empty() ||
adam_rhs.average_gradient.size() != adam_rhs.average_squared_gradient.size())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Average_gradient and average_squared_gradient must have same size");
average_gradient.resize(adam_rhs.average_gradient.size(), Float64{0.0});
average_squared_gradient.resize(adam_rhs.average_squared_gradient.size(), Float64{0.0});
}
average_gradient.resize(adam_rhs.average_gradient.size(), Float64{0.0});
average_squared_gradient.resize(adam_rhs.average_squared_gradient.size(), Float64{0.0});
for (size_t i = 0; i < average_gradient.size(); ++i)
{
@ -268,14 +261,8 @@ void Adam::merge(const IWeightsUpdater & rhs, Float64 frac, Float64 rhs_frac)
void Adam::update(UInt64 batch_size, std::vector<Float64> & weights, Float64 & bias, Float64 learning_rate, const std::vector<Float64> & batch_gradient)
{
if (average_gradient.empty())
{
if (!average_squared_gradient.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Average_gradient and average_squared_gradient must have same size");
average_gradient.resize(batch_gradient.size(), Float64{0.0});
average_squared_gradient.resize(batch_gradient.size(), Float64{0.0});
}
average_gradient.resize(batch_gradient.size(), Float64{0.0});
average_squared_gradient.resize(batch_gradient.size(), Float64{0.0});
for (size_t i = 0; i != average_gradient.size(); ++i)
{
@ -328,8 +315,7 @@ void Nesterov::write(WriteBuffer & buf) const
void Nesterov::merge(const IWeightsUpdater & rhs, Float64 frac, Float64 rhs_frac)
{
const auto & nesterov_rhs = static_cast<const Nesterov &>(rhs);
if (accumulated_gradient.empty())
accumulated_gradient.resize(nesterov_rhs.accumulated_gradient.size(), Float64{0.0});
accumulated_gradient.resize(nesterov_rhs.accumulated_gradient.size(), Float64{0.0});
for (size_t i = 0; i < accumulated_gradient.size(); ++i)
{
@ -339,10 +325,7 @@ void Nesterov::merge(const IWeightsUpdater & rhs, Float64 frac, Float64 rhs_frac
void Nesterov::update(UInt64 batch_size, std::vector<Float64> & weights, Float64 & bias, Float64 learning_rate, const std::vector<Float64> & batch_gradient)
{
if (accumulated_gradient.empty())
{
accumulated_gradient.resize(batch_gradient.size(), Float64{0.0});
}
accumulated_gradient.resize(batch_gradient.size(), Float64{0.0});
for (size_t i = 0; i < batch_gradient.size(); ++i)
{
@ -402,10 +385,7 @@ void Momentum::merge(const IWeightsUpdater & rhs, Float64 frac, Float64 rhs_frac
void Momentum::update(UInt64 batch_size, std::vector<Float64> & weights, Float64 & bias, Float64 learning_rate, const std::vector<Float64> & batch_gradient)
{
/// batch_size is already checked to be greater than 0
if (accumulated_gradient.empty())
{
accumulated_gradient.resize(batch_gradient.size(), Float64{0.0});
}
accumulated_gradient.resize(batch_gradient.size(), Float64{0.0});
for (size_t i = 0; i < batch_gradient.size(); ++i)
{

View File

@ -149,9 +149,11 @@ public:
class Momentum : public IWeightsUpdater
{
public:
Momentum() = default;
explicit Momentum(Float64 alpha_) : alpha(alpha_) {}
explicit Momentum(size_t num_params, Float64 alpha_ = 0.1) : alpha(alpha_)
{
accumulated_gradient.resize(num_params + 1, 0);
}
void update(UInt64 batch_size, std::vector<Float64> & weights, Float64 & bias, Float64 learning_rate, const std::vector<Float64> & batch_gradient) override;
@ -170,9 +172,10 @@ private:
class Nesterov : public IWeightsUpdater
{
public:
Nesterov() = default;
explicit Nesterov(Float64 alpha_) : alpha(alpha_) {}
explicit Nesterov(size_t num_params, Float64 alpha_ = 0.9) : alpha(alpha_)
{
accumulated_gradient.resize(num_params + 1, 0);
}
void addToBatch(
std::vector<Float64> & batch_gradient,
@ -201,10 +204,14 @@ private:
class Adam : public IWeightsUpdater
{
public:
Adam()
Adam(size_t num_params)
{
beta1_powered = beta1;
beta2_powered = beta2;
average_gradient.resize(num_params + 1, 0);
average_squared_gradient.resize(num_params + 1, 0);
}
void addToBatch(
@ -338,11 +345,11 @@ public:
if (weights_updater_name == "SGD")
new_weights_updater = std::make_shared<StochasticGradientDescent>();
else if (weights_updater_name == "Momentum")
new_weights_updater = std::make_shared<Momentum>();
new_weights_updater = std::make_shared<Momentum>(param_num);
else if (weights_updater_name == "Nesterov")
new_weights_updater = std::make_shared<Nesterov>();
new_weights_updater = std::make_shared<Nesterov>(param_num);
else if (weights_updater_name == "Adam")
new_weights_updater = std::make_shared<Adam>();
new_weights_updater = std::make_shared<Adam>(param_num);
else
throw Exception(ErrorCodes::LOGICAL_ERROR, "Illegal name of weights updater (should have been checked earlier)");

View File

@ -50,11 +50,13 @@ AggregateFunctionPtr createAggregateFunctionSparkbar(const std::string & name, c
assertBinary(name, arguments);
if (params.size() != 1 && params.size() != 3)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "The number of params does not match for aggregate function {}", name);
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"The number of params does not match for aggregate function '{}', expected 1 or 3, got {}", name, params.size());
if (params.size() == 3)
{
if (params.at(1).getType() != arguments[0]->getDefault().getType() || params.at(2).getType() != arguments[0]->getDefault().getType())
if (params.at(1).getType() != arguments[0]->getDefault().getType() ||
params.at(2).getType() != arguments[0]->getDefault().getType())
{
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"The second and third parameters are not the same type as the first arguments for aggregate function {}", name);
@ -63,7 +65,6 @@ AggregateFunctionPtr createAggregateFunctionSparkbar(const std::string & name, c
return createAggregateFunctionSparkbarImpl(name, *arguments[0], *arguments[1], arguments, params);
}
}
void registerAggregateFunctionSparkbar(AggregateFunctionFactory & factory)

View File

@ -18,10 +18,15 @@
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
template<typename X, typename Y>
struct AggregateFunctionSparkbarData
{
/// TODO: calculate histogram instead of storing all points
using Points = HashMap<X, Y>;
Points points;
@ -31,20 +36,26 @@ struct AggregateFunctionSparkbarData
Y min_y = std::numeric_limits<Y>::max();
Y max_y = std::numeric_limits<Y>::lowest();
void insert(const X & x, const Y & y)
Y insert(const X & x, const Y & y)
{
auto result = points.insert({x, y});
if (!result.second)
result.first->getMapped() += y;
if (isNaN(y) || y <= 0)
return 0;
auto [it, inserted] = points.insert({x, y});
if (!inserted)
it->getMapped() += y;
return it->getMapped();
}
void add(X x, Y y)
{
insert(x, y);
auto new_y = insert(x, y);
min_x = std::min(x, min_x);
max_x = std::max(x, max_x);
min_y = std::min(y, min_y);
max_y = std::max(y, max_y);
max_y = std::max(new_y, max_y);
}
void merge(const AggregateFunctionSparkbarData & other)
@ -53,10 +64,14 @@ struct AggregateFunctionSparkbarData
return;
for (auto & point : other.points)
insert(point.getKey(), point.getMapped());
{
auto new_y = insert(point.getKey(), point.getMapped());
max_y = std::max(new_y, max_y);
}
min_x = std::min(other.min_x, min_x);
max_x = std::max(other.max_x, max_x);
min_y = std::min(other.min_y, min_y);
max_y = std::max(other.max_y, max_y);
}
@ -85,7 +100,6 @@ struct AggregateFunctionSparkbarData
size_t size;
readVarUInt(size, buf);
/// TODO Protection against huge size
X x;
Y y;
for (size_t i = 0; i < size; ++i)
@ -95,7 +109,6 @@ struct AggregateFunctionSparkbarData
insert(x, y);
}
}
};
template<typename X, typename Y>
@ -104,16 +117,17 @@ class AggregateFunctionSparkbar final
{
private:
size_t width;
X min_x;
X max_x;
bool specified_min_max_x;
const size_t width = 0;
template <class T>
size_t updateFrame(ColumnString::Chars & frame, const T value) const
/// Range for x specified in parameters.
const bool is_specified_range_x = false;
const X begin_x = std::numeric_limits<X>::min();
const X end_x = std::numeric_limits<X>::max();
size_t updateFrame(ColumnString::Chars & frame, Y value) const
{
static constexpr std::array<std::string_view, 9> bars{" ", "▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"};
const auto & bar = (isNaN(value) || value > 8 || value < 1) ? bars[0] : bars[static_cast<UInt8>(value)];
const auto & bar = (isNaN(value) || value < 1 || 8 < value) ? bars[0] : bars[static_cast<UInt8>(value)];
frame.insert(bar.begin(), bar.end());
return bar.size();
}
@ -122,161 +136,108 @@ private:
* The minimum value of y is rendered as the lowest height "▁",
* the maximum value of y is rendered as the highest height "█", and the middle value will be rendered proportionally.
* If a bucket has no y value, it will be rendered as " ".
* If the actual number of buckets is greater than the specified bucket, it will be compressed by width.
* For example, there are actually 11 buckets, specify 10 buckets, and divide the 11 buckets as follows (11/10):
* 0.0-1.1, 1.1-2.2, 2.2-3.3, 3.3-4.4, 4.4-5.5, 5.5-6.6, 6.6-7.7, 7.7-8.8, 8.8-9.9, 9.9-11.
* The y value of the first bucket will be calculated as follows:
* the actual y value of the first position + the actual second position y*0.1, and the remaining y*0.9 is reserved for the next bucket.
* The next bucket will use the last y*0.9 + the actual third position y*0.2, and the remaining y*0.8 will be reserved for the next bucket. And so on.
*/
void render(ColumnString & to_column, const AggregateFunctionSparkbarData<X, Y> & data) const
{
size_t sz = 0;
auto & values = to_column.getChars();
auto & offsets = to_column.getOffsets();
auto update_column = [&] ()
if (data.points.empty())
{
values.push_back('\0');
offsets.push_back(offsets.empty() ? 1 : offsets.back() + 1);
return;
}
auto from_x = is_specified_range_x ? begin_x : data.min_x;
auto to_x = is_specified_range_x ? end_x : data.max_x;
if (from_x >= to_x)
{
size_t sz = updateFrame(values, 8);
values.push_back('\0');
offsets.push_back(offsets.empty() ? sz + 1 : offsets.back() + sz + 1);
};
if (data.points.empty() || !width)
return update_column();
size_t diff_x;
X min_x_local;
if (specified_min_max_x)
{
diff_x = max_x - min_x;
min_x_local = min_x;
}
else
{
diff_x = data.max_x - data.min_x;
min_x_local = data.min_x;
return;
}
if ((diff_x + 1) <= width)
{
Y min_y = data.min_y;
Y max_y = data.max_y;
Float64 diff_y = max_y - min_y;
PaddedPODArray<Y> histogram(width, 0);
PaddedPODArray<UInt64> fhistogram(width, 0);
if (diff_y != 0.0)
for (const auto & point : data.points)
{
if (point.getKey() < from_x || to_x < point.getKey())
continue;
X delta = to_x - from_x;
if (delta < std::numeric_limits<X>::max())
delta = delta + 1;
X value = point.getKey() - from_x;
Float64 w = histogram.size();
size_t index = std::min<size_t>(static_cast<size_t>(w / delta * value), histogram.size() - 1);
if (std::numeric_limits<Y>::max() - histogram[index] > point.getMapped())
{
for (size_t i = 0; i <= diff_x; ++i)
{
auto it = data.points.find(static_cast<X>(min_x_local + i));
bool found = it != data.points.end();
sz += updateFrame(values, found ? std::round(((it->getMapped() - min_y) / diff_y) * 7) + 1 : 0.0);
}
histogram[index] += point.getMapped();
fhistogram[index] += 1;
}
else
{
for (size_t i = 0; i <= diff_x; ++i)
sz += updateFrame(values, data.points.has(min_x_local + static_cast<X>(i)) ? 1 : 0);
/// In case of overflow, just saturate
histogram[index] = std::numeric_limits<Y>::max();
}
}
else
for (size_t i = 0; i < histogram.size(); ++i)
{
// begin reshapes to width buckets
Float64 multiple_d = (diff_x + 1) / static_cast<Float64>(width);
std::optional<Float64> min_y;
std::optional<Float64> max_y;
std::optional<Float64> new_y;
std::vector<std::optional<Float64>> new_points;
new_points.reserve(width);
std::pair<size_t, Float64> bound{0, 0.0};
size_t cur_bucket_num = 0;
// upper bound for bucket
auto upper_bound = [&](size_t bucket_num)
{
bound.second = (bucket_num + 1) * multiple_d;
bound.first = static_cast<size_t>(std::floor(bound.second));
};
upper_bound(cur_bucket_num);
for (size_t i = 0; i <= (diff_x + 1); ++i)
{
if (i == bound.first) // is bound
{
Float64 proportion = bound.second - bound.first;
auto it = data.points.find(min_x_local + static_cast<X>(i));
bool found = (it != data.points.end());
if (found && proportion > 0)
new_y = new_y.value_or(0) + it->getMapped() * proportion;
if (new_y)
{
Float64 avg_y = new_y.value() / multiple_d;
new_points.emplace_back(avg_y);
// If min_y has no value, or if the avg_y of the current bucket is less than min_y, update it.
if (!min_y || avg_y < min_y)
min_y = avg_y;
if (!max_y || avg_y > max_y)
max_y = avg_y;
}
else
{
new_points.emplace_back();
}
// next bucket
new_y = found ? ((1 - proportion) * it->getMapped()) : std::optional<Float64>();
upper_bound(++cur_bucket_num);
}
else
{
auto it = data.points.find(min_x_local + static_cast<X>(i));
if (it != data.points.end())
new_y = new_y.value_or(0) + it->getMapped();
}
}
if (!min_y || !max_y) // No value is set
return update_column();
Float64 diff_y = max_y.value() - min_y.value();
auto update_frame = [&] (const std::optional<Float64> & point_y)
{
sz += updateFrame(values, point_y ? std::round(((point_y.value() - min_y.value()) / diff_y) * 7) + 1 : 0);
};
auto update_frame_for_constant = [&] (const std::optional<Float64> & point_y)
{
sz += updateFrame(values, point_y ? 1 : 0);
};
if (diff_y != 0.0)
std::for_each(new_points.begin(), new_points.end(), update_frame);
else
std::for_each(new_points.begin(), new_points.end(), update_frame_for_constant);
if (fhistogram[i] > 0)
histogram[i] /= fhistogram[i];
}
update_column();
Y y_max = 0;
for (auto & y : histogram)
{
if (isNaN(y) || y <= 0)
continue;
y_max = std::max(y_max, y);
}
if (y_max == 0)
{
values.push_back('\0');
offsets.push_back(offsets.empty() ? 1 : offsets.back() + 1);
return;
}
for (auto & y : histogram)
{
if (isNaN(y) || y <= 0)
y = 0;
else
y = y * 7 / y_max + 1;
}
size_t sz = 0;
for (const auto & y : histogram)
sz += updateFrame(values, y);
values.push_back('\0');
offsets.push_back(offsets.empty() ? sz + 1 : offsets.back() + sz + 1);
}
public:
AggregateFunctionSparkbar(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<AggregateFunctionSparkbarData<X, Y>, AggregateFunctionSparkbar>(
arguments, params, std::make_shared<DataTypeString>())
: IAggregateFunctionDataHelper<AggregateFunctionSparkbarData<X, Y>, AggregateFunctionSparkbar>(arguments, params, std::make_shared<DataTypeString>())
, width(params.empty() ? 0 : params.at(0).safeGet<UInt64>())
, is_specified_range_x(params.size() >= 3)
, begin_x(is_specified_range_x ? static_cast<X>(params.at(1).safeGet<X>()) : std::numeric_limits<X>::min())
, end_x(is_specified_range_x ? static_cast<X>(params.at(2).safeGet<X>()) : std::numeric_limits<X>::max())
{
width = params.at(0).safeGet<UInt64>();
if (params.size() == 3)
{
specified_min_max_x = true;
min_x = static_cast<X>(params.at(1).safeGet<X>());
max_x = static_cast<X>(params.at(2).safeGet<X>());
}
else
{
specified_min_max_x = false;
min_x = std::numeric_limits<X>::min();
max_x = std::numeric_limits<X>::max();
}
if (width < 2 || 1024 < width)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Parameter width must be in range [2, 1024]");
if (begin_x >= end_x)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Parameter `min_x` must be less than `max_x`");
}
String getName() const override
@ -287,7 +248,7 @@ public:
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * /*arena*/) const override
{
X x = assert_cast<const ColumnVector<X> *>(columns[0])->getData()[row_num];
if (min_x <= x && x <= max_x)
if (begin_x <= x && x <= end_x)
{
Y y = assert_cast<const ColumnVector<Y> *>(columns[1])->getData()[row_num];
this->data(place).add(x, y);

View File

@ -953,7 +953,12 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry)
{
LOG_TRACE(log, "Will copy file {}", adjusted_path);
if (!num_entries)
bool has_entries = false;
{
std::lock_guard lock{mutex};
has_entries = num_entries > 0;
}
if (!has_entries)
checkLockFile(true);
if (use_archives)

View File

@ -226,13 +226,7 @@ add_object_library(clickhouse_access Access)
add_object_library(clickhouse_backups Backups)
add_object_library(clickhouse_core Core)
add_object_library(clickhouse_core_mysql Core/MySQL)
if (ENABLE_OPENSSL OR ENABLE_OPENSSL_DYNAMIC)
add_headers_and_sources(dbms Compression)
list(REMOVE_ITEM dbms_headers Compression/CompressionCodecEncrypted.h)
list(REMOVE_ITEM dbms_sources Compression/CompressionCodecEncrypted.cpp)
else ()
add_object_library(clickhouse_compression Compression)
endif ()
add_object_library(clickhouse_compression Compression)
add_object_library(clickhouse_querypipeline QueryPipeline)
add_object_library(clickhouse_datatypes DataTypes)
add_object_library(clickhouse_datatypes_serializations DataTypes/Serializations)

View File

@ -41,6 +41,7 @@
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTColumnDeclaration.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/Kusto/ParserKQLStatement.h>
#include <Processors/Formats/Impl/NullFormat.h>
@ -816,17 +817,15 @@ void ClientBase::processTextAsSingleQuery(const String & full_query)
void ClientBase::processOrdinaryQuery(const String & query_to_execute, ASTPtr parsed_query)
{
if (fake_drop)
{
if (parsed_query->as<ASTDropQuery>())
return;
}
if (fake_drop && parsed_query->as<ASTDropQuery>())
return;
auto query = query_to_execute;
/// Rewrite query only when we have query parameters.
/// Note that if query is rewritten, comments in query are lost.
/// But the user often wants to see comments in server logs, query log, processlist, etc.
/// For recent versions of the server query parameters will be transferred by network and applied on the server side.
auto query = query_to_execute;
if (!query_parameters.empty()
&& connection->getServerRevision(connection_parameters.timeouts) < DBMS_MIN_PROTOCOL_VERSION_WITH_PARAMETERS)
{
@ -838,6 +837,22 @@ void ClientBase::processOrdinaryQuery(const String & query_to_execute, ASTPtr parsed_query)
query = serializeAST(*parsed_query);
}
if (allow_merge_tree_settings && parsed_query->as<ASTCreateQuery>())
{
/// Rewrite query if new settings were added.
if (addMergeTreeSettings(*parsed_query->as<ASTCreateQuery>()))
{
/// Replace query parameters because AST cannot be serialized otherwise.
if (!query_parameters.empty())
{
ReplaceQueryParameterVisitor visitor(query_parameters);
visitor.visit(parsed_query);
}
query = serializeAST(*parsed_query);
}
}
int retries_left = 10;
while (retries_left)
{
@ -2065,6 +2080,41 @@ void ClientBase::initQueryIdFormats()
}
bool ClientBase::addMergeTreeSettings(ASTCreateQuery & ast_create)
{
if (ast_create.attach
|| !ast_create.storage
|| !ast_create.storage->isExtendedStorageDefinition()
|| !ast_create.storage->engine
|| ast_create.storage->engine->name.find("MergeTree") == std::string::npos)
return false;
auto all_changed = cmd_merge_tree_settings.allChanged();
if (all_changed.begin() == all_changed.end())
return false;
if (!ast_create.storage->settings)
{
auto settings_ast = std::make_shared<ASTSetQuery>();
settings_ast->is_standalone = false;
ast_create.storage->set(ast_create.storage->settings, settings_ast);
}
auto & storage_settings = *ast_create.storage->settings;
bool added_new_setting = false;
for (const auto & setting : all_changed)
{
if (!storage_settings.changes.tryGet(setting.getName()))
{
storage_settings.changes.emplace_back(setting.getName(), setting.getValue());
added_new_setting = true;
}
}
return added_new_setting;
}
void ClientBase::runInteractive()
{
if (config().has("query_id"))
@ -2302,6 +2352,30 @@ void ClientBase::parseAndCheckOptions(OptionsDescription & options_description,
cmd_settings.addProgramOptionsAsMultitokens(options_description.main_description.value());
else
cmd_settings.addProgramOptions(options_description.main_description.value());
if (allow_merge_tree_settings)
{
/// Add merge tree settings manually, because names of some settings
/// may clash. Query settings have higher priority and we just
/// skip ambiguous merge tree settings.
auto & main_options = options_description.main_description.value();
NameSet main_option_names;
for (const auto & option : main_options.options())
main_option_names.insert(option->long_name());
for (const auto & setting : cmd_merge_tree_settings.all())
{
if (main_option_names.contains(setting.getName()))
continue;
if (allow_repeated_settings)
cmd_merge_tree_settings.addProgramOptionAsMultitoken(main_options, setting);
else
cmd_merge_tree_settings.addProgramOption(main_options, setting);
}
}
/// Parse main commandline options.
auto parser = po::command_line_parser(arguments).options(options_description.main_description.value()).allow_unregistered();
po::parsed_options parsed = parser.run();

View File

@ -1,6 +1,7 @@
#pragma once
#include "Common/NamePrompter.h"
#include <Parsers/ASTCreateQuery.h>
#include <Common/ProgressIndication.h>
#include <Common/InterruptListener.h>
#include <Common/ShellCommand.h>
@ -14,6 +15,7 @@
#include <boost/program_options.hpp>
#include <Storages/StorageFile.h>
#include <Storages/SelectQueryInfo.h>
#include <Storages/MergeTree/MergeTreeSettings.h>
namespace po = boost::program_options;
@ -164,6 +166,7 @@ private:
void updateSuggest(const ASTPtr & ast);
void initQueryIdFormats();
bool addMergeTreeSettings(ASTCreateQuery & ast_create);
protected:
static bool isSyncInsertWithData(const ASTInsertQuery & insert_query, const ContextPtr & context);
@ -212,6 +215,7 @@ protected:
/// Settings specified via command line args
Settings cmd_settings;
MergeTreeSettings cmd_merge_tree_settings;
/// thread status should be destructed before shared context because it relies on process list.
std::optional<ThreadStatus> thread_status;
@ -298,6 +302,7 @@ protected:
std::vector<HostAndPort> hosts_and_ports{};
bool allow_repeated_settings = false;
bool allow_merge_tree_settings = false;
bool cancelled = false;

View File

@ -1,5 +1,6 @@
#include <Columns/ColumnArray.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnDecimal.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnNullable.h>
@ -549,19 +550,48 @@ void ColumnArray::insertRangeFrom(const IColumn & src, size_t start, size_t leng
ColumnPtr ColumnArray::filter(const Filter & filt, ssize_t result_size_hint) const
{
if (typeid_cast<const ColumnUInt8 *>(data.get())) return filterNumber<UInt8>(filt, result_size_hint);
if (typeid_cast<const ColumnUInt16 *>(data.get())) return filterNumber<UInt16>(filt, result_size_hint);
if (typeid_cast<const ColumnUInt32 *>(data.get())) return filterNumber<UInt32>(filt, result_size_hint);
if (typeid_cast<const ColumnUInt64 *>(data.get())) return filterNumber<UInt64>(filt, result_size_hint);
if (typeid_cast<const ColumnInt8 *>(data.get())) return filterNumber<Int8>(filt, result_size_hint);
if (typeid_cast<const ColumnInt16 *>(data.get())) return filterNumber<Int16>(filt, result_size_hint);
if (typeid_cast<const ColumnInt32 *>(data.get())) return filterNumber<Int32>(filt, result_size_hint);
if (typeid_cast<const ColumnInt64 *>(data.get())) return filterNumber<Int64>(filt, result_size_hint);
if (typeid_cast<const ColumnFloat32 *>(data.get())) return filterNumber<Float32>(filt, result_size_hint);
if (typeid_cast<const ColumnFloat64 *>(data.get())) return filterNumber<Float64>(filt, result_size_hint);
if (typeid_cast<const ColumnString *>(data.get())) return filterString(filt, result_size_hint);
if (typeid_cast<const ColumnTuple *>(data.get())) return filterTuple(filt, result_size_hint);
if (typeid_cast<const ColumnNullable *>(data.get())) return filterNullable(filt, result_size_hint);
if (typeid_cast<const ColumnUInt8 *>(data.get()))
return filterNumber<UInt8>(filt, result_size_hint);
if (typeid_cast<const ColumnUInt16 *>(data.get()))
return filterNumber<UInt16>(filt, result_size_hint);
if (typeid_cast<const ColumnUInt32 *>(data.get()))
return filterNumber<UInt32>(filt, result_size_hint);
if (typeid_cast<const ColumnUInt64 *>(data.get()))
return filterNumber<UInt64>(filt, result_size_hint);
if (typeid_cast<const ColumnUInt128 *>(data.get()))
return filterNumber<UInt128>(filt, result_size_hint);
if (typeid_cast<const ColumnUInt256 *>(data.get()))
return filterNumber<UInt256>(filt, result_size_hint);
if (typeid_cast<const ColumnInt8 *>(data.get()))
return filterNumber<Int8>(filt, result_size_hint);
if (typeid_cast<const ColumnInt16 *>(data.get()))
return filterNumber<Int16>(filt, result_size_hint);
if (typeid_cast<const ColumnInt32 *>(data.get()))
return filterNumber<Int32>(filt, result_size_hint);
if (typeid_cast<const ColumnInt64 *>(data.get()))
return filterNumber<Int64>(filt, result_size_hint);
if (typeid_cast<const ColumnInt128 *>(data.get()))
return filterNumber<Int128>(filt, result_size_hint);
if (typeid_cast<const ColumnInt256 *>(data.get()))
return filterNumber<Int256>(filt, result_size_hint);
if (typeid_cast<const ColumnFloat32 *>(data.get()))
return filterNumber<Float32>(filt, result_size_hint);
if (typeid_cast<const ColumnFloat64 *>(data.get()))
return filterNumber<Float64>(filt, result_size_hint);
if (typeid_cast<const ColumnDecimal<Decimal32> *>(data.get()))
return filterNumber<Decimal32>(filt, result_size_hint);
if (typeid_cast<const ColumnDecimal<Decimal64> *>(data.get()))
return filterNumber<Decimal64>(filt, result_size_hint);
if (typeid_cast<const ColumnDecimal<Decimal128> *>(data.get()))
return filterNumber<Decimal128>(filt, result_size_hint);
if (typeid_cast<const ColumnDecimal<Decimal256> *>(data.get()))
return filterNumber<Decimal256>(filt, result_size_hint);
if (typeid_cast<const ColumnString *>(data.get()))
return filterString(filt, result_size_hint);
if (typeid_cast<const ColumnTuple *>(data.get()))
return filterTuple(filt, result_size_hint);
if (typeid_cast<const ColumnNullable *>(data.get()))
return filterNullable(filt, result_size_hint);
return filterGeneric(filt, result_size_hint);
}
@ -597,15 +627,17 @@ void ColumnArray::expand(const IColumn::Filter & mask, bool inverted)
template <typename T>
ColumnPtr ColumnArray::filterNumber(const Filter & filt, ssize_t result_size_hint) const
{
using ColVecType = ColumnVectorOrDecimal<T>;
if (getOffsets().empty())
return ColumnArray::create(data);
auto res = ColumnArray::create(data->cloneEmpty());
auto & res_elems = assert_cast<ColumnVector<T> &>(res->getData()).getData();
auto & res_elems = assert_cast<ColVecType &>(res->getData()).getData();
Offsets & res_offsets = res->getOffsets();
filterArraysImpl<T>(assert_cast<const ColumnVector<T> &>(*data).getData(), getOffsets(), res_elems, res_offsets, filt, result_size_hint);
filterArraysImpl<T>(assert_cast<const ColVecType &>(*data).getData(), getOffsets(), res_elems, res_offsets, filt, result_size_hint);
return res;
}
@ -932,20 +964,50 @@ ColumnPtr ColumnArray::replicate(const Offsets & replicate_offsets) const
if (replicate_offsets.empty())
return cloneEmpty();
if (typeid_cast<const ColumnUInt8 *>(data.get())) return replicateNumber<UInt8>(replicate_offsets);
if (typeid_cast<const ColumnUInt16 *>(data.get())) return replicateNumber<UInt16>(replicate_offsets);
if (typeid_cast<const ColumnUInt32 *>(data.get())) return replicateNumber<UInt32>(replicate_offsets);
if (typeid_cast<const ColumnUInt64 *>(data.get())) return replicateNumber<UInt64>(replicate_offsets);
if (typeid_cast<const ColumnInt8 *>(data.get())) return replicateNumber<Int8>(replicate_offsets);
if (typeid_cast<const ColumnInt16 *>(data.get())) return replicateNumber<Int16>(replicate_offsets);
if (typeid_cast<const ColumnInt32 *>(data.get())) return replicateNumber<Int32>(replicate_offsets);
if (typeid_cast<const ColumnInt64 *>(data.get())) return replicateNumber<Int64>(replicate_offsets);
if (typeid_cast<const ColumnFloat32 *>(data.get())) return replicateNumber<Float32>(replicate_offsets);
if (typeid_cast<const ColumnFloat64 *>(data.get())) return replicateNumber<Float64>(replicate_offsets);
if (typeid_cast<const ColumnString *>(data.get())) return replicateString(replicate_offsets);
if (typeid_cast<const ColumnConst *>(data.get())) return replicateConst(replicate_offsets);
if (typeid_cast<const ColumnNullable *>(data.get())) return replicateNullable(replicate_offsets);
if (typeid_cast<const ColumnTuple *>(data.get())) return replicateTuple(replicate_offsets);
if (typeid_cast<const ColumnUInt8 *>(data.get()))
return replicateNumber<UInt8>(replicate_offsets);
if (typeid_cast<const ColumnUInt16 *>(data.get()))
return replicateNumber<UInt16>(replicate_offsets);
if (typeid_cast<const ColumnUInt32 *>(data.get()))
return replicateNumber<UInt32>(replicate_offsets);
if (typeid_cast<const ColumnUInt64 *>(data.get()))
return replicateNumber<UInt64>(replicate_offsets);
if (typeid_cast<const ColumnUInt128 *>(data.get()))
return replicateNumber<UInt128>(replicate_offsets);
if (typeid_cast<const ColumnUInt256 *>(data.get()))
return replicateNumber<UInt256>(replicate_offsets);
if (typeid_cast<const ColumnInt8 *>(data.get()))
return replicateNumber<Int8>(replicate_offsets);
if (typeid_cast<const ColumnInt16 *>(data.get()))
return replicateNumber<Int16>(replicate_offsets);
if (typeid_cast<const ColumnInt32 *>(data.get()))
return replicateNumber<Int32>(replicate_offsets);
if (typeid_cast<const ColumnInt64 *>(data.get()))
return replicateNumber<Int64>(replicate_offsets);
if (typeid_cast<const ColumnInt128 *>(data.get()))
return replicateNumber<Int128>(replicate_offsets);
if (typeid_cast<const ColumnInt256 *>(data.get()))
return replicateNumber<Int256>(replicate_offsets);
if (typeid_cast<const ColumnFloat32 *>(data.get()))
return replicateNumber<Float32>(replicate_offsets);
if (typeid_cast<const ColumnFloat64 *>(data.get()))
return replicateNumber<Float64>(replicate_offsets);
if (typeid_cast<const ColumnDecimal<Decimal32> *>(data.get()))
return replicateNumber<Decimal32>(replicate_offsets);
if (typeid_cast<const ColumnDecimal<Decimal64> *>(data.get()))
return replicateNumber<Decimal64>(replicate_offsets);
if (typeid_cast<const ColumnDecimal<Decimal128> *>(data.get()))
return replicateNumber<Decimal128>(replicate_offsets);
if (typeid_cast<const ColumnDecimal<Decimal256> *>(data.get()))
return replicateNumber<Decimal256>(replicate_offsets);
if (typeid_cast<const ColumnString *>(data.get()))
return replicateString(replicate_offsets);
if (typeid_cast<const ColumnConst *>(data.get()))
return replicateConst(replicate_offsets);
if (typeid_cast<const ColumnNullable *>(data.get()))
return replicateNullable(replicate_offsets);
if (typeid_cast<const ColumnTuple *>(data.get()))
return replicateTuple(replicate_offsets);
return replicateGeneric(replicate_offsets);
}
@ -953,6 +1015,8 @@ ColumnPtr ColumnArray::replicate(const Offsets & replicate_offsets) const
template <typename T>
ColumnPtr ColumnArray::replicateNumber(const Offsets & replicate_offsets) const
{
using ColVecType = ColumnVectorOrDecimal<T>;
size_t col_size = size();
if (col_size != replicate_offsets.size())
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of offsets doesn't match size of column.");
@ -964,10 +1028,10 @@ ColumnPtr ColumnArray::replicateNumber(const Offsets & replicate_offsets) const
ColumnArray & res_arr = typeid_cast<ColumnArray &>(*res);
const typename ColumnVector<T>::Container & src_data = typeid_cast<const ColumnVector<T> &>(*data).getData();
const typename ColVecType::Container & src_data = typeid_cast<const ColVecType &>(*data).getData();
const Offsets & src_offsets = getOffsets();
typename ColumnVector<T>::Container & res_data = typeid_cast<ColumnVector<T> &>(res_arr.getData()).getData();
typename ColVecType::Container & res_data = typeid_cast<ColVecType &>(res_arr.getData()).getData();
Offsets & res_offsets = res_arr.getOffsets();
res_data.reserve(data->size() / col_size * replicate_offsets.back());

View File

@ -59,10 +59,7 @@ public:
void insertFrom(const IColumn & src, size_t n) override { data.push_back(static_cast<const Self &>(src).getData()[n]); }
void insertData(const char * src, size_t /*length*/) override;
void insertDefault() override { data.push_back(T()); }
virtual void insertManyDefaults(size_t length) override
{
data.resize_fill(data.size() + length);
}
void insertManyDefaults(size_t length) override { data.resize_fill(data.size() + length); }
void insert(const Field & x) override { data.push_back(x.get<T>()); }
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;

View File

@ -320,12 +320,20 @@ INSTANTIATE(UInt8)
INSTANTIATE(UInt16)
INSTANTIATE(UInt32)
INSTANTIATE(UInt64)
INSTANTIATE(UInt128)
INSTANTIATE(UInt256)
INSTANTIATE(Int8)
INSTANTIATE(Int16)
INSTANTIATE(Int32)
INSTANTIATE(Int64)
INSTANTIATE(Int128)
INSTANTIATE(Int256)
INSTANTIATE(Float32)
INSTANTIATE(Float64)
INSTANTIATE(Decimal32)
INSTANTIATE(Decimal64)
INSTANTIATE(Decimal128)
INSTANTIATE(Decimal256)
#undef INSTANTIATE

View File

@ -82,7 +82,12 @@ FilterDescription::FilterDescription(const IColumn & column_)
const auto size = res.size();
assert(size == null_map.size());
for (size_t i = 0; i < size; ++i)
res[i] = res[i] && !null_map[i];
{
auto has_val = static_cast<UInt8>(!!res[i]);
auto not_null = static_cast<UInt8>(!null_map[i]);
/// Instead of the logical AND operator (&&), the bitwise one (&) is used so the loop can be auto-vectorized.
res[i] = has_val & not_null;
}
data = &res;
data_holder = std::move(mutable_holder);
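
A note on the comment in this hunk: `&&` short-circuits, so each element implies a conditional branch, while `&` on the two UInt8 flags is straight-line arithmetic the compiler can turn into SIMD code. A minimal standalone sketch of the same branch-free combination (names are illustrative, not the FilterDescription API):

#include <cstdint>
#include <vector>

/// Combine a filter column with a null map: keep a row only if the filter byte is
/// non-zero AND the value is not NULL. The bitwise & keeps the loop branch-free,
/// which lets compilers emit SIMD instructions for it.
void combineFilterWithNullMap(std::vector<uint8_t> & res, const std::vector<uint8_t> & null_map)
{
    const size_t size = res.size();
    for (size_t i = 0; i < size; ++i)
    {
        uint8_t has_val = static_cast<uint8_t>(!!res[i]);
        uint8_t not_null = static_cast<uint8_t>(!null_map[i]);
        res[i] = has_val & not_null;   /// '&' instead of '&&': no short-circuit branch
    }
}

int main()
{
    std::vector<uint8_t> filter{1, 0, 1, 1};
    std::vector<uint8_t> null_map{0, 0, 1, 0};
    combineFilterWithNullMap(filter, null_map);   /// filter becomes {1, 0, 0, 1}
}
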

View File

@ -1,6 +1,6 @@
#pragma once
#include <mutex>
#include <atomic>
#include <memory>
#include <base/defines.h>
@ -32,26 +32,25 @@ public:
MultiVersion() = default;
explicit MultiVersion(std::unique_ptr<const T> && value)
: current_version(std::move(value))
{
set(std::move(value));
}
/// Obtain current version for read-only usage. Returns shared_ptr, that manages lifetime of version.
Version get() const
{
/// NOTE: is it possible to lock-free replace of shared_ptr?
std::lock_guard lock(mutex);
return current_version;
return std::atomic_load(&current_version);
}
/// TODO: replace atomic_load/store() on shared_ptr (which is deprecated as of C++20) by C++20 std::atomic<std::shared_ptr>.
/// Clang 15 currently does not support it.
/// Update an object with new version.
void set(std::unique_ptr<const T> && value)
{
std::lock_guard lock(mutex);
current_version = std::move(value);
std::atomic_store(&current_version, Version{std::move(value)});
}
private:
Version current_version TSA_GUARDED_BY(mutex);
mutable std::mutex mutex;
Version current_version;
};
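
The MultiVersion change above swaps a mutex-protected shared_ptr for the free-function atomic operations on shared_ptr. A minimal standalone sketch of that pattern, with a hypothetical Config payload; as the TODO notes, std::atomic_load/std::atomic_store on shared_ptr are deprecated in C++20 but still available until std::atomic<std::shared_ptr> can be used:

#include <atomic>
#include <iostream>
#include <memory>
#include <string>

struct Config { std::string value; };

std::shared_ptr<const Config> current_version;

/// Readers take a snapshot; it stays valid even if a writer swaps in a new version
/// concurrently, because the shared_ptr keeps the old object alive.
std::shared_ptr<const Config> get() { return std::atomic_load(&current_version); }

/// Writers publish a new immutable version atomically.
void set(std::unique_ptr<const Config> && value)
{
    std::atomic_store(&current_version, std::shared_ptr<const Config>(std::move(value)));
}

int main()
{
    set(std::make_unique<const Config>(Config{"v1"}));
    auto snapshot = get();
    set(std::make_unique<const Config>(Config{"v2"}));
    std::cout << snapshot->value << " then " << get()->value << '\n';   /// v1 then v2
}
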

View File

@ -29,6 +29,7 @@
#cmakedefine01 USE_FASTOPS
#cmakedefine01 USE_NLP
#cmakedefine01 USE_VECTORSCAN
#cmakedefine01 USE_LIBURING
#cmakedefine01 USE_AVRO
#cmakedefine01 USE_CAPNP
#cmakedefine01 USE_PARQUET

View File

@ -11,10 +11,14 @@
// This depends on BoringSSL-specific API, notably <openssl/aead.h>.
#if USE_SSL
#include <openssl/digest.h>
#include <openssl/err.h>
#include <boost/algorithm/hex.hpp>
#include <openssl/aead.h>
# include <openssl/err.h>
# include <boost/algorithm/hex.hpp>
# if USE_BORINGSSL
# include <openssl/digest.h>
# include <openssl/aead.h>
# else
# include <openssl/evp.h>
# endif
#endif
// Common part for both parts (with SSL and without)
@ -87,23 +91,6 @@ constexpr size_t nonce_max_size = 13; /// Nonce size and one byte to show i
constexpr size_t actual_nonce_size = 12; /// Nonce actual size
const String empty_nonce = {"\0\0\0\0\0\0\0\0\0\0\0\0", actual_nonce_size};
/// Get encryption/decryption algorithms.
auto getMethod(EncryptionMethod Method)
{
if (Method == AES_128_GCM_SIV)
{
return EVP_aead_aes_128_gcm_siv;
}
else if (Method == AES_256_GCM_SIV)
{
return EVP_aead_aes_256_gcm_siv;
}
else
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Wrong encryption Method. Got {}", getMethodName(Method));
}
}
/// Find out key size for each algorithm
UInt64 methodKeySize(EncryptionMethod Method)
{
@ -128,6 +115,24 @@ std::string lastErrorString()
return std::string(buffer.data());
}
#if USE_BORINGSSL
/// Get encryption/decryption algorithms.
auto getMethod(EncryptionMethod Method)
{
if (Method == AES_128_GCM_SIV)
{
return EVP_aead_aes_128_gcm_siv;
}
else if (Method == AES_256_GCM_SIV)
{
return EVP_aead_aes_256_gcm_siv;
}
else
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Wrong encryption Method. Got {}", getMethodName(Method));
}
}
/// Encrypt plaintext with particular algorithm and put result into ciphertext_and_tag.
/// This function gets key and nonce and encrypts text with their help.
/// If something went wrong (can't init context or can't encrypt data) it throws an exception.
@ -186,6 +191,160 @@ size_t decrypt(std::string_view ciphertext, char * plaintext, EncryptionMethod method, const String & key, const String & nonce)
return out_len;
}
#else
/// Get encryption/decryption algorithms.
auto getMethod(EncryptionMethod Method)
{
if (Method == AES_128_GCM_SIV)
{
return EVP_aes_128_gcm;
}
else if (Method == AES_256_GCM_SIV)
{
return EVP_aes_256_gcm;
}
else
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Wrong encryption Method. Got {}", getMethodName(Method));
}
}
/// Encrypt plaintext with particular algorithm and put result into ciphertext_and_tag.
/// This function gets key and nonce and encrypts text with their help.
/// If something went wrong (can't init context or can't encrypt data) it throws an exception.
/// It returns length of encrypted text.
size_t encrypt(std::string_view plaintext, char * ciphertext_and_tag, EncryptionMethod method, const String & key, const String & nonce)
{
int out_len;
int ciphertext_len;
EVP_CIPHER_CTX *encrypt_ctx;
if (!(encrypt_ctx = EVP_CIPHER_CTX_new()))
throw Exception::createDeprecated(lastErrorString(), ErrorCodes::OPENSSL_ERROR);
try
{
const int ok_cryptinit = EVP_EncryptInit_ex(encrypt_ctx,
getMethod(method)(),
nullptr, nullptr, nullptr);
if (!ok_cryptinit)
throw Exception::createDeprecated(lastErrorString(), ErrorCodes::OPENSSL_ERROR);
const int ok_cipherctrl = EVP_CIPHER_CTX_ctrl(encrypt_ctx,
EVP_CTRL_GCM_SET_IVLEN,
static_cast<int32_t>(nonce.size()),
nullptr);
if (!ok_cipherctrl)
throw Exception::createDeprecated(lastErrorString(), ErrorCodes::OPENSSL_ERROR);
const int ok_nonceinit = EVP_EncryptInit_ex(encrypt_ctx, nullptr, nullptr,
reinterpret_cast<const uint8_t*>(key.data()),
reinterpret_cast<const uint8_t *>(nonce.data()));
if (!ok_nonceinit)
throw Exception::createDeprecated(lastErrorString(), ErrorCodes::OPENSSL_ERROR);
const int ok_encryptupdate = EVP_EncryptUpdate(encrypt_ctx,
reinterpret_cast<uint8_t *>(ciphertext_and_tag),
&out_len,
reinterpret_cast<const uint8_t *>(plaintext.data()),
static_cast<int32_t>(plaintext.size()));
ciphertext_len = out_len;
if (!ok_encryptupdate)
throw Exception::createDeprecated(lastErrorString(), ErrorCodes::OPENSSL_ERROR);
const int ok_encryptfinal = EVP_EncryptFinal_ex(encrypt_ctx,
reinterpret_cast<uint8_t *>(ciphertext_and_tag) + out_len,
reinterpret_cast<int32_t *>(&out_len));
ciphertext_len += out_len;
if (!ok_encryptfinal)
throw Exception::createDeprecated(lastErrorString(), ErrorCodes::OPENSSL_ERROR);
/* Get the tag */
const int ok_tag = EVP_CIPHER_CTX_ctrl(encrypt_ctx,
EVP_CTRL_GCM_GET_TAG,
tag_size,
reinterpret_cast<uint8_t *>(ciphertext_and_tag) + plaintext.size());
if (!ok_tag)
throw Exception::createDeprecated(lastErrorString(), ErrorCodes::OPENSSL_ERROR);
}
catch (...)
{
EVP_CIPHER_CTX_free(encrypt_ctx);
throw;
}
EVP_CIPHER_CTX_free(encrypt_ctx);
return ciphertext_len + tag_size;
}
/// Decrypt ciphertext (which ends with the authentication tag) with particular algorithm and put result into plaintext.
/// This function gets key and nonce and decrypts text with their help.
/// If something went wrong (can't init context or can't decrypt data) it throws an exception.
/// It returns length of decrypted text.
size_t decrypt(std::string_view ciphertext, char * plaintext, EncryptionMethod method, const String & key, const String & nonce)
{
int out_len;
int plaintext_len;
EVP_CIPHER_CTX *decrypt_ctx;
if (!(decrypt_ctx = EVP_CIPHER_CTX_new()))
throw Exception::createDeprecated(lastErrorString(), ErrorCodes::OPENSSL_ERROR);
try
{
const int ok_cryptinit = EVP_DecryptInit_ex(decrypt_ctx,
getMethod(method)(),
nullptr, nullptr, nullptr);
if (!ok_cryptinit)
throw Exception::createDeprecated(lastErrorString(), ErrorCodes::OPENSSL_ERROR);
const int ok_cipherctrl = EVP_CIPHER_CTX_ctrl(decrypt_ctx,
EVP_CTRL_GCM_SET_IVLEN,
static_cast<int32_t>(nonce.size()), nullptr);
if (!ok_cipherctrl)
throw Exception::createDeprecated(lastErrorString(), ErrorCodes::OPENSSL_ERROR);
const int ok_nonceinit = EVP_DecryptInit_ex(decrypt_ctx, nullptr, nullptr,
reinterpret_cast<const uint8_t*>(key.data()),
reinterpret_cast<const uint8_t *>(nonce.data()));
if (!ok_nonceinit)
throw Exception::createDeprecated(lastErrorString(), ErrorCodes::OPENSSL_ERROR);
const int ok_decryptudpate = EVP_DecryptUpdate(decrypt_ctx,
reinterpret_cast<uint8_t *>(plaintext),
reinterpret_cast<int32_t *>(&out_len),
reinterpret_cast<const uint8_t *>(ciphertext.data()),
static_cast<int32_t>(ciphertext.size()) - tag_size);
plaintext_len = out_len;
if (!ok_decryptudpate)
throw Exception::createDeprecated(lastErrorString(), ErrorCodes::OPENSSL_ERROR);
const int ok_tag = EVP_CIPHER_CTX_ctrl(decrypt_ctx,
EVP_CTRL_GCM_SET_TAG,
tag_size,
reinterpret_cast<uint8_t *>(const_cast<char *>(ciphertext.data())) + ciphertext.size() - tag_size);
if (!ok_tag)
throw Exception::createDeprecated(lastErrorString(), ErrorCodes::OPENSSL_ERROR);
const int ok_decryptfinal = EVP_DecryptFinal_ex(decrypt_ctx,
reinterpret_cast<uint8_t *>(plaintext) + out_len,
reinterpret_cast<int32_t *>(&out_len));
if (!ok_decryptfinal)
throw Exception::createDeprecated(lastErrorString(), ErrorCodes::OPENSSL_ERROR);
}
catch (...)
{
EVP_CIPHER_CTX_free(decrypt_ctx);
throw;
}
EVP_CIPHER_CTX_free(decrypt_ctx);
return plaintext_len + out_len;
}
#endif
/// Register codec in factory
void registerEncryptionCodec(CompressionCodecFactory & factory, EncryptionMethod Method)

View File

@ -178,9 +178,7 @@ void registerCodecDelta(CompressionCodecFactory & factory);
void registerCodecT64(CompressionCodecFactory & factory);
void registerCodecDoubleDelta(CompressionCodecFactory & factory);
void registerCodecGorilla(CompressionCodecFactory & factory);
#if USE_BORINGSSL
void registerCodecEncrypted(CompressionCodecFactory & factory);
#endif
void registerCodecFPC(CompressionCodecFactory & factory);
#endif
@ -197,9 +195,7 @@ CompressionCodecFactory::CompressionCodecFactory()
registerCodecT64(*this);
registerCodecDoubleDelta(*this);
registerCodecGorilla(*this);
#if USE_BORINGSSL
registerCodecEncrypted(*this);
#endif
registerCodecFPC(*this);
#ifdef ENABLE_QPL_COMPRESSION
registerCodecDeflateQpl(*this);

View File

@ -5,10 +5,17 @@
#include <base/range.h>
#include <boost/blank.hpp>
#include <unordered_map>
#include <boost/program_options/options_description.hpp>
namespace boost::program_options
{
class options_description;
}
namespace DB
{
class ReadBuffer;
class WriteBuffer;
@ -19,7 +26,6 @@ enum class SettingsWriteFormat
DEFAULT = STRINGS_WITH_FLAGS,
};
/** Template class to define collections of settings.
* Example of usage:
*
@ -119,6 +125,18 @@ public:
std::conditional_t<Traits::allow_custom_settings, const CustomSettingMap::mapped_type*, boost::blank> custom_setting;
};
/// Adds program options to set the settings from a command line.
/// (Don't forget to call notify() on the `variables_map` after parsing it!)
void addProgramOptions(boost::program_options::options_description & options);
/// Adds program options, as multitokens, to set the settings from a command line.
/// Allows to set one setting multiple times, the last value will be used.
/// (Don't forget to call notify() on the `variables_map` after parsing it!)
void addProgramOptionsAsMultitokens(boost::program_options::options_description & options);
void addProgramOption(boost::program_options::options_description & options, const SettingFieldRef & field);
void addProgramOptionAsMultitoken(boost::program_options::options_description & options, const SettingFieldRef & field);
enum SkipFlags
{
SKIP_NONE = 0,
@ -518,6 +536,38 @@ String BaseSettings<TTraits>::toString() const
return res;
}
template <typename TTraits>
void BaseSettings<TTraits>::addProgramOptions(boost::program_options::options_description & options)
{
for (const auto & field : all())
addProgramOption(options, field);
}
template <typename TTraits>
void BaseSettings<TTraits>::addProgramOptionsAsMultitokens(boost::program_options::options_description & options)
{
for (const auto & field : all())
addProgramOptionAsMultitoken(options, field);
}
template <typename TTraits>
void BaseSettings<TTraits>::addProgramOption(boost::program_options::options_description & options, const SettingFieldRef & field)
{
const std::string_view name = field.getName();
auto on_program_option = boost::function1<void, const std::string &>([this, name](const std::string & value) { set(name, value); });
options.add(boost::shared_ptr<boost::program_options::option_description>(new boost::program_options::option_description(
name.data(), boost::program_options::value<std::string>()->composing()->notifier(on_program_option), field.getDescription())));
}
template <typename TTraits>
void BaseSettings<TTraits>::addProgramOptionAsMultitoken(boost::program_options::options_description & options, const SettingFieldRef & field)
{
const std::string_view name = field.getName();
auto on_program_option = boost::function1<void, const Strings &>([this, name](const Strings & values) { set(name, values.back()); });
options.add(boost::shared_ptr<boost::program_options::option_description>(new boost::program_options::option_description(
name.data(), boost::program_options::value<Strings>()->multitoken()->composing()->notifier(on_program_option), field.getDescription())));
}
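
A standalone sketch of what addProgramOptionAsMultitoken sets up for one setting: a composing multitoken option whose notifier receives every occurrence and keeps only the last value, so repeating a setting on the command line is accepted and the final occurrence wins. The option name max_threads is only an illustration:

#include <boost/program_options.hpp>
#include <iostream>
#include <string>
#include <vector>

namespace po = boost::program_options;

int main(int argc, char ** argv)
{
    std::string max_threads;   /// stands in for set(name, values.back())

    po::options_description options("settings");
    options.add_options()(
        "max_threads",
        po::value<std::vector<std::string>>()->multitoken()->composing()->notifier(
            [&](const std::vector<std::string> & values) { max_threads = values.back(); }),
        "example setting registered as a multitoken option");

    po::variables_map vm;
    po::store(po::parse_command_line(argc, argv, options), vm);
    po::notify(vm);   /// notifiers run here, after all occurrences were collected

    if (!max_threads.empty())
        std::cout << "max_threads = " << max_threads << '\n';
}

Running the sketch with --max_threads 4 --max_threads 8 prints max_threads = 8, which is the last-value-wins behavior the comment above promises for multitoken settings.
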
template <typename TTraits>
bool operator==(const BaseSettings<TTraits> & left, const BaseSettings<TTraits> & right)
{

View File

@ -6,7 +6,6 @@
#include <Columns/ColumnMap.h>
#include <Common/typeid_cast.h>
#include <cstring>
#include <boost/program_options/options_description.hpp>
namespace DB
{
@ -82,38 +81,6 @@ void Settings::dumpToMapColumn(IColumn * column, bool changed_only)
offsets.push_back(offsets.back() + size);
}
void Settings::addProgramOptions(boost::program_options::options_description & options)
{
for (const auto & field : all())
{
addProgramOption(options, field);
}
}
void Settings::addProgramOptionsAsMultitokens(boost::program_options::options_description & options)
{
for (const auto & field : all())
{
addProgramOptionAsMultitoken(options, field);
}
}
void Settings::addProgramOption(boost::program_options::options_description & options, const SettingFieldRef & field)
{
const std::string_view name = field.getName();
auto on_program_option = boost::function1<void, const std::string &>([this, name](const std::string & value) { set(name, value); });
options.add(boost::shared_ptr<boost::program_options::option_description>(new boost::program_options::option_description(
name.data(), boost::program_options::value<std::string>()->composing()->notifier(on_program_option), field.getDescription())));
}
void Settings::addProgramOptionAsMultitoken(boost::program_options::options_description & options, const SettingFieldRef & field)
{
const std::string_view name = field.getName();
auto on_program_option = boost::function1<void, const Strings &>([this, name](const Strings & values) { set(name, values.back()); });
options.add(boost::shared_ptr<boost::program_options::option_description>(new boost::program_options::option_description(
name.data(), boost::program_options::value<Strings>()->multitoken()->composing()->notifier(on_program_option), field.getDescription())));
}
void Settings::checkNoSettingNamesAtTopLevel(const Poco::Util::AbstractConfiguration & config, const String & config_path)
{
if (config.getBool("skip_check_for_incorrect_settings", false))

View File

@ -13,12 +13,6 @@ namespace Poco::Util
class AbstractConfiguration;
}
namespace boost::program_options
{
class options_description;
}
namespace DB
{
class IColumn;
@ -96,6 +90,7 @@ class IColumn;
M(Bool, s3_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in s3 engine tables", 0) \
M(Bool, s3_check_objects_after_upload, false, "Check each uploaded object to s3 with head request to be sure that upload was successful", 0) \
M(Bool, s3_allow_parallel_part_upload, true, "Use multiple threads for s3 multipart upload. It may lead to slightly higher memory usage", 0) \
M(Bool, s3_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \
M(Bool, enable_s3_requests_logging, false, "Enable very explicit logging of S3 requests. Makes sense for debug only.", 0) \
M(UInt64, hdfs_replication, 0, "The actual number of replications can be specified when the hdfs file is created.", 0) \
M(Bool, hdfs_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables", 0) \
@ -894,6 +889,7 @@ class IColumn;
M(Bool, input_format_bson_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip fields with unsupported types while schema inference for format BSON.", 0) \
\
M(Bool, regexp_dict_allow_other_sources, false, "Allow regexp_tree dictionary to use sources other than yaml source.", 0) \
M(Bool, regexp_dict_allow_hyperscan, false, "Allow regexp_tree dictionary using Hyperscan library.", 0) \
// End of FORMAT_FACTORY_SETTINGS
// Please add settings non-related to formats into the COMMON_SETTINGS above.
@ -926,25 +922,12 @@ struct Settings : public BaseSettings<SettingsTraits>, public IHints<2, Settings
/// Dumps profile events to column of type Map(String, String)
void dumpToMapColumn(IColumn * column, bool changed_only = true);
/// Adds program options to set the settings from a command line.
/// (Don't forget to call notify() on the `variables_map` after parsing it!)
void addProgramOptions(boost::program_options::options_description & options);
/// Adds program options as to set the settings from a command line.
/// Allows to set one setting multiple times, the last value will be used.
/// (Don't forget to call notify() on the `variables_map` after parsing it!)
void addProgramOptionsAsMultitokens(boost::program_options::options_description & options);
/// Check that there is no user-level settings at the top level in config.
/// This is a common source of mistake (user don't know where to write user-level setting).
static void checkNoSettingNamesAtTopLevel(const Poco::Util::AbstractConfiguration & config, const String & config_path);
std::vector<String> getAllRegisteredNames() const override;
void addProgramOption(boost::program_options::options_description & options, const SettingFieldRef & field);
void addProgramOptionAsMultitoken(boost::program_options::options_description & options, const SettingFieldRef & field);
void set(std::string_view name, const Field & value) override;
void setDefaultValue(const String & name) { resetToDefault(name); }

View File

@ -1,3 +1,4 @@
#include <exception>
#include <optional>
#include <string_view>
@ -88,8 +89,15 @@ struct RegExpTreeDictionary::RegexTreeNode
UInt64 parent_id;
std::string regex;
re2_st::RE2 searcher;
RegexTreeNode(UInt64 id_, UInt64 parent_id_, const String & regex_, const re2_st::RE2::Options & regexp_options):
id(id_), parent_id(parent_id_), regex(regex_), searcher(regex_, regexp_options) {}
bool match(const char * haystack, size_t size) const
{
return searcher.Match(haystack, 0, size, re2_st::RE2::Anchor::UNANCHORED, nullptr, 0);
}
struct AttributeValue
{
Field field;
@ -118,7 +126,7 @@ std::vector<StringPiece> createStringPieces(const String & value, int num_captur
}
int ref_num = value[i+1]-'0';
if (ref_num >= num_captures)
LOG_DEBUG(logger,
LOG_TRACE(logger,
"Reference Id {} in set string is invalid, the regexp {} only has {} capturing groups",
ref_num, regex, num_captures-1);
result.push_back(StringPiece(ref_num));
@ -137,13 +145,60 @@ std::vector<StringPiece> createStringPieces(const String & value, int num_captur
void RegExpTreeDictionary::calculateBytesAllocated()
{
for (const String & regex : regexps)
for (const String & regex : simple_regexps)
bytes_allocated += regex.size();
bytes_allocated += sizeof(UInt64) * regexp_ids.size();
bytes_allocated += (sizeof(RegexTreeNode) + sizeof(UInt64)) * regex_nodes.size();
bytes_allocated += 2 * sizeof(UInt64) * topology_order.size();
}
namespace
{
/// Hyperscan is not good at processing regexes containing bounded repeats like {0, 200}:
/// compilation becomes slow or fails outright. So we detect these heavy regular expressions and
/// process them with re2 instead.
struct RegexChecker
{
re2_st::RE2 searcher;
RegexChecker() : searcher(R"(\{([\d]+),([\d]+)\})") {}
static bool isFigureLargerThanFifty(const String & str)
try
{
auto number = std::stoi(str);
return number > 50;
}
catch (std::exception &)
{
return false;
}
[[maybe_unused]]
bool isSimpleRegex(const String & regex) const
{
re2_st::StringPiece haystack(regex.data(), regex.size());
re2_st::StringPiece matches[10];
size_t start_pos = 0;
while (start_pos < regex.size())
{
if (searcher.Match(haystack, start_pos, regex.size(), re2_st::RE2::Anchor::UNANCHORED, matches, 10))
{
const auto & match = matches[0];
start_pos += match.length();
const auto & match1 = matches[1];
const auto & match2 = matches[2];
if (isFigureLargerThanFifty(match1.ToString()) || isFigureLargerThanFifty(match2.ToString()))
return false;
}
else
break;
}
return true;
}
};
}
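
A simplified stand-in for RegexChecker above, using std::regex instead of re2_st so it is self-contained: a bounded repeat {m,n} with either bound above 50 marks the pattern as too heavy for vectorscan, so it would fall back to RE2:

#include <iostream>
#include <regex>
#include <string>

/// Returns false for patterns containing a bounded quantifier {m,n} with a bound above 50,
/// mirroring RegexChecker::isSimpleRegex above (simplified: std::regex instead of re2_st).
bool isSimpleRegex(const std::string & pattern)
{
    static const std::regex quantifier(R"(\{(\d+),(\d+)\})");
    for (auto it = std::sregex_iterator(pattern.begin(), pattern.end(), quantifier); it != std::sregex_iterator(); ++it)
    {
        if (std::stoi((*it)[1].str()) > 50 || std::stoi((*it)[2].str()) > 50)
            return false;
    }
    return true;
}

int main()
{
    std::cout << isSimpleRegex("ClickHouse/[0-9]{1,3}") << '\n';   /// 1: simple enough for vectorscan
    std::cout << isSimpleRegex("a.{0,200}b") << '\n';              /// 0: routed to RE2
}
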
void RegExpTreeDictionary::initRegexNodes(Block & block)
{
auto id_column = block.getByName(kId).column;
@ -152,6 +207,8 @@ void RegExpTreeDictionary::initRegexNodes(Block & block)
auto keys_column = block.getByName(kKeys).column;
auto values_column = block.getByName(kValues).column;
RegexChecker checker;
size_t size = block.rows();
for (size_t i = 0; i < size; i++)
{
@ -165,12 +222,10 @@ void RegExpTreeDictionary::initRegexNodes(Block & block)
if (id == 0)
throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "There are invalid id {}", id);
regexps.push_back(regex);
regexp_ids.push_back(id);
re2_st::RE2::Options regexp_options;
regexp_options.set_log_errors(false);
RegexTreeNodePtr node = std::make_unique<RegexTreeNode>(id, parent_id, regex, regexp_options);
RegexTreeNodePtr node = std::make_shared<RegexTreeNode>(id, parent_id, regex, regexp_options);
int num_captures = std::min(node->searcher.NumberOfCapturingGroups() + 1, 10);
@ -196,7 +251,16 @@ void RegExpTreeDictionary::initRegexNodes(Block & block)
}
}
}
regex_nodes.emplace(id, std::move(node));
regex_nodes.emplace(id, node);
#if USE_VECTORSCAN
if (use_vectorscan && checker.isSimpleRegex(regex))
{
simple_regexps.push_back(regex);
regexp_ids.push_back(id);
}
else
#endif
complex_regexp_nodes.push_back(node);
}
}
@ -226,7 +290,7 @@ void RegExpTreeDictionary::initTopologyOrder(UInt64 node_idx, std::set<UInt64> &
visited.insert(node_idx);
for (UInt64 child_idx : regex_nodes[node_idx]->children)
if (visited.contains(child_idx))
throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Invalid Regex tree");
throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Invalid Regex tree. The input tree is cyclical");
else
initTopologyOrder(child_idx, visited, topology_id);
topology_order[node_idx] = topology_id++;
@ -245,12 +309,18 @@ void RegExpTreeDictionary::loadData()
initRegexNodes(block);
}
initGraph();
if (regexps.empty())
if (simple_regexps.empty() && complex_regexp_nodes.empty())
throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "There are no available regular expression. Please check your config");
LOG_INFO(logger, "There are {} simple regexps and {} complex regexps", simple_regexps.size(), complex_regexp_nodes.size());
/// If none of the regexps can be handled by hyperscan, turn this flag off to avoid exceptions.
if (simple_regexps.empty())
use_vectorscan = false;
if (!use_vectorscan)
return;
#if USE_VECTORSCAN
try
{
std::vector<std::string_view> regexps_views(regexps.begin(), regexps.end());
std::vector<std::string_view> regexps_views(simple_regexps.begin(), simple_regexps.end());
hyperscan_regex = MultiRegexps::getOrSet<true, false>(regexps_views, std::nullopt);
hyperscan_regex->get();
}
@ -258,7 +328,6 @@ void RegExpTreeDictionary::loadData()
{
/// Some compile errors will be thrown as LOGICAL ERROR and cause a crash, e.g. an empty expression or expressions that are too large.
/// We catch the error here and rethrow again.
/// TODO: fallback to other engine, like re2, when exceptions occur.
throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Error occurs when compiling regular expressions, reason: {}", e.message());
}
#endif
@ -270,8 +339,17 @@ void RegExpTreeDictionary::loadData()
}
RegExpTreeDictionary::RegExpTreeDictionary(
const StorageID & id_, const DictionaryStructure & structure_, DictionarySourcePtr source_ptr_, Configuration configuration_)
: IDictionary(id_), structure(structure_), source_ptr(source_ptr_), configuration(configuration_), logger(&Poco::Logger::get("RegExpTreeDictionary"))
const StorageID & id_,
const DictionaryStructure & structure_,
DictionarySourcePtr source_ptr_,
Configuration configuration_,
bool use_vectorscan_)
: IDictionary(id_),
structure(structure_),
source_ptr(source_ptr_),
configuration(configuration_),
use_vectorscan(use_vectorscan_),
logger(&Poco::Logger::get("RegExpTreeDictionary"))
{
if (auto * ch_source = typeid_cast<ClickHouseDictionarySource *>(source_ptr.get()))
{
@ -289,12 +367,15 @@ RegExpTreeDictionary::RegExpTreeDictionary(
calculateBytesAllocated();
}
String processBackRefs(const String & data, const re2_st::RE2 & searcher, const std::vector<StringPiece> & pieces)
std::pair<String, bool> processBackRefs(const String & data, const re2_st::RE2 & searcher, const std::vector<StringPiece> & pieces)
{
re2_st::StringPiece haystack(data.data(), data.size());
re2_st::StringPiece matches[10];
String result;
searcher.Match(haystack, 0, data.size(), re2_st::RE2::Anchor::UNANCHORED, matches, 10);
/// if the pattern is a single '$1' but fails to match, we would use the default value.
if (pieces.size() == 1 && pieces[0].ref_num >= 0 && pieces[0].ref_num < 10 && matches[pieces[0].ref_num].empty())
return std::make_pair(result, true);
for (const auto & item : pieces)
{
if (item.ref_num >= 0 && item.ref_num < 10)
@ -302,7 +383,7 @@ String processBackRefs(const String & data, const re2_st::RE2 & searcher, const
else
result += item.literal;
}
return result;
return {result, false};
}
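
A sketch of the rule described in the comment above: when the value template is a lone back-reference and the group captured nothing, the second member of the returned pair tells the caller to fall back to the attribute's default instead of using an empty string. Simplified here with std::regex and a single group; the real code uses re2_st and mixes literals with up to ten capture references:

#include <iostream>
#include <optional>
#include <regex>
#include <string>

/// Substitute a lone "$1" by the first capturing group of `pattern` matched against `data`.
/// Returns nullopt when the group captured nothing, i.e. "use the attribute's default value".
std::optional<std::string> substituteFirstGroup(const std::string & data, const std::regex & pattern)
{
    std::smatch matches;
    if (!std::regex_search(data, matches, pattern) || matches[1].str().empty())
        return std::nullopt;
    return matches[1].str();
}

int main()
{
    std::regex browser(R"(Firefox/(\d+))");
    std::cout << substituteFirstGroup("Mozilla Firefox/109.0", browser).value_or("default") << '\n';   /// 109
    std::cout << substituteFirstGroup("Some other agent", browser).value_or("default") << '\n';        /// default
}
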
// walk towards root and collect attributes.
@ -312,7 +393,9 @@ bool RegExpTreeDictionary::setAttributes(
std::unordered_map<String, Field> & attributes_to_set,
const String & data,
std::unordered_set<UInt64> & visited_nodes,
const std::unordered_map<String, const DictionaryAttribute &> & attributes) const
const std::unordered_map<String, const DictionaryAttribute &> & attributes,
const std::unordered_map<String, ColumnPtr> & defaults,
size_t key_index) const
{
if (visited_nodes.contains(id))
@ -325,8 +408,14 @@ bool RegExpTreeDictionary::setAttributes(
continue;
if (value.containsBackRefs())
{
String updated_str = processBackRefs(data, regex_nodes.at(id)->searcher, value.pieces);
attributes_to_set[name] = parseStringToField(updated_str, attributes.at(name).type);
auto [updated_str, use_default] = processBackRefs(data, regex_nodes.at(id)->searcher, value.pieces);
if (use_default)
{
DefaultValueProvider default_value(attributes.at(name).null_value, defaults.at(name));
attributes_to_set[name] = default_value.getDefaultValue(key_index);
}
else
attributes_to_set[name] = parseStringToField(updated_str, attributes.at(name).type);
}
else
attributes_to_set[name] = value.field;
@ -334,18 +423,17 @@ bool RegExpTreeDictionary::setAttributes(
auto parent_id = regex_nodes.at(id)->parent_id;
if (parent_id > 0)
setAttributes(parent_id, attributes_to_set, data, visited_nodes, attributes);
setAttributes(parent_id, attributes_to_set, data, visited_nodes, attributes, defaults, key_index);
// if all the attributes have been set, the walk can be stopped.
/// if all the attributes have been set, the walk can be stopped.
return attributes_to_set.size() == attributes.size();
}
#if USE_VECTORSCAN
namespace
{
struct MatchContext
{
std::unordered_set<UInt64> matched_idx_set;
std::set<UInt64> matched_idx_set;
std::vector<std::pair<UInt64, UInt64>> matched_idx_sorted_list;
const std::vector<UInt64> & regexp_ids ;
@ -354,14 +442,23 @@ namespace
MatchContext(const std::vector<UInt64> & regexp_ids_, const std::unordered_map<UInt64, UInt64> & topology_order_)
: regexp_ids(regexp_ids_), topology_order(topology_order_) {}
void insert(unsigned int id)
[[maybe_unused]]
void insertIdx(unsigned int idx)
{
UInt64 idx = regexp_ids[id-1];
UInt64 topological_order = topology_order.at(idx);
matched_idx_set.emplace(idx);
matched_idx_sorted_list.push_back(std::make_pair(topological_order, idx));
UInt64 node_id = regexp_ids[idx-1];
UInt64 topological_order = topology_order.at(node_id);
matched_idx_set.emplace(node_id);
matched_idx_sorted_list.push_back(std::make_pair(topological_order, node_id));
}
void insertNodeID(UInt64 id)
{
UInt64 topological_order = topology_order.at(id);
matched_idx_set.emplace(id);
matched_idx_sorted_list.push_back(std::make_pair(topological_order, id));
}
/// Sort by topological order, which indicates the matching priorities.
void sort()
{
std::sort(matched_idx_sorted_list.begin(), matched_idx_sorted_list.end());
@ -373,24 +470,28 @@ namespace
}
};
}
#endif // USE_VECTORSCAN
std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::matchSearchAllIndices(
[[maybe_unused]] const ColumnString::Chars & keys_data,
[[maybe_unused]] const ColumnString::Offsets & keys_offsets,
[[maybe_unused]] const std::unordered_map<String, const DictionaryAttribute &> & attributes,
[[maybe_unused]] const std::unordered_map<String, ColumnPtr> & defaults) const
std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(
const ColumnString::Chars & keys_data,
const ColumnString::Offsets & keys_offsets,
const std::unordered_map<String, const DictionaryAttribute &> & attributes,
const std::unordered_map<String, ColumnPtr> & defaults) const
{
#if USE_VECTORSCAN
hs_scratch_t * scratch = nullptr;
hs_error_t err = hs_clone_scratch(hyperscan_regex->get()->getScratch(), &scratch);
if (err != HS_SUCCESS)
if (use_vectorscan)
{
throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not clone scratch space for hyperscan");
hs_error_t err = hs_clone_scratch(hyperscan_regex->get()->getScratch(), &scratch);
if (err != HS_SUCCESS)
{
throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not clone scratch space for hyperscan");
}
}
MultiRegexps::ScratchPtr smart_scratch(scratch);
#endif
std::unordered_map<String, MutableColumnPtr> columns;
@ -402,16 +503,6 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::matchSearchAllIndice
columns[name] = std::move(col_ptr);
}
auto on_match = [](unsigned int id,
unsigned long long /* from */, // NOLINT
unsigned long long /* to */, // NOLINT
unsigned int /* flags */,
void * context) -> int
{
static_cast<MatchContext *>(context)->insert(id);
return 0;
};
UInt64 offset = 0;
for (size_t key_idx = 0; key_idx < keys_offsets.size(); ++key_idx)
{
@ -420,25 +511,46 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::matchSearchAllIndice
MatchContext match_result(regexp_ids, topology_order);
err = hs_scan(
hyperscan_regex->get()->getDB(),
reinterpret_cast<const char *>(keys_data.data()) + offset,
static_cast<unsigned>(length),
0,
smart_scratch.get(),
on_match,
&match_result);
#if USE_VECTORSCAN
if (use_vectorscan)
{
auto on_match = [](unsigned int id,
unsigned long long /* from */, // NOLINT
unsigned long long /* to */, // NOLINT
unsigned int /* flags */,
void * context) -> int
{
static_cast<MatchContext *>(context)->insertIdx(id);
return 0;
};
hs_error_t err = hs_scan(
hyperscan_regex->get()->getDB(),
reinterpret_cast<const char *>(keys_data.data()) + offset,
static_cast<unsigned>(length),
0,
smart_scratch.get(),
on_match,
&match_result);
if (err != HS_SUCCESS)
throw Exception(ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT, "Failed to scan data with vectorscan");
if (err != HS_SUCCESS)
throw Exception(ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT, "Failed to scan data with vectorscan");
}
#endif
for (const auto & node_ptr : complex_regexp_nodes)
{
if (node_ptr->match(reinterpret_cast<const char *>(keys_data.data()) + offset, length))
{
match_result.insertNodeID(node_ptr->id);
}
}
match_result.sort();
// Walk through the regex tree until all attributes are set;
/// Walk through the regex tree until all attributes are set;
std::unordered_map<String, Field> attributes_to_set;
std::unordered_set<UInt64> visited_nodes;
// check if it is a valid id
/// Some node matches but its parents cannot match. In this case we must regard this node as unmatched.
auto is_invalid = [&](UInt64 id)
{
while (id)
@ -459,7 +571,7 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::matchSearchAllIndice
continue;
if (visited_nodes.contains(id))
continue;
if (setAttributes(id, attributes_to_set, str, visited_nodes, attributes))
if (setAttributes(id, attributes_to_set, str, visited_nodes, attributes, defaults, key_idx))
break;
}
@ -468,12 +580,11 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::matchSearchAllIndice
if (attributes_to_set.contains(name))
continue;
/// TODO: default value might be a back-reference, that is useful in lib ua-core
DefaultValueProvider default_value(attr.null_value, defaults.at(name));
columns[name]->insert(default_value.getDefaultValue(key_idx));
}
// insert to columns
/// insert to columns
for (const auto & [name, value] : attributes_to_set)
columns[name]->insert(value);
@ -485,9 +596,6 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::matchSearchAllIndice
result.emplace(name, std::move(mutable_ptr));
return result;
#else
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Multi search all indices is not implemented when USE_VECTORSCAN is off");
#endif // USE_VECTORSCAN
}
Columns RegExpTreeDictionary::getColumns(
@ -516,7 +624,7 @@ Columns RegExpTreeDictionary::getColumns(
/// calculate matches
const ColumnString * key_column = typeid_cast<const ColumnString *>(key_columns[0].get());
const auto & columns_map = matchSearchAllIndices(
const auto & columns_map = match(
key_column->getChars(),
key_column->getOffsets(),
attributes,
@ -561,7 +669,7 @@ void registerDictionaryRegExpTree(DictionaryFactory & factory)
"regexp_tree dictionary doesn't accept sources other than yaml source. "
"To active it, please set regexp_dict_allow_other_sources=true");
return std::make_unique<RegExpTreeDictionary>(dict_id, dict_struct, std::move(source_ptr), configuration);
return std::make_unique<RegExpTreeDictionary>(dict_id, dict_struct, std::move(source_ptr), configuration, context->getSettings().regexp_dict_allow_hyperscan);
};
factory.registerLayout("regexp_tree", create_layout, true);

View File

@ -43,7 +43,11 @@ public:
const std::string name = "RegExpTree";
RegExpTreeDictionary(
const StorageID & id_, const DictionaryStructure & structure_, DictionarySourcePtr source_ptr_, Configuration configuration_);
const StorageID & id_,
const DictionaryStructure & structure_,
DictionarySourcePtr source_ptr_,
Configuration configuration_,
bool use_vectorscan_);
std::string getTypeName() const override { return name; }
@ -79,7 +83,7 @@ public:
std::shared_ptr<const IExternalLoadable> clone() const override
{
return std::make_shared<RegExpTreeDictionary>(getDictionaryID(), structure, source_ptr->clone(), configuration);
return std::make_shared<RegExpTreeDictionary>(getDictionaryID(), structure, source_ptr->clone(), configuration, use_vectorscan);
}
ColumnUInt8::Ptr hasKeys(const Columns &, const DataTypes &) const override
@ -122,11 +126,6 @@ private:
mutable std::atomic<size_t> query_count{0};
mutable std::atomic<size_t> found_count{0};
std::vector<std::string> regexps;
std::vector<UInt64> regexp_ids;
Poco::Logger * logger;
void calculateBytesAllocated();
void loadData();
@ -135,7 +134,7 @@ private:
void initTopologyOrder(UInt64 node_idx, std::set<UInt64> & visited, UInt64 & topology_id);
void initGraph();
std::unordered_map<String, ColumnPtr> matchSearchAllIndices(
std::unordered_map<String, ColumnPtr> match(
const ColumnString::Chars & keys_data,
const ColumnString::Offsets & keys_offsets,
const std::unordered_map<String, const DictionaryAttribute &> & attributes,
@ -146,16 +145,26 @@ private:
std::unordered_map<String, Field> & attributes_to_set,
const String & data,
std::unordered_set<UInt64> & visited_nodes,
const std::unordered_map<String, const DictionaryAttribute &> & attributes) const;
const std::unordered_map<String, const DictionaryAttribute &> & attributes,
const std::unordered_map<String, ColumnPtr> & defaults,
size_t key_index) const;
struct RegexTreeNode;
using RegexTreeNodePtr = std::unique_ptr<RegexTreeNode>;
using RegexTreeNodePtr = std::shared_ptr<RegexTreeNode>;
bool use_vectorscan;
std::vector<std::string> simple_regexps;
std::vector<UInt64> regexp_ids;
std::vector<RegexTreeNodePtr> complex_regexp_nodes;
std::map<UInt64, RegexTreeNodePtr> regex_nodes;
std::unordered_map<UInt64, UInt64> topology_order;
#if USE_VECTORSCAN
MultiRegexps::DeferredConstructedRegexpsPtr hyperscan_regex;
#endif
Poco::Logger * logger;
};
}

View File

@ -1,6 +1,7 @@
#if defined(OS_LINUX)
#include "IOUringReader.h"
#if USE_LIBURING
#include <base/errnoToString.h>
#include <Common/assert_cast.h>
#include <Common/Exception.h>

View File

@ -1,5 +1,8 @@
#pragma once
#if defined(OS_LINUX)
#include "config.h"
#if USE_LIBURING
#include <Common/ThreadPool.h>
#include <IO/AsynchronousReader.h>

View File

@ -7,7 +7,7 @@
#include <Disks/IO/ThreadPoolReader.h>
#include <IO/SynchronousReader.h>
#include <Common/ProfileEvents.h>
#include "config.h"
namespace ProfileEvents
{
@ -84,7 +84,7 @@ std::unique_ptr<ReadBufferFromFileBase> createReadBufferFromFileBase(
}
else if (settings.local_fs_method == LocalFSReadMethod::io_uring)
{
#if defined(OS_LINUX)
#if USE_LIBURING
static std::shared_ptr<IOUringReader> reader = std::make_shared<IOUringReader>(512);
if (!reader->isSupported())
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "io_uring is not supported by this system");

View File

@ -0,0 +1,41 @@
#include <string>
#include <vector>
#include <Common/logger_useful.h>
#include "config.h"
#if USE_AZURE_BLOB_STORAGE
#include <azure/storage/blobs.hpp>
#include <azure/storage/common/internal/xml_wrapper.hpp>
#include <azure/storage/blobs/blob_container_client.hpp>
#include <azure/storage/blobs/blob_options.hpp>
#include <gtest/gtest.h>
TEST(AzureXMLWrapper, TestLeak)
{
std::string str = "<hello>world</hello>";
Azure::Storage::_internal::XmlReader reader(str.c_str(), str.length());
Azure::Storage::_internal::XmlReader reader2(std::move(reader));
Azure::Storage::_internal::XmlReader reader3 = std::move(reader2);
reader3.Read();
}
TEST(AzureBlobContainerClient, CurlMemoryLeak)
{
using Azure::Storage::Blobs::BlobContainerClient;
using Azure::Storage::Blobs::BlobClientOptions;
static constexpr auto unavailable_url = "http://unavailable:19999/bucket";
static constexpr auto container = "container";
BlobClientOptions options;
options.Retry.MaxRetries = 0;
auto client = std::make_unique<BlobContainerClient>(BlobContainerClient::CreateFromConnectionString(unavailable_url, container, options));
EXPECT_THROW({ client->ListBlobs(); }, Azure::Core::Http::TransportException);
}
#endif

View File

@ -1,25 +0,0 @@
#include <string>
#include <vector>
#include <Common/logger_useful.h>
#include "config.h"
#if USE_AZURE_BLOB_STORAGE
#include <azure/storage/blobs.hpp>
#include <azure/storage/common/internal/xml_wrapper.hpp>
#include <gtest/gtest.h>
TEST(AzureXMLWrapper, TestLeak)
{
std::string str = "<hello>world</hello>";
Azure::Storage::_internal::XmlReader reader(str.c_str(), str.length());
Azure::Storage::_internal::XmlReader reader2(std::move(reader));
Azure::Storage::_internal::XmlReader reader3 = std::move(reader2);
reader3.Read();
}
#endif

View File

@ -982,7 +982,7 @@ struct JSONExtractTree
return false;
}
assert_cast<ColumnDecimal<DecimalType> &>(dest).insert(value);
assert_cast<ColumnDecimal<DecimalType> &>(dest).insertValue(value);
return true;
}

View File

@ -209,9 +209,13 @@ struct AggregationMethodOneNumber
// Insert the key from the hash table into columns.
static void insertKeyIntoColumns(const Key & key, std::vector<IColumn *> & key_columns, const Sizes & /*key_sizes*/)
{
static_assert(sizeof(FieldType) <= sizeof(Key));
const auto * key_holder = reinterpret_cast<const char *>(&key);
auto * column = static_cast<ColumnVectorHelper *>(key_columns[0]);
column->insertRawData<sizeof(FieldType)>(key_holder);
if constexpr (sizeof(FieldType) < sizeof(Key) && std::endian::native == std::endian::big)
column->insertRawData<sizeof(FieldType)>(key_holder + (sizeof(Key) - sizeof(FieldType)));
else
column->insertRawData<sizeof(FieldType)>(key_holder);
}
};
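A minimal sketch (Python, illustrative only, not ClickHouse code) of what the big-endian branch above corrects: when a narrow field type is packed into a wider hash-table key, its meaningful bytes start at offset 0 on little-endian machines but at the end of the key on big-endian ones, so the raw-data pointer has to be shifted by sizeof(Key) - sizeof(FieldType).

import struct

# Locate the field's bytes inside a wider 8-byte key for either byte order.
def field_bytes(key_value: int, field_size: int, big_endian: bool) -> bytes:
    key_bytes = struct.pack(">Q" if big_endian else "<Q", key_value)
    offset = len(key_bytes) - field_size if big_endian else 0
    return key_bytes[offset:offset + field_size]

# The same 4-byte value is recovered in both layouts only with the offset applied.
assert field_bytes(0x12345678, 4, big_endian=False) == struct.pack("<I", 0x12345678)
assert field_bytes(0x12345678, 4, big_endian=True) == struct.pack(">I", 0x12345678)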

View File

@ -94,6 +94,8 @@ struct BloomFilterHash
else if (which.isFloat32()) return build_hash_column(getNumberTypeHash<Float64, Float64>(field));
else if (which.isFloat64()) return build_hash_column(getNumberTypeHash<Float64, Float64>(field));
else if (which.isUUID()) return build_hash_column(getNumberTypeHash<UUID, UUID>(field));
else if (which.isIPv4()) return build_hash_column(getNumberTypeHash<IPv4, IPv4>(field));
else if (which.isIPv6()) return build_hash_column(getNumberTypeHash<IPv6, IPv6>(field));
else if (which.isString()) return build_hash_column(getStringTypeHash(field));
else if (which.isFixedString()) return build_hash_column(getFixedStringTypeHash(field, data_type));
else throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected type {} of bloom filter index.", data_type->getName());
@ -156,6 +158,8 @@ struct BloomFilterHash
else if (which.isFloat32()) getNumberTypeHash<Float32, is_first>(column, vec, pos);
else if (which.isFloat64()) getNumberTypeHash<Float64, is_first>(column, vec, pos);
else if (which.isUUID()) getNumberTypeHash<UUID, is_first>(column, vec, pos);
else if (which.isIPv4()) getNumberTypeHash<IPv4, is_first>(column, vec, pos);
else if (which.isIPv6()) getNumberTypeHash<IPv6, is_first>(column, vec, pos);
else if (which.isString()) getStringTypeHash<is_first>(column, vec, pos);
else if (which.isFixedString()) getStringTypeHash<is_first>(column, vec, pos);
else throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected type {} of bloom filter index.", data_type->getName());

View File

@ -313,14 +313,6 @@ Pipe && QueryCache::Reader::getPipe()
return std::move(pipe);
}
QueryCache::QueryCache(size_t max_cache_size_in_bytes_, size_t max_cache_entries_, size_t max_cache_entry_size_in_bytes_, size_t max_cache_entry_size_in_rows_)
: max_cache_size_in_bytes(max_cache_size_in_bytes_)
, max_cache_entries(max_cache_entries_)
, max_cache_entry_size_in_bytes(max_cache_entry_size_in_bytes_)
, max_cache_entry_size_in_rows(max_cache_entry_size_in_rows_)
{
}
QueryCache::Reader QueryCache::createReader(const Key & key)
{
std::lock_guard lock(mutex);
@ -343,14 +335,22 @@ void QueryCache::reset()
size_t QueryCache::recordQueryRun(const Key & key)
{
static constexpr size_t TIMES_EXECUTED_MAX_SIZE = 10'000;
std::lock_guard times_executed_lock(mutex);
std::lock_guard lock(mutex);
size_t times = ++times_executed[key];
// Regularly drop times_executed to avoid DOS-by-unlimited-growth.
static constexpr size_t TIMES_EXECUTED_MAX_SIZE = 10'000;
if (times_executed.size() > TIMES_EXECUTED_MAX_SIZE)
times_executed.clear();
return times;
}
void QueryCache::updateConfiguration(const Poco::Util::AbstractConfiguration & config)
{
std::lock_guard lock(mutex);
max_cache_size_in_bytes = config.getUInt64("query_cache.size", 1_GiB);
max_cache_entries = config.getUInt64("query_cache.max_entries", 1024);
max_cache_entry_size_in_bytes = config.getUInt64("query_cache.max_entry_size", 1_MiB);
max_cache_entry_size_in_rows = config.getUInt64("query_cache.max_entry_rows", 30'000'000);
}
}
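A minimal sketch (Python, hypothetical names) of the cap-and-clear counter pattern used for times_executed above: count how often each query key ran, but wipe the whole map once it exceeds a fixed size, so a stream of distinct queries cannot grow it without bound.

import threading
from collections import defaultdict

class RunCounter:
    TIMES_EXECUTED_MAX_SIZE = 10_000

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._times = defaultdict(int)

    def record(self, key: str) -> int:
        # Mirrors recordQueryRun: increment under the lock, then apply the size cap.
        with self._lock:
            self._times[key] += 1
            times = self._times[key]
            if len(self._times) > self.TIMES_EXECUTED_MAX_SIZE:
                self._times.clear()  # coarse protection against unbounded growth, not exact accounting
            return times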

View File

@ -2,6 +2,7 @@
#include <Core/Block.h>
#include <Parsers/IAST_fwd.h>
#include <Poco/Util/LayeredConfiguration.h>
#include <Processors/Chunk.h>
#include <QueryPipeline/Pipe.h>
@ -132,7 +133,7 @@ public:
friend class QueryCache; /// for createReader()
};
QueryCache(size_t max_cache_size_in_bytes_, size_t max_cache_entries_, size_t max_cache_entry_size_in_bytes_, size_t max_cache_entry_size_in_rows_);
void updateConfiguration(const Poco::Util::AbstractConfiguration & config);
Reader createReader(const Key & key);
Writer createWriter(const Key & key, std::chrono::milliseconds min_query_runtime);
@ -154,11 +155,13 @@ private:
Cache cache TSA_GUARDED_BY(mutex);
TimesExecuted times_executed TSA_GUARDED_BY(mutex);
size_t cache_size_in_bytes TSA_GUARDED_BY(mutex) = 0; /// updated in each cache insert/delete
const size_t max_cache_size_in_bytes;
const size_t max_cache_entries;
const size_t max_cache_entry_size_in_bytes;
const size_t max_cache_entry_size_in_rows;
/// Cache configuration
size_t max_cache_size_in_bytes TSA_GUARDED_BY(mutex) = 0;
size_t max_cache_entries TSA_GUARDED_BY(mutex) = 0;
size_t max_cache_entry_size_in_bytes TSA_GUARDED_BY(mutex) = 0;
size_t max_cache_entry_size_in_rows TSA_GUARDED_BY(mutex) = 0;
size_t cache_size_in_bytes TSA_GUARDED_BY(mutex) = 0; /// Updated in each cache insert/delete
friend class StorageSystemQueryCache;
};

View File

@ -2041,14 +2041,22 @@ void Context::dropIndexMarkCache() const
shared->index_mark_cache->reset();
}
void Context::setQueryCache(size_t max_size_in_bytes, size_t max_entries, size_t max_entry_size_in_bytes, size_t max_entry_size_in_records)
void Context::setQueryCache(const Poco::Util::AbstractConfiguration & config)
{
auto lock = getLock();
if (shared->query_cache)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Query cache has been already created.");
shared->query_cache = std::make_shared<QueryCache>(max_size_in_bytes, max_entries, max_entry_size_in_bytes, max_entry_size_in_records);
shared->query_cache = std::make_shared<QueryCache>();
shared->query_cache->updateConfiguration(config);
}
void Context::updateQueryCacheConfiguration(const Poco::Util::AbstractConfiguration & config)
{
auto lock = getLock();
if (shared->query_cache)
shared->query_cache->updateConfiguration(config);
}
QueryCachePtr Context::getQueryCache() const

View File

@ -872,7 +872,8 @@ public:
void dropMMappedFileCache() const;
/// Create a cache of query results for statements which run repeatedly.
void setQueryCache(size_t max_size_in_bytes, size_t max_entries, size_t max_entry_size_in_bytes, size_t max_entry_size_in_records);
void setQueryCache(const Poco::Util::AbstractConfiguration & config);
void updateQueryCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
std::shared_ptr<QueryCache> getQueryCache() const;
void dropQueryCache() const;

View File

@ -1801,11 +1801,16 @@ ExpressionActionsPtr ExpressionAnalyzer::getActions(bool add_aliases, bool proje
getActionsDAG(add_aliases, project_result), ExpressionActionsSettings::fromContext(getContext(), compile_expressions));
}
ExpressionActionsPtr ExpressionAnalyzer::getConstActions(const ColumnsWithTypeAndName & constant_inputs)
ActionsDAGPtr ExpressionAnalyzer::getConstActionsDAG(const ColumnsWithTypeAndName & constant_inputs)
{
auto actions = std::make_shared<ActionsDAG>(constant_inputs);
getRootActions(query, true /* no_makeset_for_subqueries */, actions, true /* only_consts */);
return actions;
}
ExpressionActionsPtr ExpressionAnalyzer::getConstActions(const ColumnsWithTypeAndName & constant_inputs)
{
auto actions = getConstActionsDAG(constant_inputs);
return std::make_shared<ExpressionActions>(actions, ExpressionActionsSettings::fromContext(getContext()));
}

View File

@ -119,8 +119,9 @@ public:
ActionsDAGPtr getActionsDAG(bool add_aliases, bool project_result = true);
ExpressionActionsPtr getActions(bool add_aliases, bool project_result = true, CompileExpressions compile_expressions = CompileExpressions::no);
/// Actions that can be performed on an empty block: adding constants and applying functions that depend only on constants.
/// Get actions to evaluate a constant expression. The function adds constants and applies functions that depend only on constants.
/// Does not execute subqueries.
ActionsDAGPtr getConstActionsDAG(const ColumnsWithTypeAndName & constant_inputs = {});
ExpressionActionsPtr getConstActions(const ColumnsWithTypeAndName & constant_inputs = {});
/** Sets that require a subquery to be created.

View File

@ -70,7 +70,7 @@ std::pair<Field, std::shared_ptr<const IDataType>> evaluateConstantExpression(co
if (context->getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY && context->getSettingsRef().normalize_function_names)
FunctionNameNormalizer().visit(ast.get());
String name = ast->getColumnName();
String result_name = ast->getColumnName();
auto syntax_result = TreeRewriter(context).analyze(ast, source_columns);
/// AST potentially could be transformed to literal during TreeRewriter analyze.
@ -78,33 +78,37 @@ std::pair<Field, std::shared_ptr<const IDataType>> evaluateConstantExpression(co
if (ASTLiteral * literal = ast->as<ASTLiteral>())
return getFieldAndDataTypeFromLiteral(literal);
ExpressionActionsPtr expr_for_constant_folding = ExpressionAnalyzer(ast, syntax_result, context).getConstActions();
auto actions = ExpressionAnalyzer(ast, syntax_result, context).getConstActionsDAG();
/// There must be at least one column in the block so that it knows the number of rows.
Block block_with_constants{{ ColumnConst::create(ColumnUInt8::create(1, 0), 1), std::make_shared<DataTypeUInt8>(), "_dummy" }};
ColumnPtr result_column;
DataTypePtr result_type;
for (const auto & action_node : actions->getOutputs())
{
if ((action_node->result_name == result_name) && action_node->column)
{
result_column = action_node->column;
result_type = action_node->result_type;
break;
}
}
expr_for_constant_folding->execute(block_with_constants);
if (!result_column)
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Element of set in IN, VALUES or LIMIT or aggregate function parameter "
"is not a constant expression (result column not found): {}", result_name);
if (!block_with_constants || block_with_constants.rows() == 0)
if (result_column->empty())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Logical error: empty block after evaluation "
"Logical error: empty result column after evaluation "
"of constant expression for IN, VALUES or LIMIT or aggregate function parameter");
if (!block_with_constants.has(name))
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Element of set in IN, VALUES or LIMIT or aggregate function parameter "
"is not a constant expression (result column not found): {}", name);
const ColumnWithTypeAndName & result = block_with_constants.getByName(name);
const IColumn & result_column = *result.column;
/// Expressions like rand() or now() are not constant
if (!isColumnConst(result_column))
if (!isColumnConst(*result_column))
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Element of set in IN, VALUES or LIMIT or aggregate function parameter "
"is not a constant expression (result column is not const): {}", name);
"is not a constant expression (result column is not const): {}", result_name);
return std::make_pair(result_column[0], result.type);
return std::make_pair((*result_column)[0], result_type);
}
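A minimal sketch (Python, illustrative names) of the lookup that replaces executing the expression on a dummy block above: scan the DAG outputs for the node whose result_name matches and that already carries a constant-folded column.

from dataclasses import dataclass
from typing import Optional, Sequence

@dataclass
class OutputNode:
    result_name: str
    column: Optional[list]  # set only if the node was constant-folded

def find_constant_output(outputs: Sequence[OutputNode], result_name: str) -> list:
    for node in outputs:
        if node.result_name == result_name and node.column is not None:
            return node.column
    raise ValueError(f"not a constant expression (result column not found): {result_name}")

# Usage: find_constant_output([OutputNode("1 + 1", [2])], "1 + 1") returns [2].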

View File

@ -74,6 +74,11 @@ void ASTStorage::formatImpl(const FormatSettings & s, FormatState & state, Forma
}
}
bool ASTStorage::isExtendedStorageDefinition() const
{
return partition_by || primary_key || order_by || sample_by || settings;
}
class ASTColumnsElement : public IAST
{

View File

@ -30,6 +30,8 @@ public:
ASTPtr clone() const override;
void formatImpl(const FormatSettings & s, FormatState & state, FormatStateStacked frame) const override;
bool isExtendedStorageDefinition() const;
};

View File

@ -8,7 +8,7 @@ namespace DB
{
JSONColumnsBlockOutputFormat::JSONColumnsBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_, bool validate_utf8, size_t indent_)
: JSONColumnsBlockOutputFormatBase(out_, header_, format_settings_, validate_utf8), indent(indent_)
: JSONColumnsBlockOutputFormatBase(out_, header_, format_settings_, validate_utf8), indent(indent_), header(header_)
{
names = JSONUtils::makeNamesValidJSONStrings(header_.getNames(), format_settings, validate_utf8);
}
@ -25,6 +25,18 @@ void JSONColumnsBlockOutputFormat::writeColumnStart(size_t column_index)
void JSONColumnsBlockOutputFormat::writeChunkEnd()
{
/// Write empty chunk
if (!written_rows)
{
const auto & columns = header.getColumns();
for (size_t i = 0; i != columns.size(); ++i)
{
writeColumnStart(i);
writeColumn(*columns[i], *serializations[i]);
writeColumnEnd(i == columns.size() - 1);
}
}
JSONUtils::writeObjectEnd(*ostr, indent);
writeChar('\n', *ostr);
}
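A minimal sketch (Python, simplified) of why an empty chunk still needs the loop above: the JSONColumns family keys its output by column name, so a result with zero rows must still list every header column with an empty array to stay a valid, schema-complete object.

import json

def json_columns(header_names, rows):
    # One array per column; row values are appended column by column.
    columns = {name: [] for name in header_names}
    for row in rows:
        for name, value in zip(header_names, row):
            columns[name].append(value)
    return json.dumps(columns)

print(json_columns(["id", "name"], []))  # {"id": [], "name": []}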

View File

@ -27,6 +27,8 @@ protected:
Names names;
size_t indent;
Block header;
};
}

View File

@ -42,6 +42,7 @@ void JSONColumnsBlockOutputFormatBase::writeChunk(Chunk & chunk)
writeColumn(*columns[i], *serializations[i]);
writeColumnEnd(i == columns.size() - 1);
}
written_rows += chunk.getNumRows();
writeChunkEnd();
}

View File

@ -36,6 +36,8 @@ protected:
const Serializations serializations;
Chunk mono_chunk;
size_t written_rows = 0;
};
}

View File

@ -124,6 +124,8 @@ size_t IntersectOrExceptTransform::buildFilter(
void IntersectOrExceptTransform::accumulate(Chunk chunk)
{
convertToFullIfSparse(chunk);
auto num_rows = chunk.getNumRows();
auto columns = chunk.detachColumns();
@ -160,6 +162,8 @@ void IntersectOrExceptTransform::accumulate(Chunk chunk)
void IntersectOrExceptTransform::filter(Chunk & chunk)
{
convertToFullIfSparse(chunk);
auto num_rows = chunk.getNumRows();
auto columns = chunk.detachColumns();

View File

@ -700,12 +700,14 @@ struct StorageDistributedDirectoryMonitor::BatchHeader
struct StorageDistributedDirectoryMonitor::Batch
{
/// File indexes for this batch.
std::vector<UInt64> file_indices;
size_t total_rows = 0;
size_t total_bytes = 0;
bool recovered = false;
StorageDistributedDirectoryMonitor & parent;
/// Information about all available indexes (not only for the current batch).
const std::map<UInt64, String> & file_index_to_path;
bool split_batch_on_failure = true;
@ -795,17 +797,22 @@ struct StorageDistributedDirectoryMonitor::Batch
else
{
std::vector<std::string> files;
for (const auto && file_info : file_index_to_path | boost::adaptors::indexed())
for (auto file_index_info : file_indices | boost::adaptors::indexed())
{
if (file_info.index() > 8)
if (file_index_info.index() > 8)
{
files.push_back("...");
break;
}
files.push_back(file_info.value().second);
auto file_index = file_index_info.value();
auto file_path = file_index_to_path.find(file_index);
if (file_path != file_index_to_path.end())
files.push_back(file_path->second);
else
files.push_back(fmt::format("#{}.bin (deleted)", file_index));
}
e.addMessage(fmt::format("While sending batch, nums: {}, files: {}", file_index_to_path.size(), fmt::join(files, "\n")));
e.addMessage(fmt::format("While sending batch, size: {}, files: {}", file_indices.size(), fmt::join(files, "\n")));
throw;
}
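A minimal sketch (Python, hypothetical helper) of the message-building change above: iterate the batch's own file indices rather than every known file, cap the listing, and mark indices whose path is no longer known as deleted.

def describe_batch(file_indices, file_index_to_path, limit=8):
    files = []
    for i, file_index in enumerate(file_indices):
        if i > limit:
            files.append("...")
            break
        files.append(file_index_to_path.get(file_index, f"#{file_index}.bin (deleted)"))
    return f"While sending batch, size: {len(file_indices)}, files: " + "\n".join(files)

print(describe_batch([1, 7], {1: "/distributed/1.bin"}))
# While sending batch, size: 2, files: /distributed/1.bin
# #7.bin (deleted)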

View File

@ -706,8 +706,12 @@ Block KeyCondition::getBlockWithConstants(
if (syntax_analyzer_result)
{
const auto expr_for_constant_folding = ExpressionAnalyzer(query, syntax_analyzer_result, context).getConstActions();
expr_for_constant_folding->execute(result);
auto actions = ExpressionAnalyzer(query, syntax_analyzer_result, context).getConstActionsDAG();
for (const auto & action_node : actions->getOutputs())
{
if (action_node->column)
result.insert(ColumnWithTypeAndName{action_node->column, action_node->result_type, action_node->result_name});
}
}
return result;

View File

@ -379,11 +379,6 @@ namespace
bool columnExists(const String & name) const { return block.has(name); }
void insertStringColumn(const ColumnPtr & column, const String & name)
{
block.insert({column, std::make_shared<DataTypeString>(), name});
}
void insertUInt8Column(const ColumnPtr & column, const String & name)
{
block.insert({column, std::make_shared<DataTypeUInt8>(), name});
@ -399,6 +394,11 @@ namespace
block.insert({column, std::make_shared<DataTypeUUID>(), name});
}
void insertLowCardinalityColumn(const ColumnPtr & column, const String & name)
{
block.insert({column, std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>()), name});
}
void insertPartitionValueColumn(
size_t rows, const Row & partition_value, const DataTypePtr & partition_value_type, const String & name)
{
@ -483,11 +483,13 @@ static void injectPartConstVirtualColumns(
{
ColumnPtr column;
if (rows)
column = DataTypeString().createColumnConst(rows, part->name)->convertToFullColumnIfConst();
column = DataTypeLowCardinality{std::make_shared<DataTypeString>()}
.createColumnConst(rows, part->name)
->convertToFullColumnIfConst();
else
column = DataTypeString().createColumn();
column = DataTypeLowCardinality{std::make_shared<DataTypeString>()}.createColumn();
inserter.insertStringColumn(column, virtual_column_name);
inserter.insertLowCardinalityColumn(column, virtual_column_name);
}
else if (virtual_column_name == "_part_index")
{
@ -513,11 +515,13 @@ static void injectPartConstVirtualColumns(
{
ColumnPtr column;
if (rows)
column = DataTypeString().createColumnConst(rows, part->info.partition_id)->convertToFullColumnIfConst();
column = DataTypeLowCardinality{std::make_shared<DataTypeString>()}
.createColumnConst(rows, part->info.partition_id)
->convertToFullColumnIfConst();
else
column = DataTypeString().createColumn();
column = DataTypeLowCardinality{std::make_shared<DataTypeString>()}.createColumn();
inserter.insertStringColumn(column, virtual_column_name);
inserter.insertLowCardinalityColumn(column, virtual_column_name);
}
else if (virtual_column_name == "_partition_value")
{

View File

@ -840,8 +840,14 @@ Block MergeTreeData::getSampleBlockWithVirtualColumns() const
{
DataTypePtr partition_value_type = getPartitionValueType();
return {
ColumnWithTypeAndName(ColumnString::create(), std::make_shared<DataTypeString>(), "_part"),
ColumnWithTypeAndName(ColumnString::create(), std::make_shared<DataTypeString>(), "_partition_id"),
ColumnWithTypeAndName(
DataTypeLowCardinality{std::make_shared<DataTypeString>()}.createColumn(),
std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>()),
"_part"),
ColumnWithTypeAndName(
DataTypeLowCardinality{std::make_shared<DataTypeString>()}.createColumn(),
std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>()),
"_partition_id"),
ColumnWithTypeAndName(ColumnUUID::create(), std::make_shared<DataTypeUUID>(), "_part_uuid"),
ColumnWithTypeAndName(partition_value_type->createColumn(), partition_value_type, "_partition_value")};
}
@ -1889,7 +1895,9 @@ size_t MergeTreeData::clearOldTemporaryDirectories(size_t custom_directories_lif
{
if (temporary_parts.contains(basename))
{
LOG_WARNING(log, "{} is in use (by merge/mutation/INSERT) (consider increasing temporary_directories_lifetime setting)", full_path);
/// Actually we don't rely on temporary_directories_lifetime when removing old temporary directories,
/// it's just an extra level of protection in case we have a bug.
LOG_INFO(log, "{} is in use (by merge/mutation/INSERT) (consider increasing temporary_directories_lifetime setting)", full_path);
continue;
}
else
@ -7576,7 +7584,19 @@ MergeTreeData::WriteAheadLogPtr MergeTreeData::getWriteAheadLog()
if (!write_ahead_log)
{
auto reservation = reserveSpace(getSettings()->write_ahead_log_max_bytes);
write_ahead_log = std::make_shared<MergeTreeWriteAheadLog>(*this, reservation->getDisk());
for (const auto & disk: reservation->getDisks())
{
if (!disk->isRemote())
{
write_ahead_log = std::make_shared<MergeTreeWriteAheadLog>(*this, disk);
break;
}
}
if (!write_ahead_log)
throw Exception(
ErrorCodes::NOT_IMPLEMENTED,
"Can't store write ahead log in remote disk. It makes no sense.");
}
return write_ahead_log;
@ -7585,10 +7605,10 @@ MergeTreeData::WriteAheadLogPtr MergeTreeData::getWriteAheadLog()
NamesAndTypesList MergeTreeData::getVirtuals() const
{
return NamesAndTypesList{
NameAndTypePair("_part", std::make_shared<DataTypeString>()),
NameAndTypePair("_part", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())),
NameAndTypePair("_part_index", std::make_shared<DataTypeUInt64>()),
NameAndTypePair("_part_uuid", std::make_shared<DataTypeUUID>()),
NameAndTypePair("_partition_id", std::make_shared<DataTypeString>()),
NameAndTypePair("_partition_id", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())),
NameAndTypePair("_partition_value", getPartitionValueType()),
NameAndTypePair("_sample_factor", std::make_shared<DataTypeFloat64>()),
NameAndTypePair("_part_offset", std::make_shared<DataTypeUInt64>()),

View File

@ -88,7 +88,8 @@ static void assertIndexColumnsType(const Block & header)
WhichDataType which(actual_type);
if (!which.isUInt() && !which.isInt() && !which.isString() && !which.isFixedString() && !which.isFloat() &&
!which.isDate() && !which.isDateTime() && !which.isDateTime64() && !which.isEnum() && !which.isUUID())
!which.isDate() && !which.isDateTime() && !which.isDateTime64() && !which.isEnum() && !which.isUUID() &&
!which.isIPv4() && !which.isIPv6())
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Unexpected type {} of bloom filter index.", type->getName());
}
}

View File

@ -147,9 +147,9 @@ static StoragePtr create(const StorageFactory::Arguments & args)
* - Additional MergeTreeSettings in the SETTINGS clause;
*/
bool is_extended_storage_def = args.storage_def->partition_by || args.storage_def->primary_key || args.storage_def->order_by
|| args.storage_def->sample_by || (args.query.columns_list->indices && !args.query.columns_list->indices->children.empty())
|| (args.query.columns_list->projections && !args.query.columns_list->projections->children.empty()) || args.storage_def->settings;
bool is_extended_storage_def = args.storage_def->isExtendedStorageDefinition()
|| (args.query.columns_list->indices && !args.query.columns_list->indices->children.empty())
|| (args.query.columns_list->projections && !args.query.columns_list->projections->children.empty());
String name_part = args.engine_name.substr(0, args.engine_name.size() - strlen("MergeTree"));

View File

@ -268,11 +268,11 @@ NamesAndTypesList StorageDistributed::getVirtuals() const
/// NOTE This is weird. Most of these virtual columns are part of MergeTree
/// tables info. But Distributed is a general-purpose engine.
return NamesAndTypesList{
NameAndTypePair("_table", std::make_shared<DataTypeString>()),
NameAndTypePair("_part", std::make_shared<DataTypeString>()),
NameAndTypePair("_table", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())),
NameAndTypePair("_part", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())),
NameAndTypePair("_part_index", std::make_shared<DataTypeUInt64>()),
NameAndTypePair("_part_uuid", std::make_shared<DataTypeUUID>()),
NameAndTypePair("_partition_id", std::make_shared<DataTypeString>()),
NameAndTypePair("_partition_id", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())),
NameAndTypePair("_sample_factor", std::make_shared<DataTypeFloat64>()),
NameAndTypePair("_part_offset", std::make_shared<DataTypeUInt64>()),
NameAndTypePair("_row_exists", std::make_shared<DataTypeUInt8>()),

View File

@ -664,7 +664,7 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources(
{
ColumnWithTypeAndName column;
column.name = "_database";
column.type = std::make_shared<DataTypeString>();
column.type = std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>());
column.column = column.type->createColumnConst(0, Field(database_name));
auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column));
@ -682,7 +682,7 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources(
{
ColumnWithTypeAndName column;
column.name = "_table";
column.type = std::make_shared<DataTypeString>();
column.type = std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>());
column.column = column.type->createColumnConst(0, Field(table_name));
auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column));
@ -980,7 +980,9 @@ void registerStorageMerge(StorageFactory & factory)
NamesAndTypesList StorageMerge::getVirtuals() const
{
NamesAndTypesList virtuals{{"_database", std::make_shared<DataTypeString>()}, {"_table", std::make_shared<DataTypeString>()}};
NamesAndTypesList virtuals{
{"_database", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
{"_table", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())}};
auto first_table = getFirstTable([](auto && table) { return table; });
if (first_table)

View File

@ -1601,37 +1601,39 @@ void StorageMergeTree::renameAndCommitEmptyParts(MutableDataPartsVector & new_pa
void StorageMergeTree::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr query_context, TableExclusiveLockHolder &)
{
/// Asks to complete merges and does not allow them to start.
/// This protects against "revival" of data for a removed partition after completion of merge.
auto merge_blocker = stopMergesAndWait();
waitForOutdatedPartsToBeLoaded();
Stopwatch watch;
auto txn = query_context->getCurrentTransaction();
MergeTreeData::Transaction transaction(*this, txn.get());
{
auto operation_data_parts_lock = lockOperationsWithParts();
/// Asks to complete merges and does not allow them to start.
/// This protects against "revival" of data for a removed partition after completion of merge.
auto merge_blocker = stopMergesAndWait();
waitForOutdatedPartsToBeLoaded();
auto parts = getVisibleDataPartsVector(query_context);
Stopwatch watch;
auto future_parts = initCoverageWithNewEmptyParts(parts);
auto txn = query_context->getCurrentTransaction();
MergeTreeData::Transaction transaction(*this, txn.get());
{
auto operation_data_parts_lock = lockOperationsWithParts();
LOG_TEST(log, "Made {} empty parts in order to cover {} parts. Empty parts: {}, covered parts: {}. With txn {}",
future_parts.size(), parts.size(),
fmt::join(getPartsNames(future_parts), ", "), fmt::join(getPartsNames(parts), ", "),
transaction.getTID());
auto parts = getVisibleDataPartsVector(query_context);
captureTmpDirectoryHolders(*this, future_parts);
auto future_parts = initCoverageWithNewEmptyParts(parts);
auto new_data_parts = createEmptyDataParts(*this, future_parts, txn);
renameAndCommitEmptyParts(new_data_parts, transaction);
LOG_TEST(log, "Made {} empty parts in order to cover {} parts. Empty parts: {}, covered parts: {}. With txn {}",
future_parts.size(), parts.size(),
fmt::join(getPartsNames(future_parts), ", "), fmt::join(getPartsNames(parts), ", "),
transaction.getTID());
PartLog::addNewParts(query_context, new_data_parts, watch.elapsed());
captureTmpDirectoryHolders(*this, future_parts);
LOG_INFO(log, "Truncated table with {} parts by replacing them with new empty {} parts. With txn {}",
parts.size(), future_parts.size(),
transaction.getTID());
auto new_data_parts = createEmptyDataParts(*this, future_parts, txn);
renameAndCommitEmptyParts(new_data_parts, transaction);
PartLog::addNewParts(query_context, new_data_parts, watch.elapsed());
LOG_INFO(log, "Truncated table with {} parts by replacing them with new empty {} parts. With txn {}",
parts.size(), future_parts.size(),
transaction.getTID());
}
}
/// Old parts are needed to be destroyed before clearing them from filesystem.
@ -1642,48 +1644,50 @@ void StorageMergeTree::truncate(const ASTPtr &, const StorageMetadataPtr &, Cont
void StorageMergeTree::dropPart(const String & part_name, bool detach, ContextPtr query_context)
{
/// Asks to complete merges and does not allow them to start.
/// This protects against "revival" of data for a removed partition after completion of merge.
auto merge_blocker = stopMergesAndWait();
Stopwatch watch;
/// It's important to create it outside of lock scope because
/// otherwise it can lock parts in destructor and deadlock is possible.
auto txn = query_context->getCurrentTransaction();
MergeTreeData::Transaction transaction(*this, txn.get());
{
auto operation_data_parts_lock = lockOperationsWithParts();
/// Asks to complete merges and does not allow them to start.
/// This protects against "revival" of data for a removed partition after completion of merge.
auto merge_blocker = stopMergesAndWait();
auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active});
if (!part)
throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "Part {} not found, won't try to drop it.", part_name);
Stopwatch watch;
if (detach)
/// It's important to create it outside of lock scope because
/// otherwise it can lock parts in destructor and deadlock is possible.
auto txn = query_context->getCurrentTransaction();
MergeTreeData::Transaction transaction(*this, txn.get());
{
auto metadata_snapshot = getInMemoryMetadataPtr();
LOG_INFO(log, "Detaching {}", part->getDataPartStorage().getPartDirectory());
part->makeCloneInDetached("", metadata_snapshot);
}
auto operation_data_parts_lock = lockOperationsWithParts();
{
auto future_parts = initCoverageWithNewEmptyParts({part});
auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active});
if (!part)
throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "Part {} not found, won't try to drop it.", part_name);
LOG_TEST(log, "Made {} empty parts in order to cover {} part. With txn {}",
fmt::join(getPartsNames(future_parts), ", "), fmt::join(getPartsNames({part}), ", "),
transaction.getTID());
if (detach)
{
auto metadata_snapshot = getInMemoryMetadataPtr();
LOG_INFO(log, "Detaching {}", part->getDataPartStorage().getPartDirectory());
part->makeCloneInDetached("", metadata_snapshot);
}
captureTmpDirectoryHolders(*this, future_parts);
{
auto future_parts = initCoverageWithNewEmptyParts({part});
auto new_data_parts = createEmptyDataParts(*this, future_parts, txn);
renameAndCommitEmptyParts(new_data_parts, transaction);
LOG_TEST(log, "Made {} empty parts in order to cover {} part. With txn {}",
fmt::join(getPartsNames(future_parts), ", "), fmt::join(getPartsNames({part}), ", "),
transaction.getTID());
PartLog::addNewParts(query_context, new_data_parts, watch.elapsed());
captureTmpDirectoryHolders(*this, future_parts);
const auto * op = detach ? "Detached" : "Dropped";
LOG_INFO(log, "{} {} part by replacing it with new empty {} part. With txn {}",
op, part->name, future_parts[0].part_name,
transaction.getTID());
auto new_data_parts = createEmptyDataParts(*this, future_parts, txn);
renameAndCommitEmptyParts(new_data_parts, transaction);
PartLog::addNewParts(query_context, new_data_parts, watch.elapsed());
const auto * op = detach ? "Detached" : "Dropped";
LOG_INFO(log, "{} {} part by replacing it with new empty {} part. With txn {}",
op, part->name, future_parts[0].part_name,
transaction.getTID());
}
}
}
@ -1695,58 +1699,60 @@ void StorageMergeTree::dropPart(const String & part_name, bool detach, ContextPt
void StorageMergeTree::dropPartition(const ASTPtr & partition, bool detach, ContextPtr query_context)
{
const auto * partition_ast = partition->as<ASTPartition>();
/// Asks to complete merges and does not allow them to start.
/// This protects against "revival" of data for a removed partition after completion of merge.
auto merge_blocker = stopMergesAndWait();
Stopwatch watch;
/// It's important to create it outside of lock scope because
/// otherwise it can lock parts in destructor and deadlock is possible.
auto txn = query_context->getCurrentTransaction();
MergeTreeData::Transaction transaction(*this, txn.get());
{
auto operation_data_parts_lock = lockOperationsWithParts();
const auto * partition_ast = partition->as<ASTPartition>();
DataPartsVector parts;
/// Asks to complete merges and does not allow them to start.
/// This protects against "revival" of data for a removed partition after completion of merge.
auto merge_blocker = stopMergesAndWait();
Stopwatch watch;
/// It's important to create it outside of lock scope because
/// otherwise it can lock parts in destructor and deadlock is possible.
auto txn = query_context->getCurrentTransaction();
MergeTreeData::Transaction transaction(*this, txn.get());
{
if (partition_ast && partition_ast->all)
parts = getVisibleDataPartsVector(query_context);
else
auto operation_data_parts_lock = lockOperationsWithParts();
DataPartsVector parts;
{
String partition_id = getPartitionIDFromQuery(partition, query_context);
parts = getVisibleDataPartsVectorInPartition(query_context, partition_id);
if (partition_ast && partition_ast->all)
parts = getVisibleDataPartsVector(query_context);
else
{
String partition_id = getPartitionIDFromQuery(partition, query_context);
parts = getVisibleDataPartsVectorInPartition(query_context, partition_id);
}
}
if (detach)
for (const auto & part : parts)
{
auto metadata_snapshot = getInMemoryMetadataPtr();
LOG_INFO(log, "Detaching {}", part->getDataPartStorage().getPartDirectory());
part->makeCloneInDetached("", metadata_snapshot);
}
auto future_parts = initCoverageWithNewEmptyParts(parts);
LOG_TEST(log, "Made {} empty parts in order to cover {} parts. Empty parts: {}, covered parts: {}. With txn {}",
future_parts.size(), parts.size(),
fmt::join(getPartsNames(future_parts), ", "), fmt::join(getPartsNames(parts), ", "),
transaction.getTID());
captureTmpDirectoryHolders(*this, future_parts);
auto new_data_parts = createEmptyDataParts(*this, future_parts, txn);
renameAndCommitEmptyParts(new_data_parts, transaction);
PartLog::addNewParts(query_context, new_data_parts, watch.elapsed());
const auto * op = detach ? "Detached" : "Dropped";
LOG_INFO(log, "{} partition with {} parts by replacing them with new empty {} parts. With txn {}",
op, parts.size(), future_parts.size(),
transaction.getTID());
}
if (detach)
for (const auto & part : parts)
{
auto metadata_snapshot = getInMemoryMetadataPtr();
LOG_INFO(log, "Detaching {}", part->getDataPartStorage().getPartDirectory());
part->makeCloneInDetached("", metadata_snapshot);
}
auto future_parts = initCoverageWithNewEmptyParts(parts);
LOG_TEST(log, "Made {} empty parts in order to cover {} parts. Empty parts: {}, covered parts: {}. With txn {}",
future_parts.size(), parts.size(),
fmt::join(getPartsNames(future_parts), ", "), fmt::join(getPartsNames(parts), ", "),
transaction.getTID());
captureTmpDirectoryHolders(*this, future_parts);
auto new_data_parts = createEmptyDataParts(*this, future_parts, txn);
renameAndCommitEmptyParts(new_data_parts, transaction);
PartLog::addNewParts(query_context, new_data_parts, watch.elapsed());
const auto * op = detach ? "Detached" : "Dropped";
LOG_INFO(log, "{} partition with {} parts by replacing them with new empty {} parts. With txn {}",
op, parts.size(), future_parts.size(),
transaction.getTID());
}
/// Old parts are needed to be destroyed before clearing them from filesystem.

View File

@ -107,6 +107,7 @@ namespace ErrorCodes
extern const int CANNOT_EXTRACT_TABLE_STRUCTURE;
extern const int NOT_IMPLEMENTED;
extern const int CANNOT_COMPILE_REGEXP;
extern const int FILE_DOESNT_EXIST;
}
class IOutputFormat;
@ -260,6 +261,9 @@ private:
outcome_future = listObjectsAsync();
}
if (request_settings.throw_on_zero_files_match && result_batch.empty())
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Can not match any files using prefix {}", request.GetPrefix());
KeysWithInfo temp_buffer;
temp_buffer.reserve(result_batch.size());

View File

@ -167,6 +167,7 @@ S3Settings::RequestSettings::RequestSettings(const NamedCollection & collection)
max_connections = collection.getOrDefault<UInt64>("max_connections", max_connections);
list_object_keys_size = collection.getOrDefault<UInt64>("list_object_keys_size", list_object_keys_size);
allow_head_object_request = collection.getOrDefault<bool>("allow_head_object_request", allow_head_object_request);
throw_on_zero_files_match = collection.getOrDefault<bool>("throw_on_zero_files_match", throw_on_zero_files_match);
}
S3Settings::RequestSettings::RequestSettings(
@ -182,6 +183,7 @@ S3Settings::RequestSettings::RequestSettings(
check_objects_after_upload = config.getBool(key + "check_objects_after_upload", settings.s3_check_objects_after_upload);
list_object_keys_size = config.getUInt64(key + "list_object_keys_size", settings.s3_list_object_keys_size);
allow_head_object_request = config.getBool(key + "allow_head_object_request", allow_head_object_request);
throw_on_zero_files_match = config.getBool(key + "throw_on_zero_files_match", settings.s3_throw_on_zero_files_match);
/// NOTE: it would be better to reuse old throttlers to avoid losing token bucket state on every config reload,
/// which could lead to exceeding limit for short time. But it is good enough unless very high `burst` values are used.
@ -231,6 +233,9 @@ void S3Settings::RequestSettings::updateFromSettingsImpl(const Settings & settin
if ((!if_changed || settings.s3_max_put_rps.changed || settings.s3_max_put_burst.changed) && settings.s3_max_put_rps)
put_request_throttler = std::make_shared<Throttler>(
settings.s3_max_put_rps, settings.s3_max_put_burst ? settings.s3_max_put_burst : Throttler::default_burst_seconds * settings.s3_max_put_rps);
if (!if_changed || settings.s3_throw_on_zero_files_match)
throw_on_zero_files_match = settings.s3_throw_on_zero_files_match;
}
void S3Settings::RequestSettings::updateFromSettings(const Settings & settings)

View File

@ -77,6 +77,8 @@ struct S3Settings
/// See https://github.com/aws/aws-sdk-cpp/issues/1558 and also the function S3ErrorMarshaller::ExtractRegion() for more information.
bool allow_head_object_request = true;
bool throw_on_zero_files_match = false;
const PartUploadSettings & getUploadSettings() const { return upload_settings; }
RequestSettings() = default;

View File

@ -51,6 +51,7 @@ const char * auto_config_build[]
"USE_ROCKSDB", "@USE_ROCKSDB@",
"USE_NURAFT", "@USE_NURAFT@",
"USE_NLP", "@USE_NLP@",
"USE_LIBURING", "@USE_LIBURING@",
"USE_SQLITE", "@USE_SQLITE@",
"USE_LIBPQXX", "@USE_LIBPQXX@",
"USE_AZURE_BLOB_STORAGE", "@USE_AZURE_BLOB_STORAGE@",

View File

@ -120,7 +120,7 @@ void StorageSystemProcesses::fillData(MutableColumns & res_columns, ContextPtr c
res_columns[i++]->insert(process.client_info.quota_key);
res_columns[i++]->insert(process.client_info.distributed_depth);
res_columns[i++]->insert(static_cast<double>(process.elapsed_microseconds) / 100000.0);
res_columns[i++]->insert(static_cast<double>(process.elapsed_microseconds) / 1'000'000.0);
res_columns[i++]->insert(process.is_cancelled);
res_columns[i++]->insert(process.is_all_data_sent);
res_columns[i++]->insert(process.read_rows);
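A quick arithmetic check of the fix above: elapsed_microseconds converts to seconds by dividing by 1'000'000; the old divisor of 100'000 overstated the elapsed time tenfold.

elapsed_microseconds = 2_500_000
print(elapsed_microseconds / 100_000)    # 25.0  -> wrong, 10x too large
print(elapsed_microseconds / 1_000_000)  # 2.5   -> seconds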

View File

@ -129,6 +129,9 @@ if (TARGET ch_contrib::parquet)
set(USE_ARROW 1)
set(USE_ORC 1)
endif()
if (TARGET ch_contrib::liburing)
set(USE_LIBURING 1)
endif ()
if (TARGET ch_contrib::protobuf)
set(USE_PROTOBUF 1)
endif()

View File

@ -5,7 +5,8 @@ import logging
import os
import sys
import time
from typing import Any, List, Optional
from pathlib import Path
from typing import Any, Callable, List, Optional
import requests # type: ignore
@ -56,21 +57,29 @@ def read_build_urls(build_name: str, reports_path: str) -> List[str]:
return []
def download_build_with_progress(url, path):
def download_build_with_progress(url: str, path: Path) -> None:
logging.info("Downloading from %s to temp path %s", url, path)
for i in range(DOWNLOAD_RETRIES_COUNT):
try:
response = get_with_retries(url, retries=1, stream=True)
total_length = int(response.headers.get("content-length", 0))
if path.is_file() and total_length and path.stat().st_size == total_length:
logging.info(
"The file %s already exists and have a proper size %s",
path,
total_length,
)
return
with open(path, "wb") as f:
response = get_with_retries(url, retries=1, stream=True)
total_length = response.headers.get("content-length")
if total_length is None or int(total_length) == 0:
if total_length == 0:
logging.info(
"No content-length, will download file without progress"
)
f.write(response.content)
else:
dl = 0
total_length = int(total_length)
logging.info("Content length is %ld bytes", total_length)
for data in response.iter_content(chunk_size=4096):
dl += len(data)
@ -99,12 +108,14 @@ def download_build_with_progress(url, path):
logging.info("Downloading finished")
def download_builds(result_path, build_urls, filter_fn):
def download_builds(
result_path: str, build_urls: List[str], filter_fn: Callable[[str], bool]
) -> None:
for url in build_urls:
if filter_fn(url):
fname = os.path.basename(url.replace("%2B", "+").replace("%20", " "))
logging.info("Will download %s to %s", fname, result_path)
download_build_with_progress(url, os.path.join(result_path, fname))
download_build_with_progress(url, Path(result_path) / fname)
def download_builds_filter(
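A minimal sketch (Python; simplified, since the real code performs the check during the streamed GET while a HEAD request stands in here) of the resume check added above: skip the download when the file already exists and its size matches the server-reported Content-Length.

from pathlib import Path
import requests

def already_downloaded(url: str, path: Path) -> bool:
    response = requests.head(url, allow_redirects=True, timeout=30)
    total_length = int(response.headers.get("content-length", 0))
    return path.is_file() and total_length > 0 and path.stat().st_size == total_length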

View File

@ -182,6 +182,12 @@ CI_CONFIG = {
"tests_config": {
# required_build - build name for artifacts
# force_tests - force success status for tests
"Install packages (amd64)": {
"required_build": "package_release",
},
"Install packages (arm64)": {
"required_build": "package_aarch64",
},
"Stateful tests (asan)": {
"required_build": "package_asan",
},

View File

@ -6,6 +6,7 @@ This file is needed to avoid cicle import build_download_helper.py <=> env_helpe
import argparse
import logging
import os
from pathlib import Path
from build_download_helper import download_build_with_progress
from ci_config import CI_CONFIG, BuildConfig
@ -57,14 +58,15 @@ def parse_args() -> argparse.Namespace:
def main():
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
args = parse_args()
os.makedirs(TEMP_PATH, exist_ok=True)
temp_path = Path(TEMP_PATH)
temp_path.mkdir(parents=True, exist_ok=True)
for build in args.build_names:
# check if it's in CI_CONFIG
config = CI_CONFIG["build_config"][build] # type: BuildConfig
if args.rename:
path = os.path.join(TEMP_PATH, f"clickhouse-{config['static_binary_name']}")
path = temp_path / f"clickhouse-{config['static_binary_name']}"
else:
path = os.path.join(TEMP_PATH, "clickhouse")
path = temp_path / "clickhouse"
url = S3_ARTIFACT_DOWNLOAD_TEMPLATE.format(
pr_or_release=f"{args.version.major}.{args.version.minor}",

tests/ci/install_check.py (new file, 315 lines)
View File

@ -0,0 +1,315 @@
#!/usr/bin/env python3
import argparse
import atexit
import logging
import sys
import subprocess
from pathlib import Path
from typing import Dict
from github import Github
from build_download_helper import download_builds_filter
from clickhouse_helper import (
ClickHouseHelper,
mark_flaky_tests,
prepare_tests_results_for_clickhouse,
)
from commit_status_helper import post_commit_status, update_mergeable_check
from docker_pull_helper import get_image_with_version, DockerImage
from env_helper import CI, TEMP_PATH as TEMP, REPORTS_PATH
from get_robot_token import get_best_robot_token
from pr_info import PRInfo
from report import TestResults, TestResult
from rerun_helper import RerunHelper
from s3_helper import S3Helper
from stopwatch import Stopwatch
from tee_popen import TeePopen
from upload_result_helper import upload_results
RPM_IMAGE = "clickhouse/install-rpm-test"
DEB_IMAGE = "clickhouse/install-deb-test"
TEMP_PATH = Path(TEMP)
SUCCESS = "success"
FAILURE = "failure"
def prepare_test_scripts():
server_test = r"""#!/bin/bash
systemctl start clickhouse-server
clickhouse-client -q 'SELECT version()'"""
keeper_test = r"""#!/bin/bash
systemctl start clickhouse-keeper
for i in {1..20}; do
echo wait for clickhouse-keeper to be up
> /dev/tcp/127.0.0.1/9181 2>/dev/null && break || sleep 1
done
for i in {1..5}; do
echo wait for clickhouse-keeper to answer on mntr request
exec 13<>/dev/tcp/127.0.0.1/9181
echo mntr >&13
cat <&13 | grep zk_version && break || sleep 1
exec 13>&-
done
exec 13>&-"""
binary_test = r"""#!/bin/bash
chmod +x /packages/clickhouse
/packages/clickhouse install
clickhouse-server start --daemon
for i in {1..5}; do
clickhouse-client -q 'SELECT version()' && break || sleep 1
done
clickhouse-keeper start --daemon
for i in {1..20}; do
echo wait for clickhouse-keeper to be up
> /dev/tcp/127.0.0.1/9181 2>/dev/null && break || sleep 1
done
for i in {1..5}; do
echo wait for clickhouse-keeper to answer on mntr request
exec 13<>/dev/tcp/127.0.0.1/9181
echo mntr >&13
cat <&13 | grep zk_version && break || sleep 1
exec 13>&-
done
exec 13>&-"""
(TEMP_PATH / "server_test.sh").write_text(server_test, encoding="utf-8")
(TEMP_PATH / "keeper_test.sh").write_text(keeper_test, encoding="utf-8")
(TEMP_PATH / "binary_test.sh").write_text(binary_test, encoding="utf-8")
def test_install_deb(image: DockerImage) -> TestResults:
tests = {
"Install server deb": r"""#!/bin/bash -ex
apt-get install /packages/clickhouse-{server,client,common}*deb
bash -ex /packages/server_test.sh""",
"Install keeper deb": r"""#!/bin/bash -ex
apt-get install /packages/clickhouse-keeper*deb
bash -ex /packages/keeper_test.sh""",
"Install clickhouse binary in deb": r"bash -ex /packages/binary_test.sh",
}
return test_install(image, tests)
def test_install_rpm(image: DockerImage) -> TestResults:
# FIXME: I couldn't find why Type=notify is broken in centos:8
# systemd just ignores the watchdog completely
tests = {
"Install server rpm": r"""#!/bin/bash -ex
yum localinstall --disablerepo=* -y /packages/clickhouse-{server,client,common}*rpm
echo CLICKHOUSE_WATCHDOG_ENABLE=0 > /etc/default/clickhouse-server
bash -ex /packages/server_test.sh""",
"Install keeper rpm": r"""#!/bin/bash -ex
yum localinstall --disablerepo=* -y /packages/clickhouse-keeper*rpm
bash -ex /packages/keeper_test.sh""",
"Install clickhouse binary in rpm": r"bash -ex /packages/binary_test.sh",
}
return test_install(image, tests)
def test_install_tgz(image: DockerImage) -> TestResults:
# FIXME: I couldn't find why Type=notify is broken in centos:8
# systemd just ignores the watchdog completely
tests = {
f"Install server tgz in {image.name}": r"""#!/bin/bash -ex
[ -f /etc/debian_version ] && CONFIGURE=configure || CONFIGURE=
for pkg in /packages/clickhouse-{common,client,server}*tgz; do
package=${pkg%-*}
package=${package##*/}
tar xf "$pkg"
"/$package/install/doinst.sh" $CONFIGURE
done
[ -f /etc/yum.conf ] && echo CLICKHOUSE_WATCHDOG_ENABLE=0 > /etc/default/clickhouse-server
bash -ex /packages/server_test.sh""",
f"Install keeper tgz in {image.name}": r"""#!/bin/bash -ex
[ -f /etc/debian_version ] && CONFIGURE=configure || CONFIGURE=
for pkg in /packages/clickhouse-keeper*tgz; do
package=${pkg%-*}
package=${package##*/}
tar xf "$pkg"
"/$package/install/doinst.sh" $CONFIGURE
done
bash -ex /packages/keeper_test.sh""",
}
return test_install(image, tests)
def test_install(image: DockerImage, tests: Dict[str, str]) -> TestResults:
test_results = [] # type: TestResults
for name, command in tests.items():
stopwatch = Stopwatch()
container_name = name.lower().replace(" ", "_").replace("/", "_")
log_file = TEMP_PATH / f"{container_name}.log"
run_command = (
f"docker run --rm --privileged --detach --cap-add=SYS_PTRACE "
f"--volume={TEMP_PATH}:/packages {image}"
)
logging.info("Running docker container: `%s`", run_command)
container_id = subprocess.check_output(
run_command, shell=True, encoding="utf-8"
).strip()
(TEMP_PATH / "install.sh").write_text(command)
install_command = f"docker exec {container_id} bash -ex /packages/install.sh"
with TeePopen(install_command, log_file) as process:
retcode = process.wait()
if retcode == 0:
status = SUCCESS
else:
status = FAILURE
subprocess.check_call(f"docker kill -s 9 {container_id}", shell=True)
test_results.append(
TestResult(name, status, stopwatch.duration_seconds, [log_file])
)
return test_results
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="The script to check if the packages are able to install",
)
parser.add_argument(
"check_name",
help="check name, used to download the packages",
)
parser.add_argument("--download", default=True, help=argparse.SUPPRESS)
parser.add_argument(
"--no-download",
dest="download",
action="store_false",
default=argparse.SUPPRESS,
help="if set, the packages won't be downloaded, useful for debug",
)
parser.add_argument("--deb", default=True, help=argparse.SUPPRESS)
parser.add_argument(
"--no-deb",
dest="deb",
action="store_false",
default=argparse.SUPPRESS,
help="if set, the deb packages won't be checked",
)
parser.add_argument("--rpm", default=True, help=argparse.SUPPRESS)
parser.add_argument(
"--no-rpm",
dest="rpm",
action="store_false",
default=argparse.SUPPRESS,
help="if set, the rpm packages won't be checked",
)
parser.add_argument("--tgz", default=True, help=argparse.SUPPRESS)
parser.add_argument(
"--no-tgz",
dest="tgz",
action="store_false",
default=argparse.SUPPRESS,
help="if set, the tgz packages won't be checked",
)
return parser.parse_args()
def main():
logging.basicConfig(level=logging.INFO)
stopwatch = Stopwatch()
args = parse_args()
TEMP_PATH.mkdir(parents=True, exist_ok=True)
pr_info = PRInfo()
if CI:
gh = Github(get_best_robot_token(), per_page=100)
atexit.register(update_mergeable_check, gh, pr_info, args.check_name)
rerun_helper = RerunHelper(gh, pr_info, args.check_name)
if rerun_helper.is_already_finished_by_status():
logging.info(
"Check is already finished according to github status, exiting"
)
sys.exit(0)
docker_images = {
name: get_image_with_version(REPORTS_PATH, name)
for name in (RPM_IMAGE, DEB_IMAGE)
}
prepare_test_scripts()
if args.download:
def filter_artifacts(path: str) -> bool:
return (
path.endswith(".deb")
or path.endswith(".rpm")
or path.endswith(".tgz")
or path.endswith("/clickhouse")
)
download_builds_filter(
args.check_name, REPORTS_PATH, TEMP_PATH, filter_artifacts
)
test_results = [] # type: TestResults
if args.deb:
test_results.extend(test_install_deb(docker_images[DEB_IMAGE]))
if args.rpm:
test_results.extend(test_install_rpm(docker_images[RPM_IMAGE]))
if args.tgz:
test_results.extend(test_install_tgz(docker_images[DEB_IMAGE]))
test_results.extend(test_install_tgz(docker_images[RPM_IMAGE]))
state = SUCCESS
description = "Packages installed successfully"
if FAILURE in (result.status for result in test_results):
state = FAILURE
description = "Failed to install packages: " + ", ".join(
result.name for result in test_results
)
s3_helper = S3Helper()
report_url = upload_results(
s3_helper,
pr_info.number,
pr_info.sha,
test_results,
[],
args.check_name,
)
print(f"::notice ::Report url: {report_url}")
if not CI:
return
ch_helper = ClickHouseHelper()
mark_flaky_tests(ch_helper, args.check_name, test_results)
if len(description) >= 140:
description = description[:136] + "..."
post_commit_status(gh, pr_info.sha, args.check_name, description, state, report_url)
prepared_events = prepare_tests_results_for_clickhouse(
pr_info,
test_results,
state,
stopwatch.duration_seconds,
stopwatch.start_time_str,
report_url,
args.check_name,
)
ch_helper.insert_events_into(db="default", table="checks", events=prepared_events)
if state == FAILURE:
sys.exit(1)
if __name__ == "__main__":
main()
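A minimal sketch (Python) of the paired-flag pattern used in parse_args above: each --no-X flag stores False into the destination whose default is True, so omitting both flags leaves the corresponding package check enabled. Locally, the flags suggest runs like python3 install_check.py 'Install packages (amd64)' --no-download to reuse packages already placed in TEMP_PATH.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--deb", default=True, help=argparse.SUPPRESS)
parser.add_argument(
    "--no-deb",
    dest="deb",
    action="store_false",
    default=argparse.SUPPRESS,
    help="if set, the deb packages won't be checked",
)

print(parser.parse_args([]).deb)            # True
print(parser.parse_args(["--no-deb"]).deb)  # False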

View File

@ -108,13 +108,10 @@ def main():
stopwatch = Stopwatch()
temp_path = TEMP_PATH
reports_path = REPORTS_PATH
check_name = sys.argv[1]
if not os.path.exists(temp_path):
os.makedirs(temp_path)
if not os.path.exists(TEMP_PATH):
os.makedirs(TEMP_PATH)
pr_info = PRInfo()
@ -127,14 +124,14 @@ def main():
logging.info("Check is already finished according to github status, exiting")
sys.exit(0)
docker_image = get_image_with_version(reports_path, IMAGE_NAME)
docker_image = get_image_with_version(REPORTS_PATH, IMAGE_NAME)
download_unit_tests(check_name, reports_path, temp_path)
download_unit_tests(check_name, REPORTS_PATH, TEMP_PATH)
tests_binary_path = os.path.join(temp_path, "unit_tests_dbms")
tests_binary_path = os.path.join(TEMP_PATH, "unit_tests_dbms")
os.chmod(tests_binary_path, 0o777)
test_output = os.path.join(temp_path, "test_output")
test_output = os.path.join(TEMP_PATH, "test_output")
if not os.path.exists(test_output):
os.makedirs(test_output)
@ -151,7 +148,7 @@ def main():
else:
logging.info("Run failed")
subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True)
subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {TEMP_PATH}", shell=True)
s3_helper = S3Helper()
state, description, test_results, additional_logs = process_results(test_output)

View File

@ -449,21 +449,31 @@ class FailureReason(enum.Enum):
INTERNAL_ERROR = "Test internal error: "
def threshold_generator(always_on_prob, always_off_prob, min_val, max_val):
def gen():
tmp = random.random()
if tmp <= always_on_prob:
return min_val
if tmp <= always_on_prob + always_off_prob:
return max_val
if isinstance(min_val, int) and isinstance(max_val, int):
return random.randint(min_val, max_val)
else:
return random.uniform(min_val, max_val)
return gen
class SettingsRandomizer:
settings = {
"max_insert_threads": lambda: 0
if random.random() < 0.5
else random.randint(1, 16),
"group_by_two_level_threshold": lambda: 1
if random.random() < 0.1
else 2**60
if random.random() < 0.11
else 100000,
"group_by_two_level_threshold_bytes": lambda: 1
if random.random() < 0.1
else 2**60
if random.random() < 0.11
else 50000000,
"group_by_two_level_threshold": threshold_generator(0.2, 0.2, 1, 1000000),
"group_by_two_level_threshold_bytes": threshold_generator(
0.2, 0.2, 1, 50000000
),
"distributed_aggregation_memory_efficient": lambda: random.randint(0, 1),
"fsync_metadata": lambda: random.randint(0, 1),
"output_format_parallel_formatting": lambda: random.randint(0, 1),
@ -480,17 +490,15 @@ class SettingsRandomizer:
"read_in_order_two_level_merge_threshold": lambda: random.randint(0, 100),
"optimize_aggregation_in_order": lambda: random.randint(0, 1),
"aggregation_in_order_max_block_bytes": lambda: random.randint(0, 50000000),
"min_compress_block_size": lambda: random.randint(1, 1048576 * 3),
"max_compress_block_size": lambda: random.randint(1, 1048576 * 3),
"use_uncompressed_cache": lambda: random.randint(0, 1),
"min_bytes_to_use_direct_io": lambda: 0
if random.random() < 0.5
else 1
if random.random() < 0.2
else random.randint(1, 1024 * 1024 * 1024),
"min_bytes_to_use_mmap_io": lambda: 0
if random.random() < 0.5
else 1
if random.random() < 0.2
else random.randint(1, 1024 * 1024 * 1024),
"min_bytes_to_use_direct_io": threshold_generator(
0.2, 0.5, 1, 10 * 1024 * 1024 * 1024
),
"min_bytes_to_use_mmap_io": threshold_generator(
0.2, 0.5, 1, 10 * 1024 * 1024 * 1024
),
"local_filesystem_read_method": lambda: random.choice(
["read", "pread", "mmap", "pread_threadpool", "io_uring"]
),
@ -514,6 +522,39 @@ class SettingsRandomizer:
return random_settings
class MergeTreeSettingsRandomizer:
settings = {
# Temporary disable due to large number of failures. TODO: fix.
# "ratio_of_defaults_for_sparse_serialization": threshold_generator(
# 0.1, 0.6, 0.0, 1.0
# ),
"prefer_fetch_merged_part_size_threshold": threshold_generator(
0.2, 0.5, 1, 10 * 1024 * 1024 * 1024
),
"vertical_merge_algorithm_min_rows_to_activate": threshold_generator(
0.4, 0.4, 1, 1000000
),
"vertical_merge_algorithm_min_columns_to_activate": threshold_generator(
0.4, 0.4, 1, 100
),
"min_merge_bytes_to_use_direct_io": threshold_generator(
0.25, 0.25, 1, 10 * 1024 * 1024 * 1024
),
"index_granularity_bytes": lambda: random.randint(1024, 30 * 1024 * 1024),
"merge_max_block_size": lambda: random.randint(1, 8192 * 3),
"index_granularity": lambda: random.randint(1, 65536),
"min_bytes_for_wide_part": threshold_generator(0.3, 0.3, 0, 1024 * 1024 * 1024),
}
@staticmethod
def get_random_settings(args):
random_settings = []
for setting, generator in MergeTreeSettingsRandomizer.settings.items():
if setting not in args.changed_merge_tree_settings:
random_settings.append(f"{setting}={generator()}")
return random_settings
class TestResult:
def __init__(
self,
@ -618,41 +659,48 @@ class TestCase:
return testcase_args
def cli_random_settings(self) -> str:
return " ".join([f"--{setting}" for setting in self.random_settings])
@staticmethod
def cli_format_settings(settings_list) -> str:
return " ".join([f"--{setting}" for setting in settings_list])
def add_random_settings(self, args, client_options):
if self.tags and "no-random-settings" in self.tags:
return client_options
if args.no_random_settings:
return client_options
def has_show_create_table_in_test(self):
return not subprocess.call(["grep", "-iq", "show create", self.case_file])
if len(self.base_url_params) == 0:
os.environ["CLICKHOUSE_URL_PARAMS"] = "&".join(self.random_settings)
else:
os.environ["CLICKHOUSE_URL_PARAMS"] = (
self.base_url_params + "&" + "&".join(self.random_settings)
def add_random_settings(self, client_options):
new_options = ""
if self.randomize_settings:
if len(self.base_url_params) == 0:
os.environ["CLICKHOUSE_URL_PARAMS"] = "&".join(self.random_settings)
else:
os.environ["CLICKHOUSE_URL_PARAMS"] = (
self.base_url_params + "&" + "&".join(self.random_settings)
)
new_options += f" {self.cli_format_settings(self.random_settings)}"
if self.randomize_merge_tree_settings:
new_options += f" --allow_merge_tree_settings {self.cli_format_settings(self.merge_tree_random_settings)}"
if new_options != "":
new_options += " --allow_repeated_settings"
os.environ["CLICKHOUSE_CLIENT_OPT"] = (
self.base_client_options + new_options + " "
)
new_options = f" --allow_repeated_settings {self.cli_random_settings()}"
os.environ["CLICKHOUSE_CLIENT_OPT"] = (
self.base_client_options + new_options + " "
)
return client_options + new_options
def remove_random_settings_from_env(self):
os.environ["CLICKHOUSE_URL_PARAMS"] = self.base_url_params
os.environ["CLICKHOUSE_CLIENT_OPT"] = self.base_client_options
def add_info_about_settings(self, args, description):
if self.tags and "no-random-settings" in self.tags:
return description
if args.no_random_settings:
return description
def add_info_about_settings(self, description):
if self.randomize_settings:
description += f"\nSettings used in the test: {self.cli_format_settings(self.random_settings)}"
if self.randomize_merge_tree_settings:
description += f"\n\nMergeTree settings used in test: {self.cli_format_settings(self.merge_tree_random_settings)}"
return (
f"{description}\nSettings used in the test: {self.cli_random_settings()}\n"
)
return description + "\n"
def __init__(self, suite, case: str, args, is_concurrent: bool):
self.case: str = case # case file name
@ -676,12 +724,40 @@ class TestCase:
self.testcase_args = None
self.runs_count = 0
self.random_settings = SettingsRandomizer.get_random_settings()
has_no_random_settings_tag = self.tags and "no-random-settings" in self.tags
self.randomize_settings = not (
args.no_random_settings or has_no_random_settings_tag
)
has_no_random_merge_tree_settings_tag = (
self.tags and "no-random-merge-tree-settings" in self.tags
)
# If the test contains SHOW CREATE TABLE, do not randomize MergeTree settings:
# they would be added to the table definition and the test would fail
self.randomize_merge_tree_settings = not (
args.no_random_merge_tree_settings
or has_no_random_settings_tag
or has_no_random_merge_tree_settings_tag
or self.has_show_create_table_in_test()
)
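# Illustrative note, not part of this change: randomized MergeTree settings end up
# in the table definition, so a test running SHOW CREATE TABLE would print output
# such as "... ENGINE = MergeTree ORDER BY k SETTINGS index_granularity = 4096"
# (the setting name and value here are hypothetical) and stop matching its
# .reference file, which is why such tests are detected and excluded above.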
if self.randomize_settings:
self.random_settings = SettingsRandomizer.get_random_settings()
if self.randomize_merge_tree_settings:
self.merge_tree_random_settings = (
MergeTreeSettingsRandomizer.get_random_settings(args)
)
self.base_url_params = (
os.environ["CLICKHOUSE_URL_PARAMS"]
if "CLICKHOUSE_URL_PARAMS" in os.environ
else ""
)
self.base_client_options = (
os.environ["CLICKHOUSE_CLIENT_OPT"]
if "CLICKHOUSE_CLIENT_OPT" in os.environ
@ -1136,7 +1212,7 @@ class TestCase:
self.testcase_args = self.configure_testcase_args(
args, self.case_file, suite.suite_tmp_path
)
client_options = self.add_random_settings(args, client_options)
client_options = self.add_random_settings(client_options)
proc, stdout, stderr, debug_log, total_time = self.run_single_test(
server_logs_level, client_options
)
@ -1149,9 +1225,7 @@ class TestCase:
result.description = result.description.replace('\0', '')
if result.status == TestStatus.FAIL:
result.description = self.add_info_about_settings(
args, result.description
)
result.description = self.add_info_about_settings(result.description)
return result
except KeyboardInterrupt as e:
raise e
@ -1162,7 +1236,7 @@ class TestCase:
FailureReason.INTERNAL_QUERY_FAIL,
0.0,
self.add_info_about_settings(
args, self.get_description_from_exception_info(sys.exc_info())
self.get_description_from_exception_info(sys.exc_info())
),
)
except (ConnectionError, http.client.ImproperConnectionState):
@ -1172,7 +1246,7 @@ class TestCase:
FailureReason.SERVER_DIED,
0.0,
self.add_info_about_settings(
args, self.get_description_from_exception_info(sys.exc_info())
self.get_description_from_exception_info(sys.exc_info())
),
)
except Exception:
@ -1680,6 +1754,19 @@ def collect_build_flags(args):
return result
def collect_changed_merge_tree_settings(args):
changed_settings = (
clickhouse_execute(
args,
"SELECT name FROM system.merge_tree_settings WHERE changed",
)
.strip()
.splitlines()
)
return list(map(lambda s: s.decode(), changed_settings))
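
For context (not part of this change): clickhouse_execute returns the raw bytes of the query result, which is why each line is decoded above. A tiny self-contained illustration with a made-up server reply:

    raw = b"index_granularity\nmin_bytes_for_wide_part\n"  # hypothetical reply
    changed = raw.strip().splitlines()
    assert list(map(lambda s: s.decode(), changed)) == [
        "index_granularity",
        "min_bytes_for_wide_part",
    ]
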
def check_table_column(args, database, table, column):
return (
int(
@ -1984,6 +2071,7 @@ def main(args):
raise Exception(msg)
args.build_flags = collect_build_flags(args)
args.changed_merge_tree_settings = collect_changed_merge_tree_settings(args)
args.suppport_system_processes_is_all_data_sent = check_table_column(
args, "system", "processes", "is_all_data_sent"
)
@ -2328,7 +2416,12 @@ if __name__ == "__main__":
default=False,
help="Disable settings randomization",
)
parser.add_argument(
"--no-random-merge-tree-settings",
action="store_true",
default=False,
help="Disable MergeTree settings randomization",
)
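
For context (not part of this change): the new flag is independent of the existing --no-random-settings option; argparse maps the dashes to underscores, so the test-case constructor above can check both attributes separately. A small sketch:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--no-random-settings", action="store_true", default=False)
    parser.add_argument(
        "--no-random-merge-tree-settings", action="store_true", default=False
    )
    opts = parser.parse_args(["--no-random-merge-tree-settings"])
    assert opts.no_random_merge_tree_settings and not opts.no_random_settings
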
parser.add_argument(
"--run-by-hash-num",
type=int,


@ -2856,7 +2856,10 @@ class ClickHouseCluster:
SANITIZER_SIGN, from_host=True, filename="stderr.log"
):
sanitizer_assert_instance = instance.grep_in_log(
SANITIZER_SIGN, from_host=True, filename="stderr.log"
SANITIZER_SIGN,
from_host=True,
filename="stderr.log",
after=1000,
)
logging.error(
"Sanitizer in instance %s log %s",
@ -2897,8 +2900,8 @@ class ClickHouseCluster:
if sanitizer_assert_instance is not None:
raise Exception(
"Sanitizer assert found in {} for instance {}".format(
self.docker_logs_path, sanitizer_assert_instance
"Sanitizer assert found for instance {}".format(
sanitizer_assert_instance
)
)
if fatal_log is not None:
@ -3652,15 +3655,21 @@ class ClickHouseInstance:
)
return len(result) > 0
def grep_in_log(self, substring, from_host=False, filename="clickhouse-server.log"):
def grep_in_log(
self, substring, from_host=False, filename="clickhouse-server.log", after=None
):
logging.debug("grep in log called %s", substring)
if after is not None:
after_opt = "-A{}".format(after)
else:
after_opt = ""
if from_host:
# We check first that the file exists but want to look for all rotated logs as well
result = subprocess_check_call(
[
"bash",
"-c",
f'[ -f {self.logs_dir}/{filename} ] && zgrep -a "{substring}" {self.logs_dir}/{filename}* || true',
f'[ -f {self.logs_dir}/{filename} ] && zgrep {after_opt} -a "{substring}" {self.logs_dir}/{filename}* || true',
]
)
else:
@ -3668,7 +3677,7 @@ class ClickHouseInstance:
[
"bash",
"-c",
f'[ -f /var/log/clickhouse-server/{filename} ] && zgrep -a "{substring}" /var/log/clickhouse-server/{filename}* || true',
f'[ -f /var/log/clickhouse-server/{filename} ] && zgrep {after_opt} -a "{substring}" /var/log/clickhouse-server/{filename}* || true',
]
)
logging.debug("grep result %s", result)


@ -2953,6 +2953,7 @@ def test_rabbitmq_address(rabbitmq_cluster):
instance2.query("drop table rabbit_out sync")
@pytest.mark.skip(reason="FIXME: flaky (something with channel.start_consuming())")
def test_format_with_prefix_and_suffix(rabbitmq_cluster):
instance.query(
"""
@ -3001,6 +3002,7 @@ def test_format_with_prefix_and_suffix(rabbitmq_cluster):
)
@pytest.mark.skip(reason="FIXME: flaky (something with channel.start_consuming())")
def test_max_rows_per_message(rabbitmq_cluster):
num_rows = 5
@ -3073,6 +3075,7 @@ def test_max_rows_per_message(rabbitmq_cluster):
assert result == "0\t0\n10\t100\n20\t200\n30\t300\n40\t400\n"
@pytest.mark.skip(reason="FIXME: flaky (something with channel.start_consuming())")
def test_row_based_formats(rabbitmq_cluster):
num_rows = 10
@ -3169,6 +3172,7 @@ def test_row_based_formats(rabbitmq_cluster):
assert result == expected
@pytest.mark.skip(reason="FIXME: flaky (something with channel.start_consuming())")
def test_block_based_formats_1(rabbitmq_cluster):
instance.query(
"""
@ -3230,6 +3234,7 @@ def test_block_based_formats_1(rabbitmq_cluster):
]
@pytest.mark.skip(reason="FIXME: flaky (something with channel.start_consuming())")
def test_block_based_formats_2(rabbitmq_cluster):
num_rows = 100


@ -0,0 +1,12 @@
<test>
<query>SELECT arr FROM (SELECT cast(range(number % 10) as Array(Int128)) AS arr FROM (SELECT * FROM system.numbers LIMIT 10000000) WHERE length(arr) &lt;= 5) format Null</query>
<query>SELECT arr FROM (SELECT cast(range(number % 10) as Array(UInt128)) AS arr FROM (SELECT * FROM system.numbers LIMIT 10000000) WHERE length(arr) &lt;= 5) format Null</query>
<query>SELECT arr FROM (SELECT cast(range(number % 10) as Array(Int256)) AS arr FROM (SELECT * FROM system.numbers LIMIT 10000000) WHERE length(arr) &lt;= 5) format Null</query>
<query>SELECT arr FROM (SELECT cast(range(number % 10) as Array(UInt256)) AS arr FROM (SELECT * FROM system.numbers LIMIT 10000000) WHERE length(arr) &lt;= 5) format Null</query>
<query>SELECT arr FROM (SELECT cast(range(number % 10) as Array(Decimal32(0))) AS arr FROM (SELECT * FROM system.numbers LIMIT 10000000) WHERE length(arr) &lt;= 5) format Null</query>
<query>SELECT arr FROM (SELECT cast(range(number % 10) as Array(Decimal64(0))) AS arr FROM (SELECT * FROM system.numbers LIMIT 10000000) WHERE length(arr) &lt;= 5) format Null</query>
<query>SELECT arr FROM (SELECT cast(range(number % 10) as Array(Decimal128(0))) AS arr FROM (SELECT * FROM system.numbers LIMIT 10000000) WHERE length(arr) &lt;= 5) format Null</query>
<query>SELECT arr FROM (SELECT cast(range(number % 10) as Array(Decimal256(0))) AS arr FROM (SELECT * FROM system.numbers LIMIT 10000000) WHERE length(arr) &lt;= 5) format Null</query>
</test>


@ -0,0 +1,12 @@
<test>
<query>with cast([1,2,3,4] as Array(Int128)) as elem select arrayWithConstant(rand() % 10 + 5, materialize(elem)) from numbers(1000000) format Null</query>
<query>with cast([1,2,3,4] as Array(UInt128)) as elem select arrayWithConstant(rand() % 10 + 5, materialize(elem)) from numbers(1000000) format Null</query>
<query>with cast([1,2,3,4] as Array(Int256)) as elem select arrayWithConstant(rand() % 10 + 5, materialize(elem)) from numbers(1000000) format Null</query>
<query>with cast([1,2,3,4] as Array(UInt256)) as elem select arrayWithConstant(rand() % 10 + 5, materialize(elem)) from numbers(1000000) format Null</query>
<query>with cast([1,2,3,4] as Array(Decimal32(0))) as elem select arrayWithConstant(rand() % 10 + 5, materialize(elem)) from numbers(1000000) format Null</query>
<query>with cast([1,2,3,4] as Array(Decimal64(0))) as elem select arrayWithConstant(rand() % 10 + 5, materialize(elem)) from numbers(1000000) format Null</query>
<query>with cast([1,2,3,4] as Array(Decimal128(0))) as elem select arrayWithConstant(rand() % 10 + 5, materialize(elem)) from numbers(1000000) format Null</query>
<query>with cast([1,2,3,4] as Array(Decimal256(0))) as elem select arrayWithConstant(rand() % 10 + 5, materialize(elem)) from numbers(1000000) format Null</query>
</test>


@ -44,7 +44,7 @@ select 100, max2((select count() from logs where level = 'Warning' and message_f
group by message_format_string order by count() desc limit 1) / (select count() from logs), 0.005);
-- Same as above for Error
select 110, max2((select count() from logs where level = 'Warning' group by message_format_string order by count() desc limit 1) / (select count() from logs), 0.01);
select 110, max2((select count() from logs where level = 'Error' group by message_format_string order by count() desc limit 1) / (select count() from logs), 0.01);
-- Avoid too noisy messages: limit the number of messages with high frequency
select 120, max2(count(), 3) from (select count() / (select count() from logs) as freq, message_format_string from logs group by message_format_string having freq > 0.10);


@ -1,4 +1,5 @@
#!/usr/bin/env bash
# Tags: no-random-merge-tree-settings
set -e
@ -7,7 +8,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. "$CURDIR"/../shell_config.sh
$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS preferred_block_size_bytes"
$CLICKHOUSE_CLIENT -q "CREATE TABLE preferred_block_size_bytes (p Date, s String) ENGINE = MergeTree PARTITION BY p ORDER BY p SETTINGS index_granularity=1, index_granularity_bytes=0, min_bytes_for_wide_part = 0"
$CLICKHOUSE_CLIENT -q "CREATE TABLE preferred_block_size_bytes (p Date, s String) ENGINE = MergeTree PARTITION BY p ORDER BY p SETTINGS index_granularity=1, index_granularity_bytes=0, min_bytes_for_wide_part = 0, min_rows_for_wide_part = 0"
$CLICKHOUSE_CLIENT -q "INSERT INTO preferred_block_size_bytes (s) SELECT '16_bytes_-_-_-_' AS s FROM system.numbers LIMIT 10, 90"
$CLICKHOUSE_CLIENT -q "OPTIMIZE TABLE preferred_block_size_bytes"
$CLICKHOUSE_CLIENT --preferred_block_size_bytes=26 -q "SELECT DISTINCT blockSize(), ignore(p, s) FROM preferred_block_size_bytes"
@ -18,7 +19,7 @@ $CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS preferred_block_size_bytes"
# PREWHERE using empty column
$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS pbs"
$CLICKHOUSE_CLIENT -q "CREATE TABLE pbs (p Date, i UInt64, sa Array(String)) ENGINE = MergeTree PARTITION BY p ORDER BY p SETTINGS index_granularity=100, index_granularity_bytes=0, min_bytes_for_wide_part = 0"
$CLICKHOUSE_CLIENT -q "CREATE TABLE pbs (p Date, i UInt64, sa Array(String)) ENGINE = MergeTree PARTITION BY p ORDER BY p SETTINGS index_granularity=100, index_granularity_bytes=0, min_bytes_for_wide_part = 0, min_rows_for_wide_part = 0"
$CLICKHOUSE_CLIENT -q "INSERT INTO pbs (p, i, sa) SELECT toDate(i % 30) AS p, number AS i, ['a'] AS sa FROM system.numbers LIMIT 1000"
$CLICKHOUSE_CLIENT -q "ALTER TABLE pbs ADD COLUMN s UInt8 DEFAULT 0"
$CLICKHOUSE_CLIENT --preferred_block_size_bytes=100000 -q "SELECT count() FROM pbs PREWHERE s = 0"
@ -29,7 +30,7 @@ $CLICKHOUSE_CLIENT -q "DROP TABLE pbs"
# Nullable PREWHERE
$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS nullable_prewhere"
$CLICKHOUSE_CLIENT -q "CREATE TABLE nullable_prewhere (p Date, f Nullable(UInt64), d UInt64) ENGINE = MergeTree PARTITION BY p ORDER BY p SETTINGS index_granularity=8, index_granularity_bytes=0, min_bytes_for_wide_part = 0"
$CLICKHOUSE_CLIENT -q "CREATE TABLE nullable_prewhere (p Date, f Nullable(UInt64), d UInt64) ENGINE = MergeTree PARTITION BY p ORDER BY p SETTINGS index_granularity=8, index_granularity_bytes=0, min_bytes_for_wide_part = 0, min_rows_for_wide_part = 0"
$CLICKHOUSE_CLIENT -q "INSERT INTO nullable_prewhere SELECT toDate(0) AS p, if(number % 2 = 0, CAST(number AS Nullable(UInt64)), CAST(NULL AS Nullable(UInt64))) AS f, number as d FROM system.numbers LIMIT 1001"
$CLICKHOUSE_CLIENT -q "SELECT sum(d), sum(f), max(d) FROM nullable_prewhere PREWHERE NOT isNull(f)"
$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS nullable_prewhere"


@ -25,15 +25,9 @@
1
1
0
1
0
1
1
0
-
1
1
1
-
(1,2) ((1,2),(3,4)) 1 1
-


@ -28,16 +28,10 @@ select 1 in (0 + 1, 1, toInt8(sin(5)));
select (0 + 1, 1, toInt8(sin(5))) in (0 + 1, 1, toInt8(sin(5)));
select identity(tuple(1)) in (tuple(1), tuple(2));
select identity(tuple(1)) in (tuple(0), tuple(2));
select identity(tuple(1)) in (identity(tuple(1)), tuple(2));
select identity(tuple(1)) in (identity(tuple(0)), tuple(2));
select identity(tuple(1)) in (identity(tuple(1)), identity(tuple(2)));
select identity(tuple(1)) in (identity(tuple(1)), identity(identity(tuple(2))));
select identity(tuple(1)) in (identity(tuple(0)), identity(identity(tuple(2))));
select '-';
select identity((1, 2)) in (1, 2);
select identity((1, 2)) in ((1, 2), (3, 4));
select identity((1, 2)) in ((1, 2), identity((3, 4)));
select '-';
select (1,2) as x, ((1,2),(3,4)) as y, 1 in x, x in y;
@ -50,4 +44,3 @@ select (1, 2) in (select (1, 2));
select identity(tuple(1)) in (select tuple(1));
select identity((1, 2)) in (select 1, 2);
select identity((1, 2)) in (select (1, 2));


@ -1,4 +1,5 @@
#!/usr/bin/env bash
# Tags: no-random-merge-tree-settings
#--------------------------------------------
# Description of test result:

Some files were not shown because too many files have changed in this diff.