Merge branch 'master' into floating_seconds

This commit is contained in:
mergify[bot] 2022-05-30 19:18:35 +00:00 committed by GitHub
commit b43cfd056f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
454 changed files with 9036 additions and 11196 deletions

View File

@ -359,15 +359,11 @@ jobs:
steps:
- name: Set envs
run: |
DEPENDENCIES=$(cat << 'EOF' | jq '. | length'
${{ toJSON(needs) }}
EOF
)
echo "DEPENDENCIES=$DEPENDENCIES" >> "$GITHUB_ENV"
cat >> "$GITHUB_ENV" << 'EOF'
CHECK_NAME=ClickHouse build check (actions)
REPORTS_PATH=${{runner.temp}}/reports_dir
TEMP_PATH=${{runner.temp}}/report_check
NEEDS_DATA_PATH=${{runner.temp}}/needs.json
EOF
- name: Download json reports
uses: actions/download-artifact@v2
@ -382,8 +378,11 @@ jobs:
run: |
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cat > "$NEEDS_DATA_PATH" << 'EOF'
${{ toJSON(needs) }}
EOF
cd "$GITHUB_WORKSPACE/tests/ci"
python3 build_report_check.py "$CHECK_NAME" "$DEPENDENCIES"
python3 build_report_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |

View File

@ -7,11 +7,8 @@ concurrency:
on: # yamllint disable-line rule:truthy
schedule:
- cron: '0 */6 * * *'
workflow_run:
workflows: ["PullRequestCI"]
types:
- completed
workflow_dispatch:
workflow_call:
jobs:
KeeperJepsenRelease:
runs-on: [self-hosted, style-checker]

View File

@ -970,16 +970,12 @@ jobs:
steps:
- name: Set envs
run: |
DEPENDENCIES=$(cat << 'EOF' | jq '. | length'
${{ toJSON(needs) }}
EOF
)
echo "DEPENDENCIES=$DEPENDENCIES" >> "$GITHUB_ENV"
cat >> "$GITHUB_ENV" << 'EOF'
CHECK_NAME=ClickHouse build check (actions)
REPORTS_PATH=${{runner.temp}}/reports_dir
REPORTS_PATH=${{runner.temp}}/reports_dir
TEMP_PATH=${{runner.temp}}/report_check
NEEDS_DATA_PATH=${{runner.temp}}/needs.json
EOF
- name: Download json reports
uses: actions/download-artifact@v2
@ -994,8 +990,11 @@ jobs:
run: |
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cat > "$NEEDS_DATA_PATH" << 'EOF'
${{ toJSON(needs) }}
EOF
cd "$GITHUB_WORKSPACE/tests/ci"
python3 build_report_check.py "$CHECK_NAME" "$DEPENDENCIES"
python3 build_report_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |
@ -1018,15 +1017,11 @@ jobs:
steps:
- name: Set envs
run: |
DEPENDENCIES=$(cat << 'EOF' | jq '. | length'
${{ toJSON(needs) }}
EOF
)
echo "DEPENDENCIES=$DEPENDENCIES" >> "$GITHUB_ENV"
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/report_check
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=ClickHouse special build check (actions)
NEEDS_DATA_PATH=${{runner.temp}}/needs.json
EOF
- name: Download json reports
uses: actions/download-artifact@v2
@ -1041,8 +1036,11 @@ jobs:
run: |
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cat > "$NEEDS_DATA_PATH" << 'EOF'
${{ toJSON(needs) }}
EOF
cd "$GITHUB_WORKSPACE/tests/ci"
python3 build_report_check.py "$CHECK_NAME" "$DEPENDENCIES"
python3 build_report_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |

View File

@ -1025,15 +1025,11 @@ jobs:
steps:
- name: Set envs
run: |
DEPENDENCIES=$(cat << 'EOF' | jq '. | length'
${{ toJSON(needs) }}
EOF
)
echo "DEPENDENCIES=$DEPENDENCIES" >> "$GITHUB_ENV"
cat >> "$GITHUB_ENV" << 'EOF'
CHECK_NAME=ClickHouse build check (actions)
REPORTS_PATH=${{runner.temp}}/reports_dir
TEMP_PATH=${{runner.temp}}/report_check
NEEDS_DATA_PATH=${{runner.temp}}/needs.json
EOF
- name: Download json reports
uses: actions/download-artifact@v2
@ -1048,8 +1044,11 @@ jobs:
run: |
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cat > "$NEEDS_DATA_PATH" << 'EOF'
${{ toJSON(needs) }}
EOF
cd "$GITHUB_WORKSPACE/tests/ci"
python3 build_report_check.py "$CHECK_NAME" "$DEPENDENCIES"
python3 build_report_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |
@ -1073,15 +1072,11 @@ jobs:
steps:
- name: Set envs
run: |
DEPENDENCIES=$(cat << 'EOF' | jq '. | length'
${{ toJSON(needs) }}
EOF
)
echo "DEPENDENCIES=$DEPENDENCIES" >> "$GITHUB_ENV"
cat >> "$GITHUB_ENV" << 'EOF'
TEMP_PATH=${{runner.temp}}/report_check
REPORTS_PATH=${{runner.temp}}/reports_dir
CHECK_NAME=ClickHouse special build check (actions)
NEEDS_DATA_PATH=${{runner.temp}}/needs.json
EOF
- name: Download json reports
uses: actions/download-artifact@v2
@ -1096,8 +1091,11 @@ jobs:
run: |
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cat > "$NEEDS_DATA_PATH" << 'EOF'
${{ toJSON(needs) }}
EOF
cd "$GITHUB_WORKSPACE/tests/ci"
python3 build_report_check.py "$CHECK_NAME" "$DEPENDENCIES"
python3 build_report_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |
@ -3272,6 +3270,13 @@ jobs:
# shellcheck disable=SC2046
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH"
#############################################################################################
###################################### JEPSEN TESTS #########################################
#############################################################################################
Jepsen:
needs: [BuilderBinRelease]
uses: ./.github/workflows/jepsen.yml
FinishCheck:
needs:
- StyleCheck
@ -3336,6 +3341,7 @@ jobs:
- SplitBuildSmokeTest
- CompatibilityCheck
- IntegrationTestsFlakyCheck
- Jepsen
runs-on: [self-hosted, style-checker]
steps:
- name: Clear repository

View File

@ -442,16 +442,12 @@ jobs:
steps:
- name: Set envs
run: |
DEPENDENCIES=$(cat << 'EOF' | jq '. | length'
${{ toJSON(needs) }}
EOF
)
echo "DEPENDENCIES=$DEPENDENCIES" >> "$GITHUB_ENV"
cat >> "$GITHUB_ENV" << 'EOF'
CHECK_NAME=ClickHouse build check (actions)
REPORTS_PATH=${{runner.temp}}/reports_dir
REPORTS_PATH=${{runner.temp}}/reports_dir
TEMP_PATH=${{runner.temp}}/report_check
NEEDS_DATA_PATH=${{runner.temp}}/needs.json
EOF
- name: Download json reports
uses: actions/download-artifact@v2
@ -466,8 +462,11 @@ jobs:
run: |
sudo rm -fr "$TEMP_PATH"
mkdir -p "$TEMP_PATH"
cat > "$NEEDS_DATA_PATH" << 'EOF'
${{ toJSON(needs) }}
EOF
cd "$GITHUB_WORKSPACE/tests/ci"
python3 build_report_check.py "$CHECK_NAME" "$DEPENDENCIES"
python3 build_report_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |

View File

@ -63,7 +63,7 @@ RUN arch=${TARGETARCH:-amd64} \
&& chown clickhouse:clickhouse /var/lib/clickhouse \
&& chown root:clickhouse /var/log/clickhouse-server \
&& chmod +x /entrypoint.sh \
&& apk add --no-cache su-exec bash tzdata \
&& apk add --no-cache bash tzdata \
&& cp /usr/share/zoneinfo/UTC /etc/localtime \
&& echo "UTC" > /etc/timezone \
&& chmod ugo+Xrw -R /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client

View File

@ -3,8 +3,6 @@ FROM ubuntu:20.04
# see https://github.com/moby/moby/issues/4032#issuecomment-192327844
ARG DEBIAN_FRONTEND=noninteractive
COPY su-exec.c /su-exec.c
# ARG for quick switch to a given ubuntu mirror
ARG apt_archive="http://archive.ubuntu.com"
RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list \
@ -19,17 +17,11 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list
locales \
wget \
tzdata \
&& apt-get install -y --no-install-recommends tcc libc-dev && \
tcc /su-exec.c -o /bin/su-exec && \
chown root:root /bin/su-exec && \
chmod 0755 /bin/su-exec && \
rm /su-exec.c && \
apt-get purge -y --auto-remove tcc libc-dev libc-dev-bin libc6-dev linux-libc-dev \
&& apt-get clean
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="deb https://packages.clickhouse.com/deb ${REPO_CHANNEL} main"
ARG VERSION=22.1.1.*
ARG VERSION=22.5.1.*
ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
# set non-empty deb_location_url url to create a docker image
@ -51,21 +43,6 @@ ARG single_binary_location_url=""
# installed to prevent picking those uid / gid by some unrelated software.
# The same uid / gid (101) is used both for alpine and ubuntu.
# To drop privileges, we need 'su' command, that simply changes uid and gid.
# In fact, the 'su' command from Linux is not so simple, due to inherent vulnerability in Linux:
# https://ruderich.org/simon/notes/su-sudo-from-root-tty-hijacking
# It has to mitigate this drawback of Linux, and to do this, 'su' command is creating its own pseudo-terminal
# and forwarding commands. Due to some ridiculous circumstances, it does not work in Docker (or it does)
# and for these reasons people are using alternatives to the 'su' command in Docker,
# that don't mess with the terminal, don't care about closing the opened files, etc...
# but can only be safe to drop privileges inside Docker.
# The question - what implementation of 'su' command to use.
# It should be a simple script doing about just two syscalls.
# Some people tend to use 'gosu' tool that is written in Go.
# It is not used for several reasons:
# 1. Dependency on some foreign code in yet another programming language - does not sound alright.
# 2. Anselmo D. Adams suggested not to use it due to false positive alarms in some undisclosed security scanners.
ARG TARGETARCH
RUN arch=${TARGETARCH:-amd64} \

View File

@ -15,29 +15,15 @@ CLICKHOUSE_GID="${CLICKHOUSE_GID:-"$(id -g clickhouse)"}"
if [ "$(id -u)" = "0" ]; then
USER=$CLICKHOUSE_UID
GROUP=$CLICKHOUSE_GID
if command -v gosu &> /dev/null; then
gosu="gosu $USER:$GROUP"
elif command -v su-exec &> /dev/null; then
gosu="su-exec $USER:$GROUP"
else
echo "No gosu/su-exec detected!"
exit 1
fi
else
USER="$(id -u)"
GROUP="$(id -g)"
gosu=""
DO_CHOWN=0
fi
# set some vars
CLICKHOUSE_CONFIG="${CLICKHOUSE_CONFIG:-/etc/clickhouse-server/config.xml}"
if ! $gosu test -f "$CLICKHOUSE_CONFIG" -a -r "$CLICKHOUSE_CONFIG"; then
echo "Configuration file '$CLICKHOUSE_CONFIG' isn't readable by user with id '$USER'"
exit 1
fi
# get CH directories locations
DATA_DIR="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=path || true)"
TMP_DIR="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=tmp_path || true)"
@ -65,12 +51,7 @@ do
# check if variable not empty
[ -z "$dir" ] && continue
# ensure directories exist
if [ "$DO_CHOWN" = "1" ]; then
mkdir="mkdir"
else
mkdir="$gosu mkdir"
fi
if ! $mkdir -p "$dir"; then
if ! mkdir -p "$dir"; then
echo "Couldn't create necessary directory: $dir"
exit 1
fi
@ -81,9 +62,6 @@ do
if [ "$(stat -c %u "$dir")" != "$USER" ] || [ "$(stat -c %g "$dir")" != "$GROUP" ]; then
chown -R "$USER:$GROUP" "$dir"
fi
elif ! $gosu test -d "$dir" -a -w "$dir" -a -r "$dir"; then
echo "Necessary directory '$dir' isn't accessible by user with id '$USER'"
exit 1
fi
done
@ -117,7 +95,7 @@ if [ -n "$(ls /docker-entrypoint-initdb.d/)" ] || [ -n "$CLICKHOUSE_DB" ]; then
HTTP_PORT="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=http_port)"
# Listen only on localhost until the initialization is done
$gosu /usr/bin/clickhouse-server --config-file="$CLICKHOUSE_CONFIG" -- --listen_host=127.0.0.1 &
/usr/bin/clickhouse su "${USER}:${GROUP}" /usr/bin/clickhouse-server --config-file="$CLICKHOUSE_CONFIG" -- --listen_host=127.0.0.1 &
pid="$!"
# check if clickhouse is ready to accept connections
@ -173,7 +151,7 @@ if [[ $# -lt 1 ]] || [[ "$1" == "--"* ]]; then
# so the container can't be finished by ctrl+c
CLICKHOUSE_WATCHDOG_ENABLE=${CLICKHOUSE_WATCHDOG_ENABLE:-0}
export CLICKHOUSE_WATCHDOG_ENABLE
exec $gosu /usr/bin/clickhouse-server --config-file="$CLICKHOUSE_CONFIG" "$@"
/usr/bin/clickhouse su "${USER}:${GROUP}" /usr/bin/clickhouse-server --config-file="$CLICKHOUSE_CONFIG" "$@"
fi
# Otherwise, we assume the user wants to run their own process, for example a `bash` shell to explore this image

View File

@ -1,138 +0,0 @@
/*
https://github.com/ncopa/su-exec
The file is copy-pasted verbatim to avoid supply chain attacks.
The MIT License (MIT)
Copyright (c) 2015 ncopa
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/* set user and group id and exec */
#include <sys/types.h>
#include <err.h>
#include <errno.h>
#include <grp.h>
#include <pwd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
static char *argv0;
static void usage(int exitcode)
{
    /* Write the command-line synopsis to standard output, then terminate
       the process with the status chosen by the caller. */
    fprintf(stdout, "Usage: %s user-spec command [args]\n", argv0);
    exit(exitcode);
}
/*
 * Entry point: `argv[1]` is a user spec of the form `user[:group]` and
 * `argv[2..]` is the command to run.
 *
 * The function resolves the target uid/gid (numeric values are used as-is,
 * names are looked up in the passwd/group databases), sets HOME, installs the
 * supplementary group list, switches gid and then uid, and finally replaces
 * the process image with the requested command via execvp().
 *
 * Any failure is reported through err(1, ...), which terminates the process
 * with status 1. The trailing `return 1` is only reached if err() returns.
 */
int main(int argc, char *argv[])
{
char *user, *group, **cmdargv;
char *end;
/* Default to the current real uid/gid; used when the spec leaves a part empty. */
uid_t uid = getuid();
gid_t gid = getgid();
argv0 = argv[0];
/* Need at least a user spec and a command. */
/* NOTE(review): too few arguments prints usage but exits with status 0 — confirm this is intended. */
if (argc < 3)
usage(0);
user = argv[1];
/* Split "user:group" in place: the ':' is overwritten with NUL and `group` points past it. */
group = strchr(user, ':');
if (group)
*group++ = '\0';
cmdargv = &argv[2];
struct passwd *pw = NULL;
if (user[0] != '\0') {
/* A spec that parses entirely as a decimal number is taken as a numeric uid... */
uid_t nuid = strtol(user, &end, 10);
if (*end == '\0')
uid = nuid;
else {
/* ...otherwise it is treated as a user name and must exist. */
pw = getpwnam(user);
if (pw == NULL)
err(1, "getpwnam(%s)", user);
}
}
/* For a numeric (or empty) user, try to find a passwd entry for the uid; it may legitimately be absent. */
if (pw == NULL) {
pw = getpwuid(uid);
}
/* When a passwd entry is known, take uid and the primary gid from it. */
if (pw != NULL) {
uid = pw->pw_uid;
gid = pw->pw_gid;
}
/* HOME is always overwritten: the entry's home directory, or "/" when there is no entry. */
setenv("HOME", pw != NULL ? pw->pw_dir : "/", 1);
if (group && group[0] != '\0') {
/* group was specified, ignore grouplist for setgroups later */
pw = NULL;
/* Same numeric-or-name resolution as for the user part. */
gid_t ngid = strtol(group, &end, 10);
if (*end == '\0')
gid = ngid;
else {
struct group *gr = getgrnam(group);
if (gr == NULL)
err(1, "getgrnam(%s)", group);
gid = gr->gr_gid;
}
}
if (pw == NULL) {
/* No passwd entry (or an explicit group was given): the supplementary list is just `gid`. */
if (setgroups(1, &gid) < 0)
err(1, "setgroups(%i)", gid);
} else {
/* Fetch the user's full group list. The first call passes an empty buffer;
   per getgrouplist(3) a negative return is expected to leave the required
   count in `ngroups`, so the buffer is grown and the call retried. */
int ngroups = 0;
gid_t *glist = NULL;
while (1) {
int r = getgrouplist(pw->pw_name, gid, glist, &ngroups);
if (r >= 0) {
if (setgroups(ngroups, glist) < 0)
err(1, "setgroups");
break;
}
glist = realloc(glist, ngroups * sizeof(gid_t));
if (glist == NULL)
err(1, "malloc");
}
}
/* Order matters: supplementary groups and gid are changed before the uid is changed. */
if (setgid(gid) < 0)
err(1, "setgid(%i)", gid);
if (setuid(uid) < 0)
err(1, "setuid(%i)", uid);
/* Replace this process with the command; control continues below only if execvp fails. */
execvp(cmdargv[0], cmdargv);
err(1, "%s", cmdargv[0]);
return 1;
}

View File

@ -355,22 +355,8 @@ fi
cat > report.html <<EOF ||:
<!DOCTYPE html>
<html lang="en">
<link rel="preload" as="font" href="https://yastatic.net/adv-www/_/sUYVCPUAQE7ExrvMS7FoISoO83s.woff2" type="font/woff2" crossorigin="anonymous"/>
<style>
@font-face {
font-family:'Yandex Sans Display Web';
src:url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot);
src:url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot?#iefix) format('embedded-opentype'),
url(https://yastatic.net/adv-www/_/sUYVCPUAQE7ExrvMS7FoISoO83s.woff2) format('woff2'),
url(https://yastatic.net/adv-www/_/v2Sve_obH3rKm6rKrtSQpf-eB7U.woff) format('woff'),
url(https://yastatic.net/adv-www/_/PzD8hWLMunow5i3RfJ6WQJAL7aI.ttf) format('truetype'),
url(https://yastatic.net/adv-www/_/lF_KG5g4tpQNlYIgA0e77fBSZ5s.svg#YandexSansDisplayWeb-Regular) format('svg');
font-weight:400;
font-style:normal;
font-stretch:normal
}
body { font-family: "Yandex Sans Display Web", Arial, sans-serif; background: #EEE; }
body { font-family: "DejaVu Sans", "Noto Sans", Arial, sans-serif; background: #EEE; }
h1 { margin-left: 10px; }
th, td { border: 0; padding: 5px 10px 5px 10px; text-align: left; vertical-align: top; line-height: 1.5; background-color: #FFF;
td { white-space: pre; font-family: Monospace, Courier New; }
@ -378,7 +364,6 @@ border: 0; box-shadow: 0 0 0 1px rgba(0, 0, 0, 0.05), 0 8px 25px -5px rgba(0, 0,
a { color: #06F; text-decoration: none; }
a:hover, a:active { color: #F40; text-decoration: underline; }
table { border: 0; }
.main { margin-left: 10%; }
p.links a { padding: 5px; margin: 3px; background: #FFF; line-height: 2; white-space: nowrap; box-shadow: 0 0 0 1px rgba(0, 0, 0, 0.05), 0 8px 25px -5px rgba(0, 0, 0, 0.1); }
th { cursor: pointer; }

View File

@ -92,7 +92,7 @@ The list of third-party libraries can be obtained by the following query:
SELECT library_name, license_type, license_path FROM system.licenses ORDER BY library_name COLLATE 'en';
```
[Example](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIGxpYnJhcnlfbmFtZSwgbGljZW5zZV90eXBlLCBsaWNlbnNlX3BhdGggRlJPTSBzeXN0ZW0ubGljZW5zZXMgT1JERVIgQlkgbGlicmFyeV9uYW1lIENPTExBVEUgJ2VuJw==)
[Example](https://play.clickhouse.com/play?user=play#U0VMRUNUIGxpYnJhcnlfbmFtZSwgbGljZW5zZV90eXBlLCBsaWNlbnNlX3BhdGggRlJPTSBzeXN0ZW0ubGljZW5zZXMgT1JERVIgQlkgbGlicmFyeV9uYW1lIENPTExBVEUgJ2VuJw==)
## Adding new third-party libraries and maintaining patches in third-party libraries {#adding-third-party-libraries}

View File

@ -411,6 +411,6 @@ ORDER BY yr,
mo;
```
The data is also available for interactive queries in the [Playground](https://gh-api.clickhouse.com/play?user=play), [example](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==).
The data is also available for interactive queries in the [Playground](https://play.clickhouse.com/play?user=play), [example](https://play.clickhouse.com/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==).
[Original article](https://clickhouse.com/docs/en/getting_started/example_datasets/brown-benchmark/) <!--hide-->

View File

@ -126,6 +126,6 @@ SELECT count() FROM cell_towers WHERE pointInPolygon((lon, lat), (SELECT * FROM
1 rows in set. Elapsed: 0.067 sec. Processed 43.28 million rows, 692.42 MB (645.83 million rows/s., 10.33 GB/s.)
```
The data is also available for interactive queries in the [Playground](https://gh-api.clickhouse.com/play?user=play), [example](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIG1jYywgY291bnQoKSBGUk9NIGNlbGxfdG93ZXJzIEdST1VQIEJZIG1jYyBPUkRFUiBCWSBjb3VudCgpIERFU0M=).
The data is also available for interactive queries in the [Playground](https://play.clickhouse.com/play?user=play), [example](https://play.clickhouse.com/play?user=play#U0VMRUNUIG1jYywgY291bnQoKSBGUk9NIGNlbGxfdG93ZXJzIEdST1VQIEJZIG1jYyBPUkRFUiBCWSBjb3VudCgpIERFU0M=).
Although you cannot create temporary tables there.

View File

@ -351,4 +351,4 @@ At least they have caviar with vodka. Very nice.
## Online Playground {#playground}
The data is uploaded to ClickHouse Playground, [example](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUCiAgICByb3VuZCh0b1VJbnQzMk9yWmVybyhleHRyYWN0KG1lbnVfZGF0ZSwgJ15cXGR7NH0nKSksIC0xKSBBUyBkLAogICAgY291bnQoKSwKICAgIHJvdW5kKGF2ZyhwcmljZSksIDIpLAogICAgYmFyKGF2ZyhwcmljZSksIDAsIDUwLCAxMDApLAogICAgYW55KGRpc2hfbmFtZSkKRlJPTSBtZW51X2l0ZW1fZGVub3JtCldIRVJFIChtZW51X2N1cnJlbmN5IElOICgnRG9sbGFycycsICcnKSkgQU5EIChkID4gMCkgQU5EIChkIDwgMjAyMikgQU5EIChkaXNoX25hbWUgSUxJS0UgJyVjYXZpYXIlJykKR1JPVVAgQlkgZApPUkRFUiBCWSBkIEFTQw==).
The data is uploaded to ClickHouse Playground, [example](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICByb3VuZCh0b1VJbnQzMk9yWmVybyhleHRyYWN0KG1lbnVfZGF0ZSwgJ15cXGR7NH0nKSksIC0xKSBBUyBkLAogICAgY291bnQoKSwKICAgIHJvdW5kKGF2ZyhwcmljZSksIDIpLAogICAgYmFyKGF2ZyhwcmljZSksIDAsIDUwLCAxMDApLAogICAgYW55KGRpc2hfbmFtZSkKRlJPTSBtZW51X2l0ZW1fZGVub3JtCldIRVJFIChtZW51X2N1cnJlbmN5IElOICgnRG9sbGFycycsICcnKSkgQU5EIChkID4gMCkgQU5EIChkIDwgMjAyMikgQU5EIChkaXNoX25hbWUgSUxJS0UgJyVjYXZpYXIlJykKR1JPVVAgQlkgZApPUkRFUiBCWSBkIEFTQw==).

View File

@ -5,20 +5,9 @@ description: Dataset containing the on-time performance of airline flights
# OnTime
This dataset can be obtained in two ways:
This dataset contains data from the Bureau of Transportation Statistics.
- import from raw data
- download of prepared partitions
## Import from Raw Data {#import-from-raw-data}
Downloading data:
``` bash
wget --no-check-certificate --continue https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_{1987..2021}_{1..12}.zip
```
Creating a table:
## Creating a table
``` sql
CREATE TABLE `ontime`
@ -29,140 +18,138 @@ CREATE TABLE `ontime`
`DayofMonth` UInt8,
`DayOfWeek` UInt8,
`FlightDate` Date,
`Reporting_Airline` String,
`Reporting_Airline` LowCardinality(String),
`DOT_ID_Reporting_Airline` Int32,
`IATA_CODE_Reporting_Airline` String,
`Tail_Number` String,
`Flight_Number_Reporting_Airline` String,
`IATA_CODE_Reporting_Airline` LowCardinality(String),
`Tail_Number` LowCardinality(String),
`Flight_Number_Reporting_Airline` LowCardinality(String),
`OriginAirportID` Int32,
`OriginAirportSeqID` Int32,
`OriginCityMarketID` Int32,
`Origin` FixedString(5),
`OriginCityName` String,
`OriginCityName` LowCardinality(String),
`OriginState` FixedString(2),
`OriginStateFips` String,
`OriginStateName` String,
`OriginStateFips` FixedString(2),
`OriginStateName` LowCardinality(String),
`OriginWac` Int32,
`DestAirportID` Int32,
`DestAirportSeqID` Int32,
`DestCityMarketID` Int32,
`Dest` FixedString(5),
`DestCityName` String,
`DestCityName` LowCardinality(String),
`DestState` FixedString(2),
`DestStateFips` String,
`DestStateName` String,
`DestStateFips` FixedString(2),
`DestStateName` LowCardinality(String),
`DestWac` Int32,
`CRSDepTime` Int32,
`DepTime` Int32,
`DepDelay` Int32,
`DepDelayMinutes` Int32,
`DepDel15` Int32,
`DepartureDelayGroups` String,
`DepTimeBlk` String,
`DepartureDelayGroups` LowCardinality(String),
`DepTimeBlk` LowCardinality(String),
`TaxiOut` Int32,
`WheelsOff` Int32,
`WheelsOn` Int32,
`WheelsOff` LowCardinality(String),
`WheelsOn` LowCardinality(String),
`TaxiIn` Int32,
`CRSArrTime` Int32,
`ArrTime` Int32,
`ArrDelay` Int32,
`ArrDelayMinutes` Int32,
`ArrDel15` Int32,
`ArrivalDelayGroups` Int32,
`ArrTimeBlk` String,
`Cancelled` UInt8,
`ArrivalDelayGroups` LowCardinality(String),
`ArrTimeBlk` LowCardinality(String),
`Cancelled` Int8,
`CancellationCode` FixedString(1),
`Diverted` UInt8,
`Diverted` Int8,
`CRSElapsedTime` Int32,
`ActualElapsedTime` Int32,
`AirTime` Nullable(Int32),
`AirTime` Int32,
`Flights` Int32,
`Distance` Int32,
`DistanceGroup` UInt8,
`DistanceGroup` Int8,
`CarrierDelay` Int32,
`WeatherDelay` Int32,
`NASDelay` Int32,
`SecurityDelay` Int32,
`LateAircraftDelay` Int32,
`FirstDepTime` String,
`TotalAddGTime` String,
`LongestAddGTime` String,
`DivAirportLandings` String,
`DivReachedDest` String,
`DivActualElapsedTime` String,
`DivArrDelay` String,
`DivDistance` String,
`Div1Airport` String,
`FirstDepTime` Int16,
`TotalAddGTime` Int16,
`LongestAddGTime` Int16,
`DivAirportLandings` Int8,
`DivReachedDest` Int8,
`DivActualElapsedTime` Int16,
`DivArrDelay` Int16,
`DivDistance` Int16,
`Div1Airport` LowCardinality(String),
`Div1AirportID` Int32,
`Div1AirportSeqID` Int32,
`Div1WheelsOn` String,
`Div1TotalGTime` String,
`Div1LongestGTime` String,
`Div1WheelsOff` String,
`Div1TailNum` String,
`Div2Airport` String,
`Div1WheelsOn` Int16,
`Div1TotalGTime` Int16,
`Div1LongestGTime` Int16,
`Div1WheelsOff` Int16,
`Div1TailNum` LowCardinality(String),
`Div2Airport` LowCardinality(String),
`Div2AirportID` Int32,
`Div2AirportSeqID` Int32,
`Div2WheelsOn` String,
`Div2TotalGTime` String,
`Div2LongestGTime` String,
`Div2WheelsOff` String,
`Div2TailNum` String,
`Div3Airport` String,
`Div2WheelsOn` Int16,
`Div2TotalGTime` Int16,
`Div2LongestGTime` Int16,
`Div2WheelsOff` Int16,
`Div2TailNum` LowCardinality(String),
`Div3Airport` LowCardinality(String),
`Div3AirportID` Int32,
`Div3AirportSeqID` Int32,
`Div3WheelsOn` String,
`Div3TotalGTime` String,
`Div3LongestGTime` String,
`Div3WheelsOff` String,
`Div3TailNum` String,
`Div4Airport` String,
`Div3WheelsOn` Int16,
`Div3TotalGTime` Int16,
`Div3LongestGTime` Int16,
`Div3WheelsOff` Int16,
`Div3TailNum` LowCardinality(String),
`Div4Airport` LowCardinality(String),
`Div4AirportID` Int32,
`Div4AirportSeqID` Int32,
`Div4WheelsOn` String,
`Div4TotalGTime` String,
`Div4LongestGTime` String,
`Div4WheelsOff` String,
`Div4TailNum` String,
`Div5Airport` String,
`Div4WheelsOn` Int16,
`Div4TotalGTime` Int16,
`Div4LongestGTime` Int16,
`Div4WheelsOff` Int16,
`Div4TailNum` LowCardinality(String),
`Div5Airport` LowCardinality(String),
`Div5AirportID` Int32,
`Div5AirportSeqID` Int32,
`Div5WheelsOn` String,
`Div5TotalGTime` String,
`Div5LongestGTime` String,
`Div5WheelsOff` String,
`Div5TailNum` String
`Div5WheelsOn` Int16,
`Div5TotalGTime` Int16,
`Div5LongestGTime` Int16,
`Div5WheelsOff` Int16,
`Div5TailNum` LowCardinality(String)
) ENGINE = MergeTree
PARTITION BY Year
ORDER BY (IATA_CODE_Reporting_Airline, FlightDate)
SETTINGS index_granularity = 8192;
ORDER BY (Year, Quarter, Month, DayofMonth, FlightDate, IATA_CODE_Reporting_Airline);
```
## Import from Raw Data {#import-from-raw-data}
Downloading data:
``` bash
wget --no-check-certificate --continue https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_{1987..2022}_{1..12}.zip
```
Loading data with multiple threads:
``` bash
ls -1 *.zip | xargs -I{} -P $(nproc) bash -c "echo {}; unzip -cq {} '*.csv' | sed 's/\.00//g' | clickhouse-client --input_format_with_names_use_header=0 --query='INSERT INTO ontime FORMAT CSVWithNames'"
ls -1 *.zip | xargs -I{} -P $(nproc) bash -c "echo {}; unzip -cq {} '*.csv' | sed 's/\.00//g' | clickhouse-client --input_format_csv_empty_as_default 1 --query='INSERT INTO ontime FORMAT CSVWithNames'"
```
(if you run into a memory shortage or other issues on your server, remove the `-P $(nproc)` part)
## Download of Prepared Partitions {#download-of-prepared-partitions}
## Import from a saved copy
``` bash
$ curl -O https://datasets.clickhouse.com/ontime/partitions/ontime.tar
$ tar xvf ontime.tar -C /var/lib/clickhouse # path to ClickHouse data directory
$ # check permissions of unpacked data, fix if required
$ sudo service clickhouse-server restart
$ clickhouse-client --query "select count(*) from datasets.ontime"
Alternatively, you can import data from a saved copy by the following query:
```
INSERT INTO ontime SELECT * FROM s3('https://clickhouse-public-datasets.s3.amazonaws.com/ontime/csv_by_year/*.csv.gz', CSVWithNames) SETTINGS max_insert_threads = 40;
```
:::note
If you will run the queries described below, you have to use the full table name, `datasets.ontime`.
:::
!!! info "Info"
If you are using the prepared partitions or the Online Playground replace any occurrence of `IATA_CODE_Reporting_Airline` or `IATA_CODE_Reporting_Airline AS Carrier` in the following queries with `Carrier` (see `describe ontime`).
The snapshot was created on 2022-05-29.
## Queries {#queries}
@ -398,7 +385,7 @@ ORDER BY c DESC
LIMIT 10;
```
You can also play with the data in Playground, [example](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIERheU9mV2VlaywgY291bnQoKikgQVMgYwpGUk9NIG9udGltZQpXSEVSRSBZZWFyPj0yMDAwIEFORCBZZWFyPD0yMDA4CkdST1VQIEJZIERheU9mV2VlawpPUkRFUiBCWSBjIERFU0M7Cg==).
You can also play with the data in Playground, [example](https://play.clickhouse.com/play?user=play#U0VMRUNUIERheU9mV2VlaywgY291bnQoKikgQVMgYwpGUk9NIG9udGltZQpXSEVSRSBZZWFyPj0yMDAwIEFORCBZZWFyPD0yMDA4CkdST1VQIEJZIERheU9mV2VlawpPUkRFUiBCWSBjIERFU0M7Cg==).
This performance test was created by Vadim Tkachenko. See:

View File

@ -417,4 +417,4 @@ Result:
### Online Playground {#playground}
You can test other queries to this data set using the interactive resource [Online Playground](https://gh-api.clickhouse.com/play?user=play). For example, [like this](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUCiAgICBvcmlnaW4sCiAgICBjb3VudCgpLAogICAgcm91bmQoYXZnKGdlb0Rpc3RhbmNlKGxvbmdpdHVkZV8xLCBsYXRpdHVkZV8xLCBsb25naXR1ZGVfMiwgbGF0aXR1ZGVfMikpKSBBUyBkaXN0YW5jZSwKICAgIGJhcihkaXN0YW5jZSwgMCwgMTAwMDAwMDAsIDEwMCkgQVMgYmFyCkZST00gb3BlbnNreQpXSEVSRSBvcmlnaW4gIT0gJycKR1JPVVAgQlkgb3JpZ2luCk9SREVSIEJZIGNvdW50KCkgREVTQwpMSU1JVCAxMDA=). However, please note that you cannot create temporary tables here.
You can test other queries to this data set using the interactive resource [Online Playground](https://play.clickhouse.com/play?user=play). For example, [like this](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBvcmlnaW4sCiAgICBjb3VudCgpLAogICAgcm91bmQoYXZnKGdlb0Rpc3RhbmNlKGxvbmdpdHVkZV8xLCBsYXRpdHVkZV8xLCBsb25naXR1ZGVfMiwgbGF0aXR1ZGVfMikpKSBBUyBkaXN0YW5jZSwKICAgIGJhcihkaXN0YW5jZSwgMCwgMTAwMDAwMDAsIDEwMCkgQVMgYmFyCkZST00gb3BlbnNreQpXSEVSRSBvcmlnaW4gIT0gJycKR1JPVVAgQlkgb3JpZ2luCk9SREVSIEJZIGNvdW50KCkgREVTQwpMSU1JVCAxMDA=). However, please note that you cannot create temporary tables here.

View File

@ -334,6 +334,6 @@ Result:
### Online Playground
The dataset is also available in the [Online Playground](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhcnJheUpvaW4oTkVSKSBBUyBrLAogICAgY291bnQoKSBBUyBjCkZST00gcmVjaXBlcwpHUk9VUCBCWSBrCk9SREVSIEJZIGMgREVTQwpMSU1JVCA1MA==).
The dataset is also available in the [Online Playground](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhcnJheUpvaW4oTkVSKSBBUyBrLAogICAgY291bnQoKSBBUyBjCkZST00gcmVjaXBlcwpHUk9VUCBCWSBrCk9SREVSIEJZIGMgREVTQwpMSU1JVCA1MA==).
[Original article](https://clickhouse.com/docs/en/getting-started/example-datasets/recipes/) <!--hide-->

View File

@ -26,7 +26,6 @@ $ ./dbgen -s 1000 -T c
$ ./dbgen -s 1000 -T l
$ ./dbgen -s 1000 -T p
$ ./dbgen -s 1000 -T s
$ ./dbgen -s 1000 -T d
```
Creating tables in ClickHouse:
@ -109,10 +108,8 @@ Converting “star schema” to denormalized “flat schema”:
SET max_memory_usage = 20000000000;
CREATE TABLE lineorder_flat
ENGINE = MergeTree
PARTITION BY toYear(LO_ORDERDATE)
ORDER BY (LO_ORDERDATE, LO_ORDERKEY) AS
SELECT
ENGINE = MergeTree ORDER BY (LO_ORDERDATE, LO_ORDERKEY)
AS SELECT
l.LO_ORDERKEY AS LO_ORDERKEY,
l.LO_LINENUMBER AS LO_LINENUMBER,
l.LO_CUSTKEY AS LO_CUSTKEY,

View File

@ -646,4 +646,4 @@ no projection: 100 rows in set. Elapsed: 0.069 sec. Processed 26.32 million rows
### Test It in Playground {#playground}
The dataset is also available in the [Online Playground](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIHRvd24sIGRpc3RyaWN0LCBjb3VudCgpIEFTIGMsIHJvdW5kKGF2ZyhwcmljZSkpIEFTIHByaWNlLCBiYXIocHJpY2UsIDAsIDUwMDAwMDAsIDEwMCkgRlJPTSB1a19wcmljZV9wYWlkIFdIRVJFIGRhdGUgPj0gJzIwMjAtMDEtMDEnIEdST1VQIEJZIHRvd24sIGRpc3RyaWN0IEhBVklORyBjID49IDEwMCBPUkRFUiBCWSBwcmljZSBERVNDIExJTUlUIDEwMA==).
The dataset is also available in the [Online Playground](https://play.clickhouse.com/play?user=play#U0VMRUNUIHRvd24sIGRpc3RyaWN0LCBjb3VudCgpIEFTIGMsIHJvdW5kKGF2ZyhwcmljZSkpIEFTIHByaWNlLCBiYXIocHJpY2UsIDAsIDUwMDAwMDAsIDEwMCkgRlJPTSB1a19wcmljZV9wYWlkIFdIRVJFIGRhdGUgPj0gJzIwMjAtMDEtMDEnIEdST1VQIEJZIHRvd24sIGRpc3RyaWN0IEhBVklORyBjID49IDEwMCBPUkRFUiBCWSBwcmljZSBERVNDIExJTUlUIDEwMA==).

View File

@ -147,6 +147,16 @@ Features:
[Zeppelin-Interpreter-for-ClickHouse](https://github.com/SiderZhang/Zeppelin-Interpreter-for-ClickHouse) is a [Zeppelin](https://zeppelin.apache.org) interpreter for ClickHouse. Compared with JDBC interpreter, it can provide better timeout control for long running queries.
### ClickCat {#clickcat}
[ClickCat](https://github.com/open-botech/ClickCat) is a friendly user interface that lets you search, explore and visualize your ClickHouse data.
Features:
- An online SQL editor that can run your SQL code without installing anything.
- You can observe all processes and mutations, and kill unfinished processes from the UI.
- Metrics include Cluster Analysis, Data Analysis, and Query Analysis.
## Commercial {#commercial}
### DataGrip {#datagrip}

View File

@ -21,7 +21,7 @@ ClickHouse generates an exception for errors with dictionaries. Examples of erro
- The dictionary being accessed could not be loaded.
- Error querying a `cached` dictionary.
You can view the list of external dictionaries and their statuses in the `system.dictionaries` table.
You can view the list of external dictionaries and their statuses in the [system.dictionaries](../../../operations/system-tables/dictionaries.md) table.
The configuration looks like this:
@ -48,6 +48,35 @@ LAYOUT(LAYOUT_TYPE(param value)) -- layout settings
...
```
Dictionaries without the word `complex-key*` in the layout name have a key of the [UInt64](../../../sql-reference/data-types/int-uint.md) type; `complex-key*` dictionaries have a composite key (complex, with arbitrary types).
[UInt64](../../../sql-reference/data-types/int-uint.md) keys in XML dictionaries are defined with the `<id>` tag.
Configuration example (column key_column has UInt64 type):
```xml
...
<structure>
<id>
<name>key_column</name>
</id>
...
```
Composite `complex` keys in XML dictionaries are defined with the `<key>` tag.
Configuration example of a composite key (key has one element with [String](../../../sql-reference/data-types/string.md) type):
```xml
...
<structure>
<key>
<attribute>
<name>country_code</name>
<type>String</type>
</attribute>
</key>
...
```
## Ways to Store Dictionaries in Memory {#ways-to-store-dictionaries-in-memory}
- [flat](#flat)
@ -98,6 +127,8 @@ LAYOUT(FLAT(INITIAL_ARRAY_SIZE 50000 MAX_ARRAY_SIZE 5000000))
The dictionary is completely stored in memory in the form of a hash table. The dictionary can contain any number of elements with any identifiers. In practice, the number of keys can reach tens of millions of items.
The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type.
If `preallocate` is `true` (default is `false`) the hash table will be preallocated (this will make the dictionary load faster). But note that you should use it only if:
- The source supports an approximate number of elements (for now it is supported only by the `ClickHouse` source).
@ -125,6 +156,8 @@ LAYOUT(HASHED(PREALLOCATE 0))
Similar to `hashed`, but uses less memory in favor of more CPU usage.
The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type.
It can also be preallocated in the same way as `hashed` (with `preallocate` set to `true`); note that this is even more significant for `sparse_hashed`.
Configuration example:
@ -181,6 +214,8 @@ LAYOUT(COMPLEX_KEY_SPARSE_HASHED())
The dictionary is completely stored in memory. Each attribute is stored in an array. The key attribute is stored in the form of a hashed table where value is an index in the attributes array. The dictionary can contain any number of elements with any identifiers. In practice, the number of keys can reach tens of millions of items.
The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type.
All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety.
Configuration example:
@ -220,6 +255,7 @@ LAYOUT(COMPLEX_KEY_HASHED_ARRAY())
The dictionary is stored in memory in the form of a hash table with an ordered array of ranges and their corresponding values.
The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type.
This storage method works the same way as hashed and allows using date/time (arbitrary numeric type) ranges in addition to the key.
Example: The table contains discounts for each advertiser in the format:
@ -360,6 +396,8 @@ RANGE(MIN StartDate MAX EndDate);
The dictionary is stored in a cache that has a fixed number of cells. These cells contain frequently used elements.
The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type.
When searching for a dictionary, the cache is searched first. For each block of data, all keys that are not found in the cache or are outdated are requested from the source using `SELECT attrs... FROM db.table WHERE id IN (k1, k2, ...)`. The received data is then written to the cache.
If keys are not found in dictionary, then update cache task is created and added into update queue. Update queue properties can be controlled with settings `max_update_queue_size`, `update_queue_push_timeout_milliseconds`, `query_wait_timeout_milliseconds`, `max_threads_for_updates`.
@ -420,6 +458,8 @@ This type of storage is for use with composite [keys](../../../sql-reference/dic
Similar to `cache`, but stores data on SSD and index in RAM. All cache dictionary settings related to update queue can also be applied to SSD cache dictionaries.
The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type.
``` xml
<layout>
<ssd_cache>
@ -452,7 +492,7 @@ This type of storage is for use with composite [keys](../../../sql-reference/dic
The dictionary is not stored in memory and directly goes to the source during the processing of a request.
The dictionary key has the `UInt64` type.
The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type.
All types of [sources](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md), except local files, are supported.

View File

@ -92,7 +92,7 @@ sidebar_label: "Используемые сторонние библиотеки
SELECT library_name, license_type, license_path FROM system.licenses ORDER BY library_name COLLATE 'en';
```
[Пример](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIGxpYnJhcnlfbmFtZSwgbGljZW5zZV90eXBlLCBsaWNlbnNlX3BhdGggRlJPTSBzeXN0ZW0ubGljZW5zZXMgT1JERVIgQlkgbGlicmFyeV9uYW1lIENPTExBVEUgJ2VuJw==)
[Пример](https://play.clickhouse.com/play?user=play#U0VMRUNUIGxpYnJhcnlfbmFtZSwgbGljZW5zZV90eXBlLCBsaWNlbnNlX3BhdGggRlJPTSBzeXN0ZW0ubGljZW5zZXMgT1JERVIgQlkgbGlicmFyeV9uYW1lIENPTExBVEUgJ2VuJw==)
## Рекомендации по добавлению сторонних библиотек и поддержанию в них пользовательских изменений {#adding-third-party-libraries}

View File

@ -411,5 +411,4 @@ ORDER BY yr,
mo;
```
Данные также доступны для работы с интерактивными запросами через [Playground](https://gh-api.clickhouse.com/play?user=play), [пример](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==).
Данные также доступны для работы с интерактивными запросами через [Playground](https://play.clickhouse.com/play?user=play), [пример](https://play.clickhouse.com/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==).

View File

@ -125,4 +125,4 @@ SELECT count() FROM cell_towers WHERE pointInPolygon((lon, lat), (SELECT * FROM
1 rows in set. Elapsed: 0.067 sec. Processed 43.28 million rows, 692.42 MB (645.83 million rows/s., 10.33 GB/s.)
```
Вы можете протестировать другие запросы с помощью интерактивного ресурса [Playground](https://gh-api.clickhouse.com/play?user=play). Например, [вот так](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIG1jYywgY291bnQoKSBGUk9NIGNlbGxfdG93ZXJzIEdST1VQIEJZIG1jYyBPUkRFUiBCWSBjb3VudCgpIERFU0M=). Однако, обратите внимание, что здесь нельзя создавать временные таблицы.
Вы можете протестировать другие запросы с помощью интерактивного ресурса [Playground](https://play.clickhouse.com/play?user=play). Например, [вот так](https://play.clickhouse.com/play?user=play#U0VMRUNUIG1jYywgY291bnQoKSBGUk9NIGNlbGxfdG93ZXJzIEdST1VQIEJZIG1jYyBPUkRFUiBCWSBjb3VudCgpIERFU0M=). Однако, обратите внимание, что здесь нельзя создавать временные таблицы.

View File

@ -337,6 +337,6 @@ WHERE title = 'Chocolate-Strawberry-Orange Wedding Cake';
### Online Playground
Этот набор данных доступен в [Online Playground](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhcnJheUpvaW4oTkVSKSBBUyBrLAogICAgY291bnQoKSBBUyBjCkZST00gcmVjaXBlcwpHUk9VUCBCWSBrCk9SREVSIEJZIGMgREVTQwpMSU1JVCA1MA==).
Этот набор данных доступен в [Online Playground](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhcnJheUpvaW4oTkVSKSBBUyBrLAogICAgY291bnQoKSBBUyBjCkZST00gcmVjaXBlcwpHUk9VUCBCWSBrCk9SREVSIEJZIGMgREVTQwpMSU1JVCA1MA==).
[Оригинальная статья](https://clickhouse.com/docs/ru/getting-started/example-datasets/recipes/) <!--hide-->

View File

@ -21,7 +21,7 @@ sidebar_label: "Хранение словарей в памяти"
- При обращении к словарю, который не удалось загрузить.
- При ошибке запроса к `cached`-словарю.
Список внешних словарей и их статус можно посмотреть в таблице `system.dictionaries`.
Список внешних словарей и их статус можно посмотреть в таблице [system.dictionaries](../../../operations/system-tables/dictionaries.md).
Общий вид конфигурации:
@ -48,6 +48,36 @@ LAYOUT(LAYOUT_TYPE(param value)) -- layout settings
...
```
Ключ словарей не имеющих слово `complex-key*` в названии имеет тип [UInt64](../../../sql-reference/data-types/int-uint.md), `complex-key*` словари позволяют произвольный тип ключа (составной, и из разных типов).
[UInt64](../../../sql-reference/data-types/int-uint.md) ключи в XML словарях задаются тегом `<id>`.
Пример конфигурации (поле key_column имеет тип UInt64):
```xml
...
<structure>
<id>
<name>key_column</name>
</id>
...
```
Cоставные `complex` ключи в XML словарях задаются тегом `<key>`.
Пример конфигурации составного ключа (ключ состоит из одного элемента с типом [String](../../../sql-reference/data-types/string.md)):
```xml
...
<structure>
<key>
<attribute>
<name>country_code</name>
<type>String</type>
</attribute>
</key>
...
```
## Способы размещения словарей в памяти {#ways-to-store-dictionaries-in-memory}
- [flat](#flat)
@ -98,6 +128,8 @@ LAYOUT(FLAT(INITIAL_ARRAY_SIZE 50000 MAX_ARRAY_SIZE 5000000))
Словарь полностью хранится в оперативной памяти в виде хэш-таблиц. Словарь может содержать произвольное количество элементов с произвольными идентификаторами. На практике количество ключей может достигать десятков миллионов элементов.
Ключ словаря имеет тип [UInt64](../../../sql-reference/data-types/int-uint.md).
Если `preallocate` имеет значение `true` (по умолчанию `false`), хеш-таблица будет предварительно определена (это ускорит загрузку словаря). Используйте этот метод только в случае, если:
- Источник поддерживает произвольное количество элементов (пока поддерживается только источником `ClickHouse`).
@ -125,6 +157,8 @@ LAYOUT(HASHED(PREALLOCATE 0))
Аналогичен `hashed`, но при этом занимает меньше места в памяти и генерирует более высокую загрузку CPU.
Ключ словаря имеет тип [UInt64](../../../sql-reference/data-types/int-uint.md).
Для этого типа размещения также можно задать `preallocate` в значении `true`. В данном случае это более важно, чем для типа `hashed`.
Пример конфигурации:
@ -181,6 +215,8 @@ LAYOUT(COMPLEX_KEY_SPARSE_HASHED())
Словарь полностью хранится в оперативной памяти. Каждый атрибут хранится в массиве. Ключевой атрибут хранится в виде хеш-таблицы, где его значение является индексом в массиве атрибутов. Словарь может содержать произвольное количество элементов с произвольными идентификаторами. На практике количество ключей может достигать десятков миллионов элементов.
Ключ словаря имеет тип [UInt64](../../../sql-reference/data-types/int-uint.md).
Поддерживаются все виды источников. При обновлении данные (из файла, из таблицы) считываются целиком.
Пример конфигурации:
@ -220,6 +256,7 @@ LAYOUT(COMPLEX_KEY_HASHED_ARRAY())
Словарь хранится в оперативной памяти в виде хэш-таблицы с упорядоченным массивом диапазонов и соответствующих им значений.
Ключ словаря имеет тип [UInt64](../../../sql-reference/data-types/int-uint.md).
Этот способ размещения работает так же, как и hashed, и позволяет дополнительно к ключу использовать диапазоны по дате/времени (произвольному числовому типу).
Пример: таблица содержит скидки для каждого рекламодателя в виде:
@ -355,6 +392,8 @@ RANGE(MIN StartDate MAX EndDate);
Словарь хранится в кэше, состоящем из фиксированного количества ячеек. Ячейки содержат часто используемые элементы.
Ключ словаря имеет тип [UInt64](../../../sql-reference/data-types/int-uint.md).
При поиске в словаре сначала просматривается кэш. На каждый блок данных, все не найденные в кэше или устаревшие ключи запрашиваются у источника с помощью `SELECT attrs... FROM db.table WHERE id IN (k1, k2, ...)`. Затем, полученные данные записываются в кэш.
Если ключи не были найдены в словаре, то для обновления кэша создается задание и добавляется в очередь обновлений. Параметры очереди обновлений можно устанавливать настройками `max_update_queue_size`, `update_queue_push_timeout_milliseconds`, `query_wait_timeout_milliseconds`, `max_threads_for_updates`.
@ -414,6 +453,8 @@ LAYOUT(CACHE(SIZE_IN_CELLS 1000000000))
Похож на `cache`, но хранит данные на SSD, а индекс в оперативной памяти. Все параметры, относящиеся к очереди обновлений, могут также быть применены к SSD-кэш словарям.
Ключ словаря имеет тип [UInt64](../../../sql-reference/data-types/int-uint.md).
``` xml
<layout>
<ssd_cache>
@ -446,7 +487,7 @@ LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576
Словарь не хранит данные локально и взаимодействует с источником непосредственно в момент запроса.
Ключ словаря имеет тип `UInt64`.
Ключ словаря имеет тип [UInt64](../../../sql-reference/data-types/int-uint.md).
Поддерживаются все виды [источников](external-dicts-dict-sources.md), кроме локальных файлов.

View File

@ -62,6 +62,8 @@ option (ENABLE_CLICKHOUSE_KEEPER "ClickHouse alternative to ZooKeeper" ${ENABLE_
option (ENABLE_CLICKHOUSE_KEEPER_CONVERTER "Util allows to convert ZooKeeper logs and snapshots into clickhouse-keeper snapshot" ${ENABLE_CLICKHOUSE_ALL})
option (ENABLE_CLICKHOUSE_SU "A tool similar to 'su'" ${ENABLE_CLICKHOUSE_ALL})
if (NOT ENABLE_NURAFT)
# RECONFIGURE_MESSAGE_LEVEL should not be used here,
# since ENABLE_NURAFT is set to OFF for FreeBSD and Darwin.
@ -237,6 +239,7 @@ add_subdirectory (install)
add_subdirectory (git-import)
add_subdirectory (bash-completion)
add_subdirectory (static-files-disk-uploader)
add_subdirectory (su)
if (ENABLE_CLICKHOUSE_KEEPER)
add_subdirectory (keeper)
@ -269,7 +272,8 @@ if (CLICKHOUSE_ONE_SHARED)
${CLICKHOUSE_ODBC_BRIDGE_SOURCES}
${CLICKHOUSE_KEEPER_SOURCES}
${CLICKHOUSE_KEEPER_CONVERTER_SOURCES}
${CLICKHOUSE_STATIC_FILES_DISK_UPLOADER_SOURCES})
${CLICKHOUSE_STATIC_FILES_DISK_UPLOADER_SOURCES}
${CLICKHOUSE_SU_SOURCES})
target_link_libraries(clickhouse-lib
${CLICKHOUSE_SERVER_LINK}
@ -285,7 +289,8 @@ if (CLICKHOUSE_ONE_SHARED)
${CLICKHOUSE_ODBC_BRIDGE_LINK}
${CLICKHOUSE_KEEPER_LINK}
${CLICKHOUSE_KEEPER_CONVERTER_LINK}
${CLICKHOUSE_STATIC_FILES_DISK_UPLOADER_LINK})
${CLICKHOUSE_STATIC_FILES_DISK_UPLOADER_LINK}
${CLICKHOUSE_SU_LINK})
target_include_directories(clickhouse-lib
${CLICKHOUSE_SERVER_INCLUDE}
@ -318,8 +323,7 @@ if (CLICKHOUSE_SPLIT_BINARY)
clickhouse-obfuscator
clickhouse-git-import
clickhouse-copier
clickhouse-static-files-disk-uploader
)
clickhouse-static-files-disk-uploader)
if (ENABLE_CLICKHOUSE_ODBC_BRIDGE)
list (APPEND CLICKHOUSE_ALL_TARGETS clickhouse-odbc-bridge)
@ -387,6 +391,9 @@ else ()
if (ENABLE_CLICKHOUSE_STATIC_FILES_DISK_UPLOADER)
clickhouse_target_link_split_lib(clickhouse static-files-disk-uploader)
endif ()
if (ENABLE_CLICKHOUSE_SU)
clickhouse_target_link_split_lib(clickhouse su)
endif ()
if (ENABLE_CLICKHOUSE_KEEPER)
clickhouse_target_link_split_lib(clickhouse keeper)
endif()
@ -453,6 +460,11 @@ else ()
install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-static-files-disk-uploader" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
list(APPEND CLICKHOUSE_BUNDLE clickhouse-static-files-disk-uploader)
endif ()
if (ENABLE_CLICKHOUSE_SU)
add_custom_target (clickhouse-su ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-su DEPENDS clickhouse)
install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-su" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
list(APPEND CLICKHOUSE_BUNDLE clickhouse-su)
endif ()
if (ENABLE_CLICKHOUSE_KEEPER)
if (NOT BUILD_STANDALONE_KEEPER AND CREATE_KEEPER_SYMLINK)

View File

@ -19,3 +19,4 @@
#cmakedefine01 ENABLE_CLICKHOUSE_KEEPER
#cmakedefine01 ENABLE_CLICKHOUSE_KEEPER_CONVERTER
#cmakedefine01 ENABLE_CLICKHOUSE_STATIC_FILES_DISK_UPLOADER
#cmakedefine01 ENABLE_CLICKHOUSE_SU

View File

@ -925,24 +925,7 @@ namespace
executable.string(), config.string(), pid_file.string());
if (!user.empty())
{
#if defined(OS_FREEBSD)
command = fmt::format("su -m '{}' -c '{}'", user, command);
#else
bool may_need_sudo = geteuid() != 0;
if (may_need_sudo)
{
struct passwd *p = getpwuid(geteuid());
// Only use sudo when we are not the given user
if (p == nullptr || std::string(p->pw_name) != user)
command = fmt::format("sudo -u '{}' {}", user, command);
}
else
{
command = fmt::format("su -s /bin/sh '{}' -c '{}'", user, command);
}
#endif
}
command = fmt::format("clickhouse su '{}' {}", user, command);
fmt::print("Will run {}\n", command);
executeScript(command, true);

View File

@ -65,6 +65,9 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv);
#if ENABLE_CLICKHOUSE_STATIC_FILES_DISK_UPLOADER
int mainEntryClickHouseStaticFilesDiskUploader(int argc, char ** argv);
#endif
#if ENABLE_CLICKHOUSE_SU
int mainEntryClickHouseSU(int argc, char ** argv);
#endif
#if ENABLE_CLICKHOUSE_INSTALL
int mainEntryClickHouseInstall(int argc, char ** argv);
int mainEntryClickHouseStart(int argc, char ** argv);
@ -81,8 +84,6 @@ int mainEntryClickHouseHashBinary(int, char **)
return 0;
}
#define ARRAY_SIZE(a) (sizeof(a)/sizeof((a)[0]))
namespace
{
@ -138,6 +139,9 @@ std::pair<const char *, MainFunc> clickhouse_applications[] =
#endif
#if ENABLE_CLICKHOUSE_STATIC_FILES_DISK_UPLOADER
{"static-files-disk-uploader", mainEntryClickHouseStaticFilesDiskUploader},
#endif
#if ENABLE_CLICKHOUSE_SU
{"su", mainEntryClickHouseSU},
#endif
{"hash-binary", mainEntryClickHouseHashBinary},
};
@ -189,7 +193,7 @@ auto instructionFailToString(InstructionFail fail)
{
switch (fail)
{
#define ret(x) return std::make_tuple(STDERR_FILENO, x, ARRAY_SIZE(x) - 1)
#define ret(x) return std::make_tuple(STDERR_FILENO, x, sizeof(x) - 1)
case InstructionFail::NONE:
ret("NONE");
case InstructionFail::SSE3:
@ -277,7 +281,7 @@ void checkRequiredInstructionsImpl(volatile InstructionFail & fail)
#define writeError(data) do \
{ \
static_assert(__builtin_constant_p(data)); \
if (!writeRetry(STDERR_FILENO, data, ARRAY_SIZE(data) - 1)) \
if (!writeRetry(STDERR_FILENO, data, sizeof(data) - 1)) \
_Exit(1); \
} while (false)
@ -334,6 +338,7 @@ struct Checker
#endif
;
/// NOTE: We will migrate to full static linking or our own dynamic loader to make this code obsolete.
void checkHarmfulEnvironmentVariables(char ** argv)
{
@ -406,17 +411,17 @@ int main(int argc_, char ** argv_)
inside_main = true;
SCOPE_EXIT({ inside_main = false; });
/// PHDR cache is required for query profiler to work reliably
/// It also speeds up exception handling, but exceptions from dynamically loaded libraries (dlopen)
/// will work only after additional call of this function.
updatePHDRCache();
checkHarmfulEnvironmentVariables(argv_);
/// Reset new handler to default (that throws std::bad_alloc)
/// It is needed because LLVM library clobbers it.
std::set_new_handler(nullptr);
/// PHDR cache is required for query profiler to work reliably
/// It also speeds up exception handling, but exceptions from dynamically loaded libraries (dlopen)
/// will work only after additional call of this function.
updatePHDRCache();
std::vector<char *> argv(argv_, argv_ + argc_);
/// Print a basic help if nothing was matched

View File

@ -1395,8 +1395,11 @@ int Server::main(const std::vector<std::string> & /*args*/)
fs::create_directories(format_schema_path);
/// Check sanity of MergeTreeSettings on server startup
global_context->getMergeTreeSettings().sanityCheck(settings);
global_context->getReplicatedMergeTreeSettings().sanityCheck(settings);
{
size_t background_pool_tasks = global_context->getMergeMutateExecutor()->getMaxTasksCount();
global_context->getMergeTreeSettings().sanityCheck(background_pool_tasks);
global_context->getReplicatedMergeTreeSettings().sanityCheck(background_pool_tasks);
}
/// Try to set up encryption. If there are errors in the config, an error will be printed and the server won't start.
CompressionCodecEncrypted::Configuration::instance().load(config(), "encryption_codecs");

View File

@ -200,6 +200,6 @@ try
}
catch (...)
{
std::cerr << DB::getCurrentExceptionMessage(false);
std::cerr << DB::getCurrentExceptionMessage(false) << '\n';
return 1;
}

View File

@ -0,0 +1,3 @@
# Source files of the `su` program.
set (CLICKHOUSE_SU_SOURCES clickhouse-su.cpp)
# Libraries the `su` program links against (PRIVATE linkage).
set (CLICKHOUSE_SU_LINK PRIVATE dbms)
# NOTE(review): clickhouse_program_add is defined elsewhere; presumably it picks up the
# CLICKHOUSE_SU_* variables set above to declare the program - confirm.
clickhouse_program_add(su)

View File

@ -0,0 +1,145 @@
#include <Common/Exception.h>
#include <IO/ReadHelpers.h>
#include <fmt/format.h>
#include <vector>
#include <sys/types.h>
#include <unistd.h>
#include <pwd.h>
#include <grp.h>
/// "su" means "set user"
/// In fact, this program can set Unix user and group.
///
/// Usage:
/// clickhouse su user[:group] args...
///
/// - will set user and, optionally, group and exec the remaining args.
/// user and group can be numeric identifiers or strings.
///
/// The motivation for this tool is very obscure and idiosyncratic. It is needed for Docker.
/// People want to run programs inside Docker with dropped privileges (less than root).
/// But the standard Linux "su" program is not suitable for usage inside Docker,
/// because it is creating pseudoterminals to avoid hijacking input from the terminal, for security,
/// but Docker is also doing something with the terminal and it is incompatible.
/// For this reason, people use alternative and less "secure" versions of "su" tools like "gosu" or "su-exec".
/// But it would be very strange to use 3rd-party software only to do two-three syscalls.
/// That's why we provide this tool.
///
/// Note: ClickHouse does not need Docker at all and works better without Docker.
/// ClickHouse has no dependencies, it is packaged and distributed in a single binary.
/// There is no reason to use Docker unless you are already running all your software in Docker.
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int SYSTEM_ERROR;
}
/// Switches the current process to the requested Unix group and/or user.
///
/// arg_uid / arg_gid may each be a numeric id or a name; an empty string leaves that id untouched.
/// A value that does not parse as a number (or parses as 0) is looked up by name
/// through getpwnam_r / getgrnam_r.
///
/// Throws BAD_ARGUMENTS when the name is not found or when a process that is not already
/// running with id 0 asks for id 0. Lookup failures and setgid / setuid failures are reported
/// through throwFromErrno with SYSTEM_ERROR.
///
/// NOTE(review): supplementary groups are not changed here (there is no setgroups / initgroups call).
/// NOTE(review): getgrnam_r / getpwnam_r return the error number; whether throwFromErrno sees it
/// depends on errno also being set - confirm.
void setUserAndGroup(std::string arg_uid, std::string arg_gid)
{
    static constexpr size_t buf_size = 16384; /// Linux man page says it is enough. Nevertheless, we will check if it's not enough and throw.
    std::unique_ptr<char[]> buf(new char[buf_size]);

    /// Set the group first, because if we set user, the privileges will be already dropped and we will not be able to set the group later.

    if (!arg_gid.empty())
    {
        gid_t gid = 0;
        if (!tryParse(gid, arg_gid) || gid == 0)
        {
            group entry{};
            group * result{};

            if (0 != getgrnam_r(arg_gid.data(), &entry, buf.get(), buf_size, &result))
                throwFromErrno(fmt::format("Cannot do 'getgrnam_r' to obtain gid from group name ({})", arg_gid), ErrorCodes::SYSTEM_ERROR);

            /// The message has to be formatted explicitly: a bare literal would print "{}" verbatim.
            if (!result)
                throw Exception(fmt::format("Group {} is not found in the system", arg_gid), ErrorCodes::BAD_ARGUMENTS);

            gid = entry.gr_gid;
        }

        if (gid == 0 && getgid() != 0)
            throw Exception("Group has id 0, but dropping privileges to gid 0 does not make sense", ErrorCodes::BAD_ARGUMENTS);

        if (0 != setgid(gid))
            throwFromErrno(fmt::format("Cannot do 'setgid' to group ({})", arg_gid), ErrorCodes::SYSTEM_ERROR);
    }

    if (!arg_uid.empty())
    {
        /// Is it numeric id or name?
        uid_t uid = 0;
        if (!tryParse(uid, arg_uid) || uid == 0)
        {
            passwd entry{};
            passwd * result{};

            if (0 != getpwnam_r(arg_uid.data(), &entry, buf.get(), buf_size, &result))
                throwFromErrno(fmt::format("Cannot do 'getpwnam_r' to obtain uid from user name ({})", arg_uid), ErrorCodes::SYSTEM_ERROR);

            /// Same as for the group: format the name into the message.
            if (!result)
                throw Exception(fmt::format("User {} is not found in the system", arg_uid), ErrorCodes::BAD_ARGUMENTS);

            uid = entry.pw_uid;
        }

        if (uid == 0 && getuid() != 0)
            throw Exception("User has id 0, but dropping privileges to uid 0 does not make sense", ErrorCodes::BAD_ARGUMENTS);

        if (0 != setuid(uid))
            throwFromErrno(fmt::format("Cannot do 'setuid' to user ({})", arg_uid), ErrorCodes::SYSTEM_ERROR);
    }
}
}
/// Entry point of `clickhouse su user[:group] command [args...]`.
/// Switches to the requested user (and group, if given) and then replaces the current
/// process with the command via execvp. Any exception is printed to stderr and turned into exit code 1.
int mainEntryClickHouseSU(int argc, char ** argv)
try
{
    using namespace DB;

    /// We need at least the program name, the user[:group] spec and a command to run.
    if (argc < 3)
    {
        std::cout << "Usage: ./clickhouse su user:group ..." << std::endl;
        exit(0);
    }

    /// Split "user[:group]" on the first colon; without a colon the group stays empty.
    const std::string_view spec = argv[1];
    const auto colon = spec.find(':');
    const bool has_group = colon != std::string_view::npos;

    std::string user_name(has_group ? spec.substr(0, colon) : spec);
    std::string group_name;
    if (has_group)
        group_name = spec.substr(colon + 1);

    setUserAndGroup(std::move(user_name), std::move(group_name));

    /// Build the argument vector for the command: argv[2..argc) followed by the terminating nullptr.
    std::vector<char *> exec_args;
    exec_args.reserve(argc - 1);
    for (int i = 2; i < argc; ++i)
        exec_args.push_back(argv[i]);
    exec_args.push_back(nullptr);

    execvp(exec_args[0], exec_args.data());

    /// Reached only if execvp returned.
    throwFromErrno("Cannot execvp", ErrorCodes::SYSTEM_ERROR);
}
catch (...)
{
    std::cerr << DB::getCurrentExceptionMessage(false) << '\n';
    return 1;
}

View File

@ -159,6 +159,7 @@ enum class AccessType
M(SYSTEM_RESTART_REPLICA, "RESTART REPLICA", TABLE, SYSTEM) \
M(SYSTEM_RESTORE_REPLICA, "RESTORE REPLICA", TABLE, SYSTEM) \
M(SYSTEM_SYNC_DATABASE_REPLICA, "SYNC DATABASE REPLICA", DATABASE, SYSTEM) \
M(SYSTEM_SYNC_TRANSACTION_LOG, "SYNC TRANSACTION LOG", GLOBAL, SYSTEM) \
M(SYSTEM_FLUSH_DISTRIBUTED, "FLUSH DISTRIBUTED", TABLE, SYSTEM_FLUSH) \
M(SYSTEM_FLUSH_LOGS, "FLUSH LOGS", GLOBAL, SYSTEM_FLUSH) \
M(SYSTEM_FLUSH, "", GROUP, SYSTEM) \

View File

@ -224,8 +224,16 @@ public:
++this->data(place).denominator;
}
void
addBatchSinglePlace(
/// Accounts for `length` default-valued rows in a single step: only the denominator
/// of the state is increased by `length`; the columns are not read.
/// NOTE(review): presumably valid because a default value contributes nothing to the numerator - confirm.
void addManyDefaults(
AggregateDataPtr __restrict place,
const IColumn ** /*columns*/,
size_t length,
Arena * /*arena*/) const override
{
this->data(place).denominator += length;
}
void addBatchSinglePlace(
size_t row_begin,
size_t row_end,
AggregateDataPtr place,

View File

@ -53,6 +53,15 @@ public:
++data(place).count;
}
/// Accounts for `length` default-valued rows in a single step by adding `length`
/// to the state's counter; the columns are not read.
void addManyDefaults(
AggregateDataPtr __restrict place,
const IColumn ** /*columns*/,
size_t length,
Arena * /*arena*/) const override
{
data(place).count += length;
}
void addBatchSinglePlace(
size_t row_begin,
size_t row_end,

View File

@ -882,6 +882,7 @@ struct AggregateFunctionMinData : Data
bool changeIfBetter(const IColumn & column, size_t row_num, Arena * arena) { return this->changeIfLess(column, row_num, arena); }
bool changeIfBetter(const Self & to, Arena * arena) { return this->changeIfLess(to, arena); }
/// Applies row 0 of `column` (where the default value is expected) once via changeIfLess; `length` is ignored.
void addManyDefaults(const IColumn & column, size_t /*length*/, Arena * arena) { this->changeIfLess(column, 0, arena); }
static const char * name() { return "min"; }
@ -909,6 +910,7 @@ struct AggregateFunctionMaxData : Data
bool changeIfBetter(const IColumn & column, size_t row_num, Arena * arena) { return this->changeIfGreater(column, row_num, arena); }
bool changeIfBetter(const Self & to, Arena * arena) { return this->changeIfGreater(to, arena); }
/// Applies row 0 of `column` (where the default value is expected) once via changeIfGreater; `length` is ignored.
void addManyDefaults(const IColumn & column, size_t /*length*/, Arena * arena) { this->changeIfGreater(column, 0, arena); }
static const char * name() { return "max"; }
@ -937,6 +939,7 @@ struct AggregateFunctionAnyData : Data
bool changeIfBetter(const IColumn & column, size_t row_num, Arena * arena) { return this->changeFirstTime(column, row_num, arena); }
bool changeIfBetter(const Self & to, Arena * arena) { return this->changeFirstTime(to, arena); }
/// Applies row 0 of `column` (where the default value is expected) once via changeFirstTime; `length` is ignored.
void addManyDefaults(const IColumn & column, size_t /*length*/, Arena * arena) { this->changeFirstTime(column, 0, arena); }
static const char * name() { return "any"; }
@ -964,6 +967,7 @@ struct AggregateFunctionAnyLastData : Data
bool changeIfBetter(const IColumn & column, size_t row_num, Arena * arena) { return this->changeEveryTime(column, row_num, arena); }
bool changeIfBetter(const Self & to, Arena * arena) { return this->changeEveryTime(to, arena); }
/// Applies row 0 of `column` (where the default value is expected) once via changeEveryTime; `length` is ignored.
void addManyDefaults(const IColumn & column, size_t /*length*/, Arena * arena) { this->changeEveryTime(column, 0, arena); }
static const char * name() { return "anyLast"; }
@ -1024,6 +1028,8 @@ struct AggregateFunctionSingleValueOrNullData : Data
return false;
}
/// Applies row 0 of `column` (where the default value is expected) once via changeIfBetter; `length` is ignored.
void addManyDefaults(const IColumn & column, size_t /*length*/, Arena * arena) { this->changeIfBetter(column, 0, arena); }
void insertResultInto(IColumn & to) const
{
if (is_null || first_value)
@ -1098,6 +1104,12 @@ struct AggregateFunctionAnyHeavyData : Data
return false;
}
/// Replays row 0 of `column` (where the default value is expected) `length` times through changeIfBetter.
/// NOTE(review): the repetition is presumably required because each occurrence affects the state - confirm.
void addManyDefaults(const IColumn & column, size_t length, Arena * arena)
{
for (size_t i = 0; i < length; ++i)
changeIfBetter(column, 0, arena);
}
void write(WriteBuffer & buf, const ISerialization & serialization) const
{
Data::write(buf, serialization);
@ -1158,6 +1170,15 @@ public:
this->data(place).changeIfBetter(*columns[0], row_num, arena);
}
/// Forwards the request to the state's own addManyDefaults, passing the first argument column.
void addManyDefaults(
AggregateDataPtr __restrict place,
const IColumn ** columns,
size_t length,
Arena * arena) const override
{
this->data(place).addManyDefaults(*columns[0], length, arena);
}
void addBatchSinglePlace(
size_t row_begin,
size_t row_end,

View File

@ -489,6 +489,33 @@ public:
}
}
/// Does nothing: the state is left untouched for default values.
/// NOTE(review): presumably because adding default values does not change this aggregate's result - confirm.
void addManyDefaults(
AggregateDataPtr __restrict /*place*/,
const IColumn ** /*columns*/,
size_t /*length*/,
Arena * /*arena*/) const override
{
}
/// Adds the explicitly stored values of a sparse column for rows in [row_begin, row_end).
/// Only the entries listed in the offsets array are visited; rows without an entry are skipped.
void addBatchSparse(
size_t row_begin,
size_t row_end,
AggregateDataPtr * places,
size_t place_offset,
const IColumn ** columns,
Arena * arena) const override
{
const auto & column_sparse = assert_cast<const ColumnSparse &>(*columns[0]);
const auto * values = &column_sparse.getValuesColumn();
const auto & offsets = column_sparse.getOffsetsData();
/// Find the sub-range of offsets that falls into [row_begin, row_end).
/// NOTE(review): lower_bound assumes `offsets` is sorted in ascending order - confirm.
size_t from = std::lower_bound(offsets.begin(), offsets.end(), row_begin) - offsets.begin();
size_t to = std::lower_bound(offsets.begin(), offsets.end(), row_end) - offsets.begin();
/// offsets[i] is the row number (used to pick the place), i + 1 is the index in the values column.
/// NOTE(review): the +1 shift presumably skips a default value stored at index 0 of the values column - confirm.
for (size_t i = from; i < to; ++i)
add(places[offsets[i]] + place_offset, &values, i + 1, arena);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));

View File

@ -237,6 +237,15 @@ public:
detail::OneAdder<T, Data>::add(this->data(place), *columns[0], row_num);
}
/// Adds a run of default values with a single insertion of row 0 of columns[0];
/// the run length is not used.
void addManyDefaults(
    AggregateDataPtr __restrict place,
    const IColumn ** columns,
    size_t /*length*/,
    Arena * /*arena*/) const override
{
    const IColumn & default_source = *columns[0];
    detail::OneAdder<T, Data>::add(this->data(place), default_source, 0);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).set.merge(this->data(rhs).set);

View File

@ -123,6 +123,10 @@ public:
*/
virtual void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const = 0;
/// Adds several default values of arguments into aggregation data on which place points to.
/// Default values must be at the 0-th positions in columns.
virtual void addManyDefaults(AggregateDataPtr __restrict place, const IColumn ** columns, size_t length, Arena * arena) const = 0;
/// Merges state (on which place points to) with other state of current aggregation function.
virtual void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const = 0;
@ -377,6 +381,16 @@ public:
AddFunc getAddressOfAddFunction() const override { return &addFree; }
/// Generic implementation: calls Derived::add() for row 0 of `columns` once per default value.
void addManyDefaults(
    AggregateDataPtr __restrict place,
    const IColumn ** columns,
    size_t length,
    Arena * arena) const override
{
    const auto * derived = static_cast<const Derived *>(this);
    for (size_t added = 0; added != length; ++added)
        derived->add(place, columns, 0, arena);
}
void addBatch( /// NOLINT
size_t row_begin,
size_t row_end,
@ -413,13 +427,9 @@ public:
{
const auto & column_sparse = assert_cast<const ColumnSparse &>(*columns[0]);
const auto * values = &column_sparse.getValuesColumn();
auto offset_it = column_sparse.begin();
auto offset_it = column_sparse.getIterator(row_begin);
/// FIXME: make it more optimal
for (size_t i = 0; i < row_begin; ++i, ++offset_it)
;
for (size_t i = 0; i < row_end; ++i, ++offset_it)
for (size_t i = row_begin; i < row_end; ++i, ++offset_it)
static_cast<const Derived *>(this)->add(places[offset_it.getCurrentRow()] + place_offset,
&values, offset_it.getValueIndex(), arena);
}
@ -468,17 +478,16 @@ public:
const IColumn ** columns,
Arena * arena) const override
{
/// TODO: add values and defaults separately if order of adding isn't important.
const auto & column_sparse = assert_cast<const ColumnSparse &>(*columns[0]);
const auto * values = &column_sparse.getValuesColumn();
auto offset_it = column_sparse.begin();
const auto & offsets = column_sparse.getOffsetsData();
/// FIXME: make it more optimal
for (size_t i = 0; i < row_begin; ++i, ++offset_it)
;
auto from = std::lower_bound(offsets.begin(), offsets.end(), row_begin) - offsets.begin() + 1;
auto to = std::lower_bound(offsets.begin(), offsets.end(), row_end) - offsets.begin() + 1;
for (size_t i = 0; i < row_end; ++i, ++offset_it)
static_cast<const Derived *>(this)->add(place, &values, offset_it.getValueIndex(), arena);
size_t num_defaults = (row_end - row_begin) - (to - from);
static_cast<const Derived *>(this)->addBatchSinglePlace(from, to, place, &values, arena, -1);
static_cast<const Derived *>(this)->addManyDefaults(place, &values, num_defaults, arena);
}
void addBatchSinglePlaceNotNull( /// NOLINT

View File

@ -87,6 +87,7 @@ add_headers_and_sources(clickhouse_common_io IO/S3)
list (REMOVE_ITEM clickhouse_common_io_sources Common/malloc.cpp Common/new_delete.cpp)
add_headers_and_sources(dbms Disks/IO)
add_headers_and_sources(dbms Disks/ObjectStorages)
if (TARGET ch_contrib::sqlite)
add_headers_and_sources(dbms Databases/SQLite)
endif()
@ -113,16 +114,16 @@ endif()
if (TARGET ch_contrib::aws_s3)
add_headers_and_sources(dbms Common/S3)
add_headers_and_sources(dbms Disks/S3)
add_headers_and_sources(dbms Disks/ObjectStorages/S3)
endif()
if (TARGET ch_contrib::azure_sdk)
add_headers_and_sources(dbms Disks/AzureBlobStorage)
add_headers_and_sources(dbms Disks/ObjectStorages/AzureBlobStorage)
endif()
if (TARGET ch_contrib::hdfs)
add_headers_and_sources(dbms Storages/HDFS)
add_headers_and_sources(dbms Disks/HDFS)
add_headers_and_sources(dbms Disks/ObjectStorages/HDFS)
endif()
add_headers_and_sources(dbms Storages/Cache)

View File

@ -285,11 +285,11 @@ void ClientBase::setupSignalHandler()
sigemptyset(&new_act.sa_mask);
#else
if (sigemptyset(&new_act.sa_mask))
throw Exception(ErrorCodes::CANNOT_SET_SIGNAL_HANDLER, "Cannot set signal handler.");
throwFromErrno("Cannot set signal handler.", ErrorCodes::CANNOT_SET_SIGNAL_HANDLER);
#endif
if (sigaction(SIGINT, &new_act, nullptr))
throw Exception(ErrorCodes::CANNOT_SET_SIGNAL_HANDLER, "Cannot set signal handler.");
throwFromErrno("Cannot set signal handler.", ErrorCodes::CANNOT_SET_SIGNAL_HANDLER);
}
@ -492,7 +492,8 @@ try
String pager = config().getString("pager", "");
if (!pager.empty())
{
signal(SIGPIPE, SIG_IGN);
if (SIG_ERR == signal(SIGPIPE, SIG_IGN))
throwFromErrno("Cannot set signal handler.", ErrorCodes::CANNOT_SET_SIGNAL_HANDLER);
ShellCommand::Config config(pager);
config.pipe_stdin_only = true;

View File

@ -772,6 +772,14 @@ size_t ColumnSparse::getValueIndex(size_t n) const
return it - offsets_data.begin() + 1;
}
/// Builds an iterator for row `n`. The offset index handed to the iterator is the position
/// of the first stored offset that is not less than `n`, found by binary search.
ColumnSparse::Iterator ColumnSparse::getIterator(size_t n) const
{
    const auto & offsets_data = getOffsetsData();
    const auto * offsets_begin = offsets_data.begin();
    const size_t current_offset = std::lower_bound(offsets_begin, offsets_data.end(), n) - offsets_begin;
    return Iterator(offsets_data, _size, current_offset, n);
}
ColumnPtr recursiveRemoveSparse(const ColumnPtr & column)
{
if (!column)

View File

@ -215,6 +215,7 @@ public:
/// Iterator constructed with offset index 0 and row 0.
Iterator begin() const { return Iterator(getOffsetsData(), _size, 0, 0); }
/// Iterator constructed with offset index equal to the number of offsets and row equal to _size.
Iterator end() const { return Iterator(getOffsetsData(), _size, getOffsetsData().size(), _size); }
/// Returns an iterator for row n.
Iterator getIterator(size_t n) const;
private:
using Inserter = std::function<void(IColumn &)>;

View File

@ -628,6 +628,7 @@
M(657, UNSUPPORTED_MEILISEARCH_TYPE) \
M(658, MEILISEARCH_MISSING_SOME_COLUMNS) \
M(659, UNKNOWN_STATUS_OF_TRANSACTION) \
M(660, HDFS_ERROR) \
\
M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \

View File

@ -72,6 +72,8 @@ void IFileCache::assertInitialized() const
/// Constructs the cache from its base path and settings.
/// The stash size limit is taken from the max_elements setting, and the hits threshold
/// from the enable_cache_hits_threshold setting.
LRUFileCache::LRUFileCache(const String & cache_base_path_, const FileCacheSettings & cache_settings_)
: IFileCache(cache_base_path_, cache_settings_)
, max_stash_element_size(cache_settings_.max_elements)
, enable_cache_hits_threshold(cache_settings_.enable_cache_hits_threshold)
, log(&Poco::Logger::get("LRUFileCache"))
{
}
@ -404,9 +406,42 @@ LRUFileCache::FileSegmentCell * LRUFileCache::addCell(
"Cache already exists for key: `{}`, offset: {}, size: {}.\nCurrent cache structure: {}",
keyToStr(key), offset, size, dumpStructureUnlocked(key, cache_lock));
auto file_segment = std::make_shared<FileSegment>(offset, size, key, this, state);
FileSegmentCell cell(std::move(file_segment), this, cache_lock);
/// Creates the file segment for the new cell and decides its initial state.
/// When the requested state is EMPTY and enable_cache_hits_threshold is non-zero, the (key, offset) pair
/// is tracked in stash_queue / records, and the segment is created as SKIP_CACHE
/// until its hit counter reaches the threshold. Otherwise the requested state is used as is.
auto skip_or_download = [&]() -> FileSegmentPtr
{
if (state == FileSegment::State::EMPTY && enable_cache_hits_threshold)
{
auto record = records.find({key, offset});
if (record == records.end())
{
/// No record for this (key, offset) yet: start tracking it with zero hits.
auto queue_iter = stash_queue.add(key, offset, 0, cache_lock);
records.insert({{key, offset}, queue_iter});
/// Keep the stash bounded: drop the entry at the beginning of the stash queue together with its record.
if (stash_queue.getElementsNum(cache_lock) > max_stash_element_size)
{
auto remove_queue_iter = stash_queue.begin();
records.erase({remove_queue_iter->key, remove_queue_iter->offset});
stash_queue.remove(remove_queue_iter, cache_lock);
}
/// For segments that do not reach the download threshold, we do not download them, but directly read them
return std::make_shared<FileSegment>(offset, size, key, this, FileSegment::State::SKIP_CACHE);
}
else
{
/// Already tracked: count the hit and move the entry to the end of the stash queue.
auto queue_iter = record->second;
queue_iter->hits++;
stash_queue.moveToEnd(queue_iter, cache_lock);
/// Note: this overwrites the captured `state` variable of the enclosing function.
state = queue_iter->hits >= enable_cache_hits_threshold ? FileSegment::State::EMPTY : FileSegment::State::SKIP_CACHE;
return std::make_shared<FileSegment>(offset, size, key, this, state);
}
}
else
return std::make_shared<FileSegment>(offset, size, key, this, state);
};
FileSegmentCell cell(skip_or_download(), this, cache_lock);
auto & offsets = files[key];
if (offsets.empty())
@ -471,7 +506,7 @@ bool LRUFileCache::tryReserve(
std::vector<FileSegmentCell *> to_evict;
std::vector<FileSegmentCell *> trash;
for (const auto & [entry_key, entry_offset, entry_size] : queue)
for (const auto & [entry_key, entry_offset, entry_size, _] : queue)
{
if (!is_overflow())
break;
@ -619,7 +654,7 @@ void LRUFileCache::remove()
std::vector<FileSegment *> to_remove;
for (auto it = queue.begin(); it != queue.end();)
{
const auto & [key, offset, size] = *it++;
const auto & [key, offset, size, _] = *it++;
auto * cell = getCell(key, offset, cache_lock);
if (!cell)
throw Exception(
@ -637,6 +672,10 @@ void LRUFileCache::remove()
}
}
}
/// Remove all access information.
records.clear();
stash_queue.removeAll(cache_lock);
}
void LRUFileCache::remove(
@ -882,6 +921,7 @@ LRUFileCache::FileSegmentCell::FileSegmentCell(
queue_iterator = cache->queue.add(file_segment->key(), file_segment->offset(), file_segment->range().size(), cache_lock);
break;
}
case FileSegment::State::SKIP_CACHE:
case FileSegment::State::EMPTY:
case FileSegment::State::DOWNLOADING:
{
@ -898,7 +938,7 @@ LRUFileCache::LRUQueue::Iterator LRUFileCache::LRUQueue::add(
const IFileCache::Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & /* cache_lock */)
{
#ifndef NDEBUG
for (const auto & [entry_key, entry_offset, _] : queue)
for (const auto & [entry_key, entry_offset, entry_size, entry_hits] : queue)
{
if (entry_key == key && entry_offset == offset)
throw Exception(
@ -918,6 +958,12 @@ void LRUFileCache::LRUQueue::remove(Iterator queue_it, std::lock_guard<std::mute
queue.erase(queue_it);
}
/// Drops every entry of the queue and resets the accumulated size counter to zero.
void LRUFileCache::LRUQueue::removeAll(std::lock_guard<std::mutex> & /* cache_lock */)
{
    cache_size = 0;
    queue.clear();
}
void LRUFileCache::LRUQueue::moveToEnd(Iterator queue_it, std::lock_guard<std::mutex> & /* cache_lock */)
{
queue.splice(queue.end(), queue, queue_it);
@ -934,7 +980,7 @@ bool LRUFileCache::LRUQueue::contains(
{
/// This method is used for assertions in debug mode.
/// So we do not care about complexity here.
for (const auto & [entry_key, entry_offset, size] : queue)
for (const auto & [entry_key, entry_offset, size, _] : queue)
{
if (key == entry_key && offset == entry_offset)
return true;
@ -947,7 +993,7 @@ void LRUFileCache::LRUQueue::assertCorrectness(LRUFileCache * cache, std::lock_g
[[maybe_unused]] size_t total_size = 0;
for (auto it = queue.begin(); it != queue.end();)
{
auto & [key, offset, size] = *it++;
auto & [key, offset, size, _] = *it++;
auto * cell = cache->getCell(key, offset, cache_lock);
if (!cell)
@ -969,7 +1015,7 @@ void LRUFileCache::LRUQueue::assertCorrectness(LRUFileCache * cache, std::lock_g
String LRUFileCache::LRUQueue::toString(std::lock_guard<std::mutex> & /* cache_lock */) const
{
String result;
for (const auto & [key, offset, size] : queue)
for (const auto & [key, offset, size, _] : queue)
{
if (!result.empty())
result += ", ";
@ -1007,7 +1053,8 @@ void LRUFileCache::assertCacheCellsCorrectness(
if (file_segment->reserved_size != 0)
{
assert(cell.queue_iterator);
assert(queue.contains(file_segment->key(), file_segment->offset(), cache_lock));
/// FIXME: this is too slow, need to make it O(1)
/// assert(queue.contains(file_segment->key(), file_segment->offset(), cache_lock));
}
}
}

View File

@ -7,6 +7,7 @@
#include <mutex>
#include <unordered_map>
#include <unordered_set>
#include <boost/functional/hash.hpp>
#include <boost/noncopyable.hpp>
#include <map>
@ -165,6 +166,7 @@ private:
Key key;
size_t offset;
size_t size;
size_t hits = 0;
FileKeyAndOffset(const Key & key_, size_t offset_, size_t size_) : key(key_), offset(offset_), size(size_) {}
};
@ -194,6 +196,8 @@ private:
Iterator end() { return queue.end(); }
void removeAll(std::lock_guard<std::mutex> & cache_lock);
private:
std::list<FileKeyAndOffset> queue;
size_t cache_size = 0;
@ -223,8 +227,26 @@ private:
using FileSegmentsByOffset = std::map<size_t, FileSegmentCell>;
using CachedFiles = std::unordered_map<Key, FileSegmentsByOffset>;
using AccessKeyAndOffset = std::pair<Key, size_t>;
/// Hash functor for (key, offset) pairs: XOR of the hash of the key and the hash of the offset.
struct KeyAndOffsetHash
{
    std::size_t operator()(const AccessKeyAndOffset & key) const
    {
        const std::size_t key_hash = std::hash<UInt128>()(key.first);
        const std::size_t offset_hash = std::hash<UInt64>()(key.second);
        return key_hash ^ offset_hash;
    }
};
using AccessRecord = std::unordered_map<AccessKeyAndOffset, LRUQueue::Iterator, KeyAndOffsetHash>;
CachedFiles files;
LRUQueue queue;
LRUQueue stash_queue;
AccessRecord records;
size_t max_stash_element_size;
size_t enable_cache_hits_threshold;
Poco::Logger * log;
FileSegments getImpl(
@ -279,7 +301,7 @@ private:
size_t getFileSegmentsNumUnlocked(std::lock_guard<std::mutex> & cache_lock) const;
void assertCacheCellsCorrectness(const FileSegmentsByOffset & cells_by_offset, std::lock_guard<std::mutex> & cache_lock);
static void assertCacheCellsCorrectness(const FileSegmentsByOffset & cells_by_offset, std::lock_guard<std::mutex> & cache_lock);
public:
String dumpStructure(const Key & key_) override;

View File

@ -11,6 +11,7 @@ void FileCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration &
max_elements = config.getUInt64(config_prefix + ".data_cache_max_elements", REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_ELEMENTS);
max_file_segment_size = config.getUInt64(config_prefix + ".max_file_segment_size", REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE);
cache_on_write_operations = config.getUInt64(config_prefix + ".cache_on_write_operations", false);
enable_cache_hits_threshold = config.getUInt64(config_prefix + ".enable_cache_hits_threshold", REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD);
}
}

View File

@ -14,6 +14,8 @@ struct FileCacheSettings
size_t max_file_segment_size = REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE;
bool cache_on_write_operations = false;
size_t enable_cache_hits_threshold = REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD;
void loadFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix);
};

View File

@ -7,6 +7,7 @@ namespace DB
static constexpr int REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_CACHE_SIZE = 1024 * 1024 * 1024;
static constexpr int REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE = 100 * 1024 * 1024;
static constexpr int REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_ELEMENTS = 1024 * 1024;
static constexpr int REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD = 0;
class IFileCache;
using FileCachePtr = std::shared_ptr<IFileCache>;

View File

@ -59,6 +59,10 @@ FileSegment::FileSegment(
downloader_id = getCallerId();
break;
}
case (State::SKIP_CACHE):
{
break;
}
default:
{
throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Can create cell with either EMPTY, DOWNLOADED, DOWNLOADING state");
@ -525,6 +529,14 @@ void FileSegment::complete(std::lock_guard<std::mutex> & cache_lock)
void FileSegment::completeUnlocked(std::lock_guard<std::mutex> & cache_lock, std::lock_guard<std::mutex> & segment_lock)
{
bool is_last_holder = cache->isLastFileSegmentHolder(key(), offset(), cache_lock, segment_lock);
if (is_last_holder && download_state == State::SKIP_CACHE)
{
cache->remove(key(), offset(), cache_lock, segment_lock);
return;
}
if (download_state == State::SKIP_CACHE || is_detached)
return;
@ -542,8 +554,7 @@ void FileSegment::completeUnlocked(std::lock_guard<std::mutex> & cache_lock, std
/// Segment state can be changed from DOWNLOADING or EMPTY only if the caller is the
/// downloader or the only owner of the segment.
bool can_update_segment_state = isDownloaderImpl(segment_lock)
|| cache->isLastFileSegmentHolder(key(), offset(), cache_lock, segment_lock);
bool can_update_segment_state = isDownloaderImpl(segment_lock) || is_last_holder;
if (can_update_segment_state)
download_state = State::PARTIALLY_DOWNLOADED;

View File

@ -144,6 +144,13 @@
M(MergeTreeDataWriterBlocks, "Number of blocks INSERTed to MergeTree tables. Each block forms a data part of level zero.") \
M(MergeTreeDataWriterBlocksAlreadySorted, "Number of blocks INSERTed to MergeTree tables that appeared to be already sorted.") \
\
M(InsertedWideParts, "Number of parts inserted in Wide format.") \
M(InsertedCompactParts, "Number of parts inserted in Compact format.") \
M(InsertedInMemoryParts, "Number of parts inserted in InMemory format.") \
M(MergedIntoWideParts, "Number of parts merged into Wide format.") \
M(MergedIntoCompactParts, "Number of parts merged into Compact format.") \
M(MergedIntoInMemoryParts, "Number of parts merged into InMemory format.") \
\
M(MergeTreeDataProjectionWriterRows, "Number of rows INSERTed to MergeTree tables projection.") \
M(MergeTreeDataProjectionWriterUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) INSERTed to MergeTree tables projection.") \
M(MergeTreeDataProjectionWriterCompressedBytes, "Bytes written to filesystem for data INSERTed to MergeTree tables projection.") \

View File

@ -1,3 +1,5 @@
// NOLINTBEGIN(readability-inconsistent-declaration-parameter-name)
#include <csignal>
#include <sys/time.h>
#if defined(OS_LINUX)
@ -317,3 +319,5 @@ FOR_EACH_WRAPPED_FUNCTION(MAKE_WRAPPER)
# undef MAKE_WRAPPER
#endif
}
// NOLINTEND(readability-inconsistent-declaration-parameter-name)

View File

@ -1,10 +1,8 @@
#include "filesystemHelpers.h"
#include <sys/stat.h>
#if defined(__linux__)
# include <cstdio>
# include <mntent.h>
# include <sys/stat.h>
# include <sys/sysmacros.h>
#endif
#include <cerrno>
@ -13,6 +11,7 @@
#include <filesystem>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <utime.h>
#include <IO/ReadBufferFromFile.h>

View File

@ -15,7 +15,6 @@
#include <IO/WriteHelpers.h>
#include <boost/algorithm/string.hpp>
#include <libnuraft/cluster_config.hxx>
#include <libnuraft/log_val_type.hxx>
#include <libnuraft/raft_server.hxx>
#include <Poco/Util/AbstractConfiguration.h>
#include <Poco/Util/Application.h>
@ -316,22 +315,6 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo
state_manager->loadLogStore(state_machine->last_commit_index() + 1, coordination_settings->reserved_log_items);
auto log_store = state_manager->load_log_store();
auto next_log_idx = log_store->next_slot();
if (next_log_idx > 0 && next_log_idx > state_machine->last_commit_index())
{
auto log_entries = log_store->log_entries(state_machine->last_commit_index() + 1, next_log_idx);
auto idx = state_machine->last_commit_index() + 1;
for (const auto & entry : *log_entries)
{
if (entry && entry->get_val_type() == nuraft::log_val_type::app_log)
state_machine->preprocess(idx, entry->get_buf());
++idx;
}
}
loadLatestConfig();
last_local_config = state_manager->parseServersConfiguration(config, true).cluster_config;

View File

@ -44,6 +44,7 @@ namespace
else /// backward compatibility
request_for_session.time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
return request_for_session;
}
}
@ -113,21 +114,6 @@ void KeeperStateMachine::init()
storage = std::make_unique<KeeperStorage>(coordination_settings->dead_session_check_period_ms.totalMilliseconds(), superdigest);
}
/// Pre-commit hook: preprocesses the log entry and always returns nullptr.
nuraft::ptr<nuraft::buffer> KeeperStateMachine::pre_commit(uint64_t log_idx, nuraft::buffer & data)
{
preprocess(log_idx, data);
return nullptr;
}
/// Parses the request from `data` and passes it to storage preprocessing with `log_idx` as the last argument.
/// SessionID requests are skipped. The storage call is made under storage_and_responses_lock.
void KeeperStateMachine::preprocess(const uint64_t log_idx, nuraft::buffer & data)
{
auto request_for_session = parseRequest(data);
if (request_for_session.request->getOpNum() == Coordination::OpNum::SessionID)
return;
std::lock_guard lock(storage_and_responses_lock);
storage->preprocessRequest(request_for_session.request, request_for_session.session_id, request_for_session.time, log_idx);
}
nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, nuraft::buffer & data)
{
auto request_for_session = parseRequest(data);
@ -196,12 +182,6 @@ void KeeperStateMachine::commit_config(const uint64_t /* log_idx */, nuraft::ptr
cluster_config = ClusterConfig::deserialize(*tmp);
}
/// Asks the storage to roll back the request identified by `log_idx`; the entry payload is not used.
/// The storage call is made under storage_and_responses_lock.
void KeeperStateMachine::rollback(uint64_t log_idx, nuraft::buffer & /*data*/)
{
std::lock_guard lock(storage_and_responses_lock);
storage->rollbackRequest(log_idx);
}
nuraft::ptr<nuraft::snapshot> KeeperStateMachine::last_snapshot()
{
/// Just return the latest snapshot.
@ -363,7 +343,7 @@ void KeeperStateMachine::processReadRequest(const KeeperStorage::RequestForSessi
{
/// Pure local request, just process it with storage
std::lock_guard lock(storage_and_responses_lock);
auto responses = storage->processRequest(request_for_session.request, request_for_session.session_id, request_for_session.time, std::nullopt, true /*check_acl*/, true /*is_local*/);
auto responses = storage->processRequest(request_for_session.request, request_for_session.session_id, request_for_session.time, std::nullopt);
for (const auto & response : responses)
if (!responses_queue.push(response))
throw Exception(ErrorCodes::SYSTEM_ERROR, "Could not push response with session id {} into responses queue", response.session_id);

View File

@ -27,16 +27,16 @@ public:
/// Read state from the latest snapshot
void init();
void preprocess(uint64_t log_idx, nuraft::buffer & data);
nuraft::ptr<nuraft::buffer> pre_commit(uint64_t log_idx, nuraft::buffer & data) override;
/// Currently not supported
nuraft::ptr<nuraft::buffer> pre_commit(const uint64_t /*log_idx*/, nuraft::buffer & /*data*/) override { return nullptr; }
nuraft::ptr<nuraft::buffer> commit(const uint64_t log_idx, nuraft::buffer & data) override; /// NOLINT
/// Save new cluster config to our snapshot (copy of the config stored in StateManager)
void commit_config(const uint64_t log_idx, nuraft::ptr<nuraft::cluster_config> & new_conf) override; /// NOLINT
void rollback(uint64_t log_idx, nuraft::buffer & data) override;
/// Currently not supported
void rollback(const uint64_t /*log_idx*/, nuraft::buffer & /*data*/) override {}
uint64_t last_commit_index() override { return last_committed_idx; }

File diff suppressed because it is too large Load Diff

View File

@ -1,14 +1,14 @@
#pragma once
#include <unordered_map>
#include <vector>
#include <Coordination/ACLMap.h>
#include <Common/ZooKeeper/IKeeper.h>
#include <Common/ConcurrentBoundedQueue.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h>
#include <Coordination/SessionExpiryQueue.h>
#include <Coordination/ACLMap.h>
#include <Coordination/SnapshotableHashTable.h>
#include <IO/WriteBufferFromString.h>
#include <Common/ConcurrentBoundedQueue.h>
#include <Common/ZooKeeper/IKeeper.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h>
#include <unordered_map>
#include <vector>
#include <absl/container/flat_hash_set.h>
@ -29,6 +29,7 @@ struct KeeperStorageSnapshot;
class KeeperStorage
{
public:
struct Node
{
uint64_t acl_id = 0; /// 0 -- no ACL by default
@ -40,18 +41,26 @@ public:
Node() : size_bytes(sizeof(Node)) { }
/// Object memory size
uint64_t sizeInBytes() const { return size_bytes; }
uint64_t sizeInBytes() const
{
return size_bytes;
}
void setData(String new_data);
const auto & getData() const noexcept { return data; }
const auto & getData() const noexcept
{
return data;
}
void addChild(StringRef child_path);
void removeChild(StringRef child_path);
const auto & getChildren() const noexcept { return children; }
const auto & getChildren() const noexcept
{
return children;
}
private:
String data;
ChildrenSet children{};
@ -76,7 +85,10 @@ public:
std::string scheme;
std::string id;
bool operator==(const AuthID & other) const { return scheme == other.scheme && id == other.id; }
bool operator==(const AuthID & other) const
{
return scheme == other.scheme && id == other.id;
}
};
using RequestsForSessions = std::vector<RequestForSession>;
@ -84,7 +96,7 @@ public:
using Container = SnapshotableHashTable<Node>;
using Ephemerals = std::unordered_map<int64_t, std::unordered_set<std::string>>;
using SessionAndWatcher = std::unordered_map<int64_t, std::unordered_set<std::string>>;
using SessionIDs = std::vector<int64_t>;
using SessionIDs = std::unordered_set<int64_t>;
/// Just vector of SHA1 from user:password
using AuthIDs = std::vector<AuthID>;
@ -100,146 +112,6 @@ public:
/// container.
Container container;
// Applying ZooKeeper request to storage consists of two steps:
// - preprocessing which, instead of applying the changes directly to storage,
// generates deltas with those changes, denoted with the request ZXID
// - processing which applies deltas with the correct ZXID to the storage
//
// Delta objects allow us two things:
// - fetch the latest, uncommitted state of an object by getting the committed
// state of that same object from the storage and applying the deltas
// in the same order as they are defined
// - quickly commit the changes to the storage
struct CreateNodeDelta
{
Coordination::Stat stat;
bool is_ephemeral;
bool is_sequental;
Coordination::ACLs acls;
String data;
};
struct RemoveNodeDelta
{
int32_t version{-1};
};
struct UpdateNodeDelta
{
std::function<void(Node &)> update_fn;
int32_t version{-1};
};
struct SetACLDelta
{
Coordination::ACLs acls;
int32_t version{-1};
};
struct ErrorDelta
{
Coordination::Error error;
};
struct FailedMultiDelta
{
std::vector<Coordination::Error> error_codes;
};
// Denotes end of a subrequest in multi request
struct SubDeltaEnd
{
};
struct AddAuthDelta
{
int64_t session_id;
AuthID auth_id;
};
using Operation
= std::variant<CreateNodeDelta, RemoveNodeDelta, UpdateNodeDelta, SetACLDelta, AddAuthDelta, ErrorDelta, SubDeltaEnd, FailedMultiDelta>;
/// A single change: the operation, the path it applies to and the zxid it is associated with.
struct Delta
{
Delta(String path_, int64_t zxid_, Operation operation_) : path(std::move(path_)), zxid(zxid_), operation(std::move(operation_)) { }
/// Error delta with an empty path.
Delta(int64_t zxid_, Coordination::Error error) : Delta("", zxid_, ErrorDelta{error}) { }
/// Delta with an empty path for the given operation.
Delta(int64_t zxid_, Operation subdelta) : Delta("", zxid_, subdelta) { }
String path;
int64_t zxid;
Operation operation;
};
/// Holds the queue of deltas together with a reference to the storage they belong to.
struct UncommittedState
{
explicit UncommittedState(KeeperStorage & storage_) : storage(storage_) { }
/// Visits the operation of every delta whose path equals `path`, in queue order.
/// An empty `path` matches every delta.
template <typename Visitor>
void applyDeltas(StringRef path, const Visitor & visitor) const
{
for (const auto & delta : deltas)
{
if (path.empty() || delta.path == path)
std::visit(visitor, delta.operation);
}
}
/// Returns true if `predicate` accepts any auth id stored for the session in the storage.
/// Unless `is_local` is set, auth ids from AddAuthDelta entries of the same session are checked as well.
bool hasACL(int64_t session_id, bool is_local, std::function<bool(const AuthID &)> predicate)
{
/// NOTE(review): if session_and_auth is a map, operator[] inserts an empty entry for an unknown session_id - confirm this is intended.
for (const auto & session_auth : storage.session_and_auth[session_id])
{
if (predicate(session_auth))
return true;
}
if (is_local)
return false;
for (const auto & delta : deltas)
{
if (const auto * auth_delta = std::get_if<KeeperStorage::AddAuthDelta>(&delta.operation);
auth_delta && auth_delta->session_id == session_id && predicate(auth_delta->auth_id))
return true;
}
return false;
}
std::shared_ptr<Node> getNode(StringRef path);
bool hasNode(StringRef path) const;
Coordination::ACLs getACLs(StringRef path) const;
std::deque<Delta> deltas;
KeeperStorage & storage;
};
UncommittedState uncommitted_state{*this};
Coordination::Error commit(int64_t zxid, int64_t session_id);
// Create node in the storage
// Returns false if it failed to create the node, true otherwise
// We don't care about the exact failure because we should've caught it during preprocessing
bool createNode(
const std::string & path,
String data,
const Coordination::Stat & stat,
bool is_sequental,
bool is_ephemeral,
Coordination::ACLs node_acls,
int64_t session_id);
// Remove node in the storage
// Returns false if it failed to remove the node, true otherwise
// We don't care about the exact failure because we should've caught it during preprocessing
bool removeNode(const std::string & path, int32_t version);
bool checkACL(StringRef path, int32_t permissions, int64_t session_id, bool is_local);
/// Mapping session_id -> set of ephemeral nodes paths
Ephemerals ephemerals;
/// Mapping session_id -> set of watched nodes paths
@ -263,7 +135,10 @@ public:
void clearDeadWatches(int64_t session_id);
/// Get current zxid
int64_t getZXID() const { return zxid; }
int64_t getZXID() const
{
return zxid;
}
const String superdigest;
@ -287,53 +162,78 @@ public:
/// Process user request and return response.
/// check_acl = false only when converting data from ZooKeeper.
ResponsesForSessions processRequest(
const Coordination::ZooKeeperRequestPtr & request,
int64_t session_id,
int64_t time,
std::optional<int64_t> new_last_zxid,
bool check_acl = true,
bool is_local = false);
void preprocessRequest(
const Coordination::ZooKeeperRequestPtr & request, int64_t session_id, int64_t time, int64_t new_last_zxid, bool check_acl = true);
void rollbackRequest(int64_t rollback_zxid);
ResponsesForSessions processRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id, int64_t time, std::optional<int64_t> new_last_zxid, bool check_acl = true);
void finalize();
/// Set of methods for creating snapshots
/// Turn on snapshot mode, so data inside Container is not deleted, but replaced with new version.
void enableSnapshotMode(size_t up_to_version) { container.enableSnapshotMode(up_to_version); }
void enableSnapshotMode(size_t up_to_version)
{
container.enableSnapshotMode(up_to_version);
}
/// Turn off snapshot mode.
void disableSnapshotMode() { container.disableSnapshotMode(); }
void disableSnapshotMode()
{
container.disableSnapshotMode();
}
Container::const_iterator getSnapshotIteratorBegin() const { return container.begin(); }
Container::const_iterator getSnapshotIteratorBegin() const
{
return container.begin();
}
/// Clear outdated data from internal container.
void clearGarbageAfterSnapshot() { container.clearOutdatedNodes(); }
void clearGarbageAfterSnapshot()
{
container.clearOutdatedNodes();
}
/// Get all active sessions
const SessionAndTimeout & getActiveSessions() const { return session_and_timeout; }
const SessionAndTimeout & getActiveSessions() const
{
return session_and_timeout;
}
/// Get all dead sessions
std::vector<int64_t> getDeadSessions() const { return session_expiry_queue.getExpiredSessions(); }
std::vector<int64_t> getDeadSessions() const
{
return session_expiry_queue.getExpiredSessions();
}
/// Introspection functions mostly used in 4-letter commands
uint64_t getNodesCount() const { return container.size(); }
uint64_t getNodesCount() const
{
return container.size();
}
uint64_t getApproximateDataSize() const { return container.getApproximateDataSize(); }
uint64_t getApproximateDataSize() const
{
return container.getApproximateDataSize();
}
uint64_t getArenaDataSize() const { return container.keyArenaSize(); }
uint64_t getArenaDataSize() const
{
return container.keyArenaSize();
}
uint64_t getTotalWatchesCount() const;
uint64_t getWatchedPathsCount() const { return watches.size() + list_watches.size(); }
uint64_t getWatchedPathsCount() const
{
return watches.size() + list_watches.size();
}
uint64_t getSessionsWithWatchesCount() const;
uint64_t getSessionWithEphemeralNodesCount() const { return ephemerals.size(); }
uint64_t getSessionWithEphemeralNodesCount() const
{
return ephemerals.size();
}
uint64_t getTotalEphemeralNodesCount() const;
void dumpWatches(WriteBufferFromOwnString & buf) const;

View File

@ -12,6 +12,7 @@ public:
WriteBufferFromNuraftBuffer();
nuraft::ptr<nuraft::buffer> getBuffer();
bool isFinished() const { return finalized; }
~WriteBufferFromNuraftBuffer() override;

View File

@ -520,7 +520,6 @@ bool deserializeTxn(KeeperStorage & storage, ReadBuffer & in, Poco::Logger * /*l
if (request->getOpNum() == Coordination::OpNum::Multi && hasErrorsInMultiRequest(request))
return true;
storage.preprocessRequest(request, session_id, time, zxid, /* check_acl = */ false);
storage.processRequest(request, session_id, time, zxid, /* check_acl = */ false);
}
}

View File

@ -1,8 +1,6 @@
#include <chrono>
#include <gtest/gtest.h>
#include "Common/ZooKeeper/IKeeper.h"
#include "Coordination/KeeperStorage.h"
#include "config_core.h"
#if USE_NURAFT
@ -1263,7 +1261,6 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint
changelog.append(entry);
changelog.end_of_append_batch(0, 0);
state_machine->pre_commit(i, changelog.entry_at(i)->get_buf());
state_machine->commit(i, changelog.entry_at(i)->get_buf());
bool snapshot_created = false;
if (i % settings->snapshot_distance == 0)
@ -1308,7 +1305,6 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint
for (size_t i = restore_machine->last_commit_index() + 1; i < restore_changelog.next_slot(); ++i)
{
restore_machine->pre_commit(i, changelog.entry_at(i)->get_buf());
restore_machine->commit(i, changelog.entry_at(i)->get_buf());
}
@ -1411,7 +1407,6 @@ TEST_P(CoordinationTest, TestEphemeralNodeRemove)
request_c->path = "/hello";
request_c->is_ephemeral = true;
auto entry_c = getLogEntryFromZKRequest(0, 1, request_c);
state_machine->pre_commit(1, entry_c->get_buf());
state_machine->commit(1, entry_c->get_buf());
const auto & storage = state_machine->getStorage();
@ -1420,7 +1415,6 @@ TEST_P(CoordinationTest, TestEphemeralNodeRemove)
request_d->path = "/hello";
/// Delete from other session
auto entry_d = getLogEntryFromZKRequest(0, 2, request_d);
state_machine->pre_commit(2, entry_d->get_buf());
state_machine->commit(2, entry_d->get_buf());
EXPECT_EQ(storage.ephemerals.size(), 0);
@ -1783,130 +1777,6 @@ TEST_P(CoordinationTest, TestLogGap)
EXPECT_EQ(changelog1.next_slot(), 61);
}
/// Test helper: takes the first entry of `responses` and returns its `response`
/// cast to `ResponseType` (returned by value, i.e. as a copy).
/// Records a non-fatal gtest failure when `responses` is empty; the first element
/// is still accessed afterwards, so callers must pass a non-empty collection.
template <typename ResponseType>
ResponseType getSingleResponse(const auto & responses)
{
    EXPECT_FALSE(responses.empty());

    const auto & first_entry = responses[0];
    return dynamic_cast<ResponseType &>(*first_entry.response);
}
/// Checks that requests which were only preprocessed are not visible through a
/// committed read, and that processing them afterwards (in zxid order) yields the
/// expected per-request results for create / get / set / remove on a single path.
TEST_P(CoordinationTest, TestUncommittedStateBasicCrud)
{
using namespace DB;
using namespace Coordination;
DB::KeeperStorage storage{500, ""};
constexpr std::string_view path = "/test";
/// Issues a Get for `path` directly through processRequest with no zxid (std::nullopt)
/// and returns the node data, or std::nullopt when the response error is not ZOK.
/// NOTE(review): the two trailing `true` arguments are presumably check_acl / is_local — confirm against KeeperStorage::processRequest.
const auto get_committed_data = [&]() -> std::optional<String>
{
auto request = std::make_shared<ZooKeeperGetRequest>();
request->path = path;
auto responses = storage.processRequest(request, 0, 0, std::nullopt, true, true);
const auto & get_response = getSingleResponse<ZooKeeperGetResponse>(responses);
if (get_response.error != Error::ZOK)
return std::nullopt;
return get_response.data;
};
/// Preprocesses a Get for `path` under the given zxid and returns the request
/// so that it can be processed later with the same zxid.
const auto preprocess_get = [&](int64_t zxid)
{
auto get_request = std::make_shared<ZooKeeperGetRequest>();
get_request->path = path;
storage.preprocessRequest(get_request, 0, 0, zxid);
return get_request;
};
/// Phase 1: only preprocess requests (zxid 1..8). After each step the committed
/// read must still report no data.
const auto create_request = std::make_shared<ZooKeeperCreateRequest>();
create_request->path = path;
create_request->data = "initial_data";
/// The same create is preprocessed twice; the second one is expected to fail on processing (see below).
storage.preprocessRequest(create_request, 0, 0, 1);
storage.preprocessRequest(create_request, 0, 0, 2);
ASSERT_FALSE(get_committed_data());
const auto after_create_get = preprocess_get(3);
ASSERT_FALSE(get_committed_data());
const auto set_request = std::make_shared<ZooKeeperSetRequest>();
set_request->path = path;
set_request->data = "new_data";
storage.preprocessRequest(set_request, 0, 0, 4);
const auto after_set_get = preprocess_get(5);
ASSERT_FALSE(get_committed_data());
const auto remove_request = std::make_shared<ZooKeeperRemoveRequest>();
remove_request->path = path;
/// The same remove is preprocessed twice; the second one is expected to fail on processing (see below).
storage.preprocessRequest(remove_request, 0, 0, 6);
storage.preprocessRequest(remove_request, 0, 0, 7);
const auto after_remove_get = preprocess_get(8);
ASSERT_FALSE(get_committed_data());
/// Phase 2: process the same requests in zxid order and check each response.
{
/// zxid 1: first create succeeds.
const auto responses = storage.processRequest(create_request, 0, 0, 1);
const auto & create_response = getSingleResponse<ZooKeeperCreateResponse>(responses);
ASSERT_EQ(create_response.error, Error::ZOK);
}
{
/// zxid 2: duplicate create reports that the node already exists.
const auto responses = storage.processRequest(create_request, 0, 0, 2);
const auto & create_response = getSingleResponse<ZooKeeperCreateResponse>(responses);
ASSERT_EQ(create_response.error, Error::ZNODEEXISTS);
}
{
/// zxid 3: get preprocessed after the create sees the initial data.
const auto responses = storage.processRequest(after_create_get, 0, 0, 3);
const auto & get_response = getSingleResponse<ZooKeeperGetResponse>(responses);
ASSERT_EQ(get_response.error, Error::ZOK);
ASSERT_EQ(get_response.data, "initial_data");
}
ASSERT_EQ(get_committed_data(), "initial_data");
{
/// zxid 4: set succeeds (the local is named create_response but holds a set response).
const auto responses = storage.processRequest(set_request, 0, 0, 4);
const auto & create_response = getSingleResponse<ZooKeeperSetResponse>(responses);
ASSERT_EQ(create_response.error, Error::ZOK);
}
{
/// zxid 5: get preprocessed after the set sees the new data.
const auto responses = storage.processRequest(after_set_get, 0, 0, 5);
const auto & get_response = getSingleResponse<ZooKeeperGetResponse>(responses);
ASSERT_EQ(get_response.error, Error::ZOK);
ASSERT_EQ(get_response.data, "new_data");
}
ASSERT_EQ(get_committed_data(), "new_data");
{
/// zxid 6: first remove succeeds (the local is named create_response but holds a remove response).
const auto responses = storage.processRequest(remove_request, 0, 0, 6);
const auto & create_response = getSingleResponse<ZooKeeperRemoveResponse>(responses);
ASSERT_EQ(create_response.error, Error::ZOK);
}
{
/// zxid 7: duplicate remove reports that the node does not exist.
const auto responses = storage.processRequest(remove_request, 0, 0, 7);
const auto & create_response = getSingleResponse<ZooKeeperRemoveResponse>(responses);
ASSERT_EQ(create_response.error, Error::ZNONODE);
}
{
/// zxid 8: get preprocessed after the remove finds no node.
const auto responses = storage.processRequest(after_remove_get, 0, 0, 8);
const auto & get_response = getSingleResponse<ZooKeeperGetResponse>(responses);
ASSERT_EQ(get_response.error, Error::ZNONODE);
}
/// After everything is processed the node is gone again.
ASSERT_FALSE(get_committed_data());
}
INSTANTIATE_TEST_SUITE_P(CoordinationTestSuite,
CoordinationTest,

View File

@ -1,5 +1,6 @@
#pragma once
#include <unordered_set>
#include <vector>
@ -7,6 +8,8 @@ namespace DB
{
using ColumnNumbers = std::vector<size_t>;
using ColumnNumbersSet = std::unordered_set<size_t>;
using ColumnNumbersList = std::vector<ColumnNumbers>;
using ColumnNumbersSetList = std::vector<ColumnNumbersSet>;
}

View File

@ -16,6 +16,7 @@ using NameOrderedSet = std::set<std::string>;
using NameToNameMap = std::unordered_map<std::string, std::string>;
using NameToNameSetMap = std::unordered_map<std::string, NameSet>;
using NameToNameVector = std::vector<std::pair<std::string, std::string>>;
using NameToIndexMap = std::unordered_map<std::string, size_t>;
using NameWithAlias = std::pair<std::string, std::string>;
using NamesWithAliases = std::vector<NameWithAlias>;

View File

@ -1,3 +1,4 @@
#include <cstddef>
#include <Core/NamesAndTypes.h>
#include <base/sort.h>
@ -214,4 +215,17 @@ std::optional<NameAndTypePair> NamesAndTypesList::tryGetByName(const std::string
}
return {};
}
size_t NamesAndTypesList::getPosByName(const std::string &name) const noexcept
{
size_t pos = 0;
for (const NameAndTypePair & column : *this)
{
if (column.name == name)
break;
++pos;
}
return pos;
}
}

View File

@ -105,8 +105,11 @@ public:
/// Check that the list contains a column with the given name
bool contains(const String & name) const;
/// Try to get column by name, return empty optional if column not found
/// Try to get column by name, returns empty optional if column not found
std::optional<NameAndTypePair> tryGetByName(const std::string & name) const;
/// Try to get column position by name, returns number of columns if column isn't found
size_t getPosByName(const std::string & name) const noexcept;
};
using NamesAndTypesLists = std::vector<NamesAndTypesList>;

View File

@ -86,6 +86,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \
M(Bool, s3_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables.", 0) \
M(Bool, s3_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in s3 engine tables", 0) \
M(Bool, enable_s3_requests_logging, false, "Enable very explicit logging of S3 requests. Makes sense for debug only.", 0) \
M(UInt64, hdfs_replication, 0, "The actual number of replications can be specified when the hdfs file is created.", 0) \
M(Bool, hdfs_truncate_on_insert, false, "Enables or disables truncate before insert in hdfs engine tables", 0) \
M(Bool, hdfs_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in hdfs engine tables", 0) \
@ -566,7 +567,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
\
M(UInt64, remote_fs_read_max_backoff_ms, 10000, "Max wait time when trying to read data for remote disk", 0) \
M(UInt64, remote_fs_read_backoff_max_tries, 5, "Max attempts to read with backoff", 0) \
M(Bool, enable_filesystem_cache, true, "Use cache for remote filesystem. This setting does not turn on/off cache for disks (must me done via disk config), but allows to bypass cache for some queries if intended", 0) \
M(Bool, enable_filesystem_cache, true, "Use cache for remote filesystem. This setting does not turn on/off cache for disks (must be done via disk config), but allows to bypass cache for some queries if intended", 0) \
M(UInt64, filesystem_cache_max_wait_sec, 5, "Allow to wait at most this number of seconds for download of current remote_fs_buffer_size bytes, and skip cache if exceeded", 0) \
M(Bool, enable_filesystem_cache_on_write_operations, false, "Write into cache on write operations. To actually work this setting requires be added to disk config too", 0) \
M(Bool, enable_filesystem_cache_log, false, "Allows to record the filesystem caching log for each query", 0) \

View File

@ -68,6 +68,14 @@
namespace fs = std::filesystem;
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_SET_SIGNAL_HANDLER;
}
}
DB::PipeFDs signal_pipe;
@ -76,7 +84,8 @@ DB::PipeFDs signal_pipe;
*/
static void call_default_signal_handler(int sig)
{
signal(sig, SIG_DFL);
if (SIG_ERR == signal(sig, SIG_DFL))
DB::throwFromErrno("Cannot set signal handler.", DB::ErrorCodes::CANNOT_SET_SIGNAL_HANDLER);
raise(sig);
}
@ -498,9 +507,8 @@ BaseDaemon::~BaseDaemon()
signal_listener_thread.join();
/// Reset signals to SIG_DFL to avoid trying to write to the signal_pipe that will be closed after.
for (int sig : handled_signals)
{
signal(sig, SIG_DFL);
}
if (SIG_ERR == signal(sig, SIG_DFL))
DB::throwFromErrno("Cannot set signal handler.", DB::ErrorCodes::CANNOT_SET_SIGNAL_HANDLER);
signal_pipe.close();
}

View File

@ -89,7 +89,7 @@ bool DatabaseMySQL::empty() const
return true;
for (const auto & [table_name, storage_info] : local_tables_cache)
if (!remove_or_detach_tables.count(table_name))
if (!remove_or_detach_tables.contains(table_name))
return false;
return true;
@ -103,7 +103,7 @@ DatabaseTablesIteratorPtr DatabaseMySQL::getTablesIterator(ContextPtr local_cont
fetchTablesIntoLocalCache(local_context);
for (const auto & [table_name, modify_time_and_storage] : local_tables_cache)
if (!remove_or_detach_tables.count(table_name) && (!filter_by_table_name || filter_by_table_name(table_name)))
if (!remove_or_detach_tables.contains(table_name) && (!filter_by_table_name || filter_by_table_name(table_name)))
tables[table_name] = modify_time_and_storage.second;
return std::make_unique<DatabaseTablesSnapshotIterator>(tables, database_name);
@ -120,7 +120,7 @@ StoragePtr DatabaseMySQL::tryGetTable(const String & mysql_table_name, ContextPt
fetchTablesIntoLocalCache(local_context);
if (!remove_or_detach_tables.count(mysql_table_name) && local_tables_cache.find(mysql_table_name) != local_tables_cache.end())
if (!remove_or_detach_tables.contains(mysql_table_name) && local_tables_cache.find(mysql_table_name) != local_tables_cache.end())
return local_tables_cache[mysql_table_name].second;
return StoragePtr{};
@ -349,11 +349,11 @@ void DatabaseMySQL::attachTable(ContextPtr /* context_ */, const String & table_
{
std::lock_guard<std::mutex> lock{mutex};
if (!local_tables_cache.count(table_name))
if (!local_tables_cache.contains(table_name))
throw Exception("Cannot attach table " + backQuoteIfNeed(database_name) + "." + backQuoteIfNeed(table_name) +
" because it does not exist.", ErrorCodes::UNKNOWN_TABLE);
if (!remove_or_detach_tables.count(table_name))
if (!remove_or_detach_tables.contains(table_name))
throw Exception("Cannot attach table " + backQuoteIfNeed(database_name) + "." + backQuoteIfNeed(table_name) +
" because it already exists.", ErrorCodes::TABLE_ALREADY_EXISTS);
@ -372,11 +372,11 @@ StoragePtr DatabaseMySQL::detachTable(ContextPtr /* context */, const String & t
{
std::lock_guard<std::mutex> lock{mutex};
if (remove_or_detach_tables.count(table_name))
if (remove_or_detach_tables.contains(table_name))
throw Exception("Table " + backQuoteIfNeed(database_name) + "." + backQuoteIfNeed(table_name) + " is dropped",
ErrorCodes::TABLE_IS_DROPPED);
if (!local_tables_cache.count(table_name))
if (!local_tables_cache.contains(table_name))
throw Exception("Table " + backQuoteIfNeed(database_name) + "." + backQuoteIfNeed(table_name) + " doesn't exist.",
ErrorCodes::UNKNOWN_TABLE);
@ -412,7 +412,7 @@ void DatabaseMySQL::detachTablePermanently(ContextPtr, const String & table_name
fs::path remove_flag = fs::path(getMetadataPath()) / (escapeForFileName(table_name) + suffix);
if (remove_or_detach_tables.count(table_name))
if (remove_or_detach_tables.contains(table_name))
throw Exception(ErrorCodes::TABLE_IS_DROPPED, "Table {}.{} is dropped", backQuoteIfNeed(database_name), backQuoteIfNeed(table_name));
if (fs::exists(remove_flag))

View File

@ -252,7 +252,7 @@ Strings DictionaryStructure::getKeysNames() const
static void checkAttributeKeys(const Poco::Util::AbstractConfiguration::Keys & keys)
{
static const std::unordered_set<std::string_view> valid_keys
= {"name", "type", "expression", "null_value", "hierarchical", "injective", "is_object_id"};
= {"name", "type", "expression", "null_value", "hierarchical", "bidirectional", "injective", "is_object_id"};
for (const auto & key : keys)
{
@ -350,6 +350,7 @@ std::vector<DictionaryAttribute> DictionaryStructure::getAttributes(
}
const auto hierarchical = config.getBool(prefix + "hierarchical", false);
const auto bidirectional = config.getBool(prefix + "bidirectional", false);
const auto injective = config.getBool(prefix + "injective", false);
const auto is_object_id = config.getBool(prefix + "is_object_id", false);
@ -362,6 +363,9 @@ std::vector<DictionaryAttribute> DictionaryStructure::getAttributes(
if (has_hierarchy && hierarchical)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Only one hierarchical attribute supported");
if (bidirectional && !hierarchical)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bidirectional can only be applied to hierarchical attributes");
has_hierarchy = has_hierarchy || hierarchical;
res_attributes.emplace_back(DictionaryAttribute{
@ -372,6 +376,7 @@ std::vector<DictionaryAttribute> DictionaryStructure::getAttributes(
expression,
null_value,
hierarchical,
bidirectional,
injective,
is_object_id,
is_nullable});

View File

@ -67,6 +67,7 @@ struct DictionaryAttribute final
const std::string expression;
const Field null_value;
const bool hierarchical;
const bool bidirectional;
const bool injective;
const bool is_object_id;
const bool is_nullable;

View File

@ -43,6 +43,7 @@ FlatDictionary::FlatDictionary(
{
createAttributes();
loadData();
buildHierarchyParentToChildIndexIfNeeded();
calculateBytesAllocated();
}
@ -244,30 +245,43 @@ ColumnUInt8::Ptr FlatDictionary::isInHierarchy(
return result;
}
ColumnPtr FlatDictionary::getDescendants(
ColumnPtr key_column,
const DataTypePtr &,
size_t level) const
DictionaryHierarchyParentToChildIndexPtr FlatDictionary::getHierarchicalIndex() const
{
PaddedPODArray<UInt64> keys_backup;
const auto & keys = getColumnVectorData(this, key_column, keys_backup);
if (hierarhical_index)
return hierarhical_index;
size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index;
const auto & hierarchical_attribute = attributes[hierarchical_attribute_index];
const ContainerType<UInt64> & parent_keys = std::get<ContainerType<UInt64>>(hierarchical_attribute.container);
HashMap<UInt64, PaddedPODArray<UInt64>> parent_to_child;
parent_to_child.reserve(element_count);
for (size_t i = 0; i < parent_keys.size(); ++i)
UInt64 child_keys_size = static_cast<UInt64>(parent_keys.size());
for (UInt64 child_key = 0; child_key < child_keys_size; ++child_key)
{
auto parent_key = parent_keys[i];
if (!loaded_keys[child_key])
continue;
if (loaded_keys[i])
parent_to_child[parent_key].emplace_back(static_cast<UInt64>(i));
auto parent_key = parent_keys[child_key];
parent_to_child[parent_key].emplace_back(child_key);
}
return std::make_shared<DictionaryHierarchicalParentToChildIndex>(parent_to_child);
}
ColumnPtr FlatDictionary::getDescendants(
ColumnPtr key_column,
const DataTypePtr &,
size_t level,
DictionaryHierarchicalParentToChildIndexPtr parent_to_child_index) const
{
PaddedPODArray<UInt64> keys_backup;
const auto & keys = getColumnVectorData(this, key_column, keys_backup);
size_t keys_found;
auto result = getKeysDescendantsArray(keys, parent_to_child, level, keys_found);
auto result = getKeysDescendantsArray(keys, *parent_to_child_index, level, keys_found);
query_count.fetch_add(keys.size(), std::memory_order_relaxed);
found_count.fetch_add(keys_found, std::memory_order_relaxed);
@ -400,6 +414,15 @@ void FlatDictionary::loadData()
throw Exception(ErrorCodes::DICTIONARY_IS_EMPTY, "{}: dictionary source is empty and 'require_nonempty' property is set.", getFullName());
}
/// Builds and stores the parent-to-child index when the dictionary structure has a
/// hierarchical attribute and that attribute is marked as bidirectional.
/// Otherwise leaves the stored index untouched.
void FlatDictionary::buildHierarchyParentToChildIndexIfNeeded()
{
    const auto & hierarchical_attribute_index = dict_struct.hierarchical_attribute_index;

    if (hierarchical_attribute_index && dict_struct.attributes[*hierarchical_attribute_index].bidirectional)
        hierarhical_index = getHierarchicalIndex();
}
void FlatDictionary::calculateBytesAllocated()
{
bytes_allocated += attributes.size() * sizeof(attributes.front());
@ -439,6 +462,12 @@ void FlatDictionary::calculateBytesAllocated()
if (update_field_loaded_block)
bytes_allocated += update_field_loaded_block->allocatedBytes();
if (hierarhical_index)
{
hierarchical_index_bytes_allocated = hierarhical_index->getSizeInBytes();
bytes_allocated += hierarchical_index_bytes_allocated;
}
bytes_allocated += string_arena.size();
}
@ -614,7 +643,7 @@ void registerDictionaryFlat(DictionaryFactory & factory)
const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix);
return std::make_unique<FlatDictionary>(dict_id, dict_struct, std::move(source_ptr), std::move(configuration));
return std::make_unique<FlatDictionary>(dict_id, dict_struct, std::move(source_ptr), configuration);
};
factory.registerLayout("flat", create_layout, false);

View File

@ -92,10 +92,15 @@ public:
ColumnPtr in_key_column,
const DataTypePtr & key_type) const override;
DictionaryHierarchicalParentToChildIndexPtr getHierarchicalIndex() const override;
size_t getHierarchicalIndexBytesAllocated() const override { return hierarchical_index_bytes_allocated; }
ColumnPtr getDescendants(
ColumnPtr key_column,
const DataTypePtr & key_type,
size_t level) const override;
size_t level,
DictionaryHierarchicalParentToChildIndexPtr parent_to_child_index) const override;
Pipe read(const Names & column_names, size_t max_block_size, size_t num_streams) const override;
@ -137,10 +142,15 @@ private:
};
void createAttributes();
void blockToAttributes(const Block & block);
void updateData();
void loadData();
void buildHierarchyParentToChildIndexIfNeeded();
void calculateBytesAllocated();
Attribute createAttribute(const DictionaryAttribute & attribute);
@ -165,6 +175,7 @@ private:
std::vector<bool> loaded_keys;
size_t bytes_allocated = 0;
size_t hierarchical_index_bytes_allocated = 0;
size_t element_count = 0;
size_t bucket_count = 0;
mutable std::atomic<size_t> query_count{0};
@ -172,6 +183,7 @@ private:
BlockPtr update_field_loaded_block;
Arena string_arena;
DictionaryHierarchicalParentToChildIndexPtr hierarhical_index;
};
}

View File

@ -37,6 +37,7 @@ HashedArrayDictionary<dictionary_key_type>::HashedArrayDictionary(
{
createAttributes();
loadData();
buildHierarchyParentToChildIndexIfNeeded();
calculateBytesAllocated();
}
@ -282,18 +283,14 @@ ColumnUInt8::Ptr HashedArrayDictionary<dictionary_key_type>::isInHierarchy(
}
template <DictionaryKeyType dictionary_key_type>
ColumnPtr HashedArrayDictionary<dictionary_key_type>::getDescendants(
ColumnPtr key_column [[maybe_unused]],
const DataTypePtr &,
size_t level [[maybe_unused]]) const
DictionaryHierarchicalParentToChildIndexPtr HashedArrayDictionary<dictionary_key_type>::getHierarchicalIndex() const
{
if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
{
PaddedPODArray<UInt64> keys_backup;
const auto & keys = getColumnVectorData(this, key_column, keys_backup);
if (hierarchical_index)
return hierarchical_index;
size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index;
const auto & hierarchical_attribute = attributes[hierarchical_attribute_index];
const AttributeContainerType<UInt64> & parent_keys_container = std::get<AttributeContainerType<UInt64>>(hierarchical_attribute.container);
@ -306,6 +303,7 @@ ColumnPtr HashedArrayDictionary<dictionary_key_type>::getDescendants(
index_to_key[value] = key;
HashMap<UInt64, PaddedPODArray<UInt64>> parent_to_child;
parent_to_child.reserve(index_to_key.size());
for (size_t i = 0; i < parent_keys_container.size(); ++i)
{
@ -313,13 +311,33 @@ ColumnPtr HashedArrayDictionary<dictionary_key_type>::getDescendants(
if (it == index_to_key.end())
continue;
auto parent_key = it->getMapped();
auto child_key = parent_keys_container[i];
auto child_key = it->getMapped();
auto parent_key = parent_keys_container[i];
parent_to_child[parent_key].emplace_back(child_key);
}
return std::make_shared<DictionaryHierarchicalParentToChildIndex>(parent_to_child);
}
else
{
return nullptr;
}
}
template <DictionaryKeyType dictionary_key_type>
ColumnPtr HashedArrayDictionary<dictionary_key_type>::getDescendants(
ColumnPtr key_column [[maybe_unused]],
const DataTypePtr &,
size_t level [[maybe_unused]],
DictionaryHierarchicalParentToChildIndexPtr parent_to_child_index [[maybe_unused]]) const
{
if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
{
PaddedPODArray<UInt64> keys_backup;
const auto & keys = getColumnVectorData(this, key_column, keys_backup);
size_t keys_found = 0;
auto result = getKeysDescendantsArray(keys, parent_to_child, level, keys_found);
auto result = getKeysDescendantsArray(keys, *parent_to_child_index, level, keys_found);
query_count.fetch_add(keys.size(), std::memory_order_relaxed);
found_count.fetch_add(keys_found, std::memory_order_relaxed);
@ -693,6 +711,16 @@ void HashedArrayDictionary<dictionary_key_type>::loadData()
getFullName());
}
/// Builds and stores the parent-to-child index when the dictionary structure has a
/// hierarchical attribute and that attribute is marked as bidirectional.
/// Otherwise leaves the stored index untouched.
template <DictionaryKeyType dictionary_key_type>
void HashedArrayDictionary<dictionary_key_type>::buildHierarchyParentToChildIndexIfNeeded()
{
    const auto & hierarchical_attribute_index = dict_struct.hierarchical_attribute_index;

    if (hierarchical_attribute_index && dict_struct.attributes[*hierarchical_attribute_index].bidirectional)
        hierarchical_index = getHierarchicalIndex();
}
template <DictionaryKeyType dictionary_key_type>
void HashedArrayDictionary<dictionary_key_type>::calculateBytesAllocated()
{
@ -730,10 +758,16 @@ void HashedArrayDictionary<dictionary_key_type>::calculateBytesAllocated()
bytes_allocated += (*attribute.is_index_null).size();
}
bytes_allocated += string_arena.size();
if (update_field_loaded_block)
bytes_allocated += update_field_loaded_block->allocatedBytes();
if (hierarchical_index)
{
hierarchical_index_bytes_allocated = hierarchical_index->getSizeInBytes();
bytes_allocated += hierarchical_index_bytes_allocated;
}
bytes_allocated += string_arena.size();
}
template <DictionaryKeyType dictionary_key_type>

View File

@ -109,10 +109,15 @@ public:
ColumnPtr in_key_column,
const DataTypePtr & key_type) const override;
DictionaryHierarchicalParentToChildIndexPtr getHierarchicalIndex() const override;
size_t getHierarchicalIndexBytesAllocated() const override { return hierarchical_index_bytes_allocated; }
ColumnPtr getDescendants(
ColumnPtr key_column,
const DataTypePtr & key_type,
size_t level) const override;
size_t level,
DictionaryHierarchicalParentToChildIndexPtr parent_to_child_index) const override;
Pipe read(const Names & column_names, size_t max_block_size, size_t num_streams) const override;
@ -173,6 +178,8 @@ private:
void loadData();
void buildHierarchyParentToChildIndexIfNeeded();
void calculateBytesAllocated();
template <typename KeysProvider>
@ -214,6 +221,7 @@ private:
KeyAttribute key_attribute;
size_t bytes_allocated = 0;
size_t hierarchical_index_bytes_allocated = 0;
size_t element_count = 0;
size_t bucket_count = 0;
mutable std::atomic<size_t> query_count{0};
@ -221,6 +229,7 @@ private:
BlockPtr update_field_loaded_block;
Arena string_arena;
DictionaryHierarchicalParentToChildIndexPtr hierarchical_index;
};
extern template class HashedArrayDictionary<DictionaryKeyType::Simple>;

View File

@ -54,6 +54,7 @@ HashedDictionary<dictionary_key_type, sparse>::HashedDictionary(
{
createAttributes();
loadData();
buildHierarchyParentToChildIndexIfNeeded();
calculateBytesAllocated();
}
@ -317,29 +318,46 @@ ColumnUInt8::Ptr HashedDictionary<dictionary_key_type, sparse>::isInHierarchy(
return nullptr;
}
/// Returns the parent-to-child index of the hierarchical attribute.
/// For non-simple key types returns nullptr.
/// For simple keys: returns the stored index if there is one; otherwise builds a new
/// index from the hierarchical attribute container (each entry maps a key to its
/// parent) and returns it without storing it in the dictionary.
template <DictionaryKeyType dictionary_key_type, bool sparse>
DictionaryHierarchyParentToChildIndexPtr HashedDictionary<dictionary_key_type, sparse>::getHierarchicalIndex() const
{
    if constexpr (dictionary_key_type != DictionaryKeyType::Simple)
    {
        return nullptr;
    }
    else
    {
        if (hierarchical_index)
            return hierarchical_index;

        const auto & hierarchical_attribute = attributes[*dict_struct.hierarchical_attribute_index];
        const CollectionType<UInt64> & child_to_parent = std::get<CollectionType<UInt64>>(hierarchical_attribute.container);

        HashMap<UInt64, PaddedPODArray<UInt64>> parent_to_child;
        parent_to_child.reserve(child_to_parent.size());

        /// Invert the child -> parent mapping into parent -> list of children.
        for (const auto & [child_key, parent_key] : child_to_parent)
            parent_to_child[parent_key].emplace_back(child_key);

        return std::make_shared<DictionaryHierarchicalParentToChildIndex>(parent_to_child);
    }
}
template <DictionaryKeyType dictionary_key_type, bool sparse>
ColumnPtr HashedDictionary<dictionary_key_type, sparse>::getDescendants(
ColumnPtr key_column [[maybe_unused]],
const DataTypePtr &,
size_t level [[maybe_unused]]) const
size_t level [[maybe_unused]],
DictionaryHierarchicalParentToChildIndexPtr parent_to_child_index [[maybe_unused]]) const
{
if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
{
PaddedPODArray<UInt64> keys_backup;
const auto & keys = getColumnVectorData(this, key_column, keys_backup);
size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index;
const auto & hierarchical_attribute = attributes[hierarchical_attribute_index];
const CollectionType<UInt64> & parent_keys = std::get<CollectionType<UInt64>>(hierarchical_attribute.container);
HashMap<UInt64, PaddedPODArray<UInt64>> parent_to_child;
for (const auto & [key, value] : parent_keys)
parent_to_child[value].emplace_back(key);
size_t keys_found;
auto result = getKeysDescendantsArray(keys, parent_to_child, level, keys_found);
auto result = getKeysDescendantsArray(keys, *parent_to_child_index, level, keys_found);
query_count.fetch_add(keys.size(), std::memory_order_relaxed);
found_count.fetch_add(keys_found, std::memory_order_relaxed);
@ -347,7 +365,9 @@ ColumnPtr HashedDictionary<dictionary_key_type, sparse>::getDescendants(
return result;
}
else
{
return nullptr;
}
}
template <DictionaryKeyType dictionary_key_type, bool sparse>
@ -631,6 +651,16 @@ void HashedDictionary<dictionary_key_type, sparse>::loadData()
getFullName());
}
/// Builds and stores the parent-to-child index when the dictionary structure has a
/// hierarchical attribute and that attribute is marked as bidirectional.
/// Otherwise leaves the stored index untouched.
template <DictionaryKeyType dictionary_key_type, bool sparse>
void HashedDictionary<dictionary_key_type, sparse>::buildHierarchyParentToChildIndexIfNeeded()
{
    const auto & hierarchical_attribute_index = dict_struct.hierarchical_attribute_index;

    if (hierarchical_attribute_index && dict_struct.attributes[*hierarchical_attribute_index].bidirectional)
        hierarchical_index = getHierarchicalIndex();
}
template <DictionaryKeyType dictionary_key_type, bool sparse>
void HashedDictionary<dictionary_key_type, sparse>::calculateBytesAllocated()
{
@ -684,10 +714,16 @@ void HashedDictionary<dictionary_key_type, sparse>::calculateBytesAllocated()
}
}
bytes_allocated += string_arena.size();
if (update_field_loaded_block)
bytes_allocated += update_field_loaded_block->allocatedBytes();
if (hierarchical_index)
{
hierarchical_index_bytes_allocated = hierarchical_index->getSizeInBytes();
bytes_allocated += hierarchical_index_bytes_allocated;
}
bytes_allocated += string_arena.size();
}
template <DictionaryKeyType dictionary_key_type, bool sparse>

View File

@ -110,10 +110,15 @@ public:
ColumnPtr in_key_column,
const DataTypePtr & key_type) const override;
DictionaryHierarchicalParentToChildIndexPtr getHierarchicalIndex() const override;
size_t getHierarchicalIndexBytesAllocated() const override { return hierarchical_index_bytes_allocated; }
ColumnPtr getDescendants(
ColumnPtr key_column,
const DataTypePtr & key_type,
size_t level) const override;
size_t level,
DictionaryHierarchicalParentToChildIndexPtr parent_to_child_index) const override;
Pipe read(const Names & column_names, size_t max_block_size, size_t num_streams) const override;
@ -194,6 +199,8 @@ private:
void loadData();
void buildHierarchyParentToChildIndexIfNeeded();
void calculateBytesAllocated();
template <typename AttributeType, bool is_nullable, typename ValueSetter, typename DefaultValueExtractor>
@ -218,6 +225,7 @@ private:
std::vector<Attribute> attributes;
size_t bytes_allocated = 0;
size_t hierarchical_index_bytes_allocated = 0;
size_t element_count = 0;
size_t bucket_count = 0;
mutable std::atomic<size_t> query_count{0};
@ -226,6 +234,7 @@ private:
BlockPtr update_field_loaded_block;
Arena string_arena;
NoAttributesCollectionType no_attributes_container;
DictionaryHierarchicalParentToChildIndexPtr hierarchical_index;
};
extern template class HashedDictionary<DictionaryKeyType::Simple, false>;

View File

@ -8,6 +8,22 @@ namespace ErrorCodes
extern const int UNSUPPORTED_METHOD;
}
namespace detail
{
    /// Moves the elements and offsets buffers out of `elements_and_offsets` into a
    /// UInt64 column and an offsets column, and wraps both into an array column.
    ColumnPtr convertElementsAndOffsetsIntoArray(ElementsAndOffsets && elements_and_offsets)
    {
        auto nested_elements = ColumnVector<UInt64>::create();
        nested_elements->getData() = std::move(elements_and_offsets.elements);

        auto array_offsets = ColumnVector<IColumn::Offset>::create();
        array_offsets->getData() = std::move(elements_and_offsets.offsets);

        return ColumnArray::create(std::move(nested_elements), std::move(array_offsets));
    }
}
namespace
{
/** In case of cache or direct dictionary we do not have a structure with child to parent representation.
@ -84,6 +100,26 @@ namespace
}
}
/// Collects descendants of `requested_keys` using `parent_to_child_index` and returns
/// them as an array column.
/// `level == 0` uses the "all descendants" strategy; any other value uses the
/// "descendants at a specific level" strategy with that level.
/// `valid_keys` is an output parameter passed through to detail::getDescendants.
ColumnPtr getKeysDescendantsArray(
    const PaddedPODArray<UInt64> & requested_keys,
    const DictionaryHierarchicalParentToChildIndex & parent_to_child_index,
    size_t level,
    size_t & valid_keys)
{
    if (level != 0)
    {
        detail::GetDescendantsAtSpecificLevelStrategy specific_level_strategy { .level = level };
        auto descendants = detail::getDescendants(requested_keys, parent_to_child_index, specific_level_strategy, valid_keys);
        return detail::convertElementsAndOffsetsIntoArray(std::move(descendants));
    }

    detail::GetAllDescendantsStrategy all_levels_strategy { .level = level };
    auto descendants = detail::getDescendants(requested_keys, parent_to_child_index, all_levels_strategy, valid_keys);
    return detail::convertElementsAndOffsetsIntoArray(std::move(descendants));
}
ColumnPtr getKeysHierarchyDefaultImplementation(
const IDictionary * dictionary,
ColumnPtr key_column,

View File

@ -14,25 +14,65 @@
namespace DB
{
class DictionaryHierarchicalParentToChildIndex;
using DictionaryHierarchyParentToChildIndexPtr = std::shared_ptr<DictionaryHierarchicalParentToChildIndex>;
class DictionaryHierarchicalParentToChildIndex
{
public:
struct KeysRange
{
UInt32 start_index;
UInt32 end_index;
};
explicit DictionaryHierarchicalParentToChildIndex(const HashMap<UInt64, PaddedPODArray<UInt64>> & parent_to_children_map_)
{
size_t parent_to_children_map_size = parent_to_children_map_.size();
keys.reserve(parent_to_children_map_size);
parent_to_children_keys_range.reserve(parent_to_children_map_size);
for (auto & [parent, children] : parent_to_children_map_)
{
size_t keys_size = keys.size();
UInt32 start_index = static_cast<UInt32>(keys_size);
UInt32 end_index = start_index + static_cast<UInt32>(children.size());
keys.insert(children.begin(), children.end());
parent_to_children_keys_range[parent] = KeysRange{start_index, end_index};
}
}
size_t getSizeInBytes() const
{
return parent_to_children_keys_range.getBufferSizeInBytes() + (keys.size() * sizeof(UInt64));
}
/// Map parent key to range of children from keys array
HashMap<UInt64, KeysRange> parent_to_children_keys_range;
/// Array of keys in hierarchy
PaddedPODArray<UInt64> keys;
};
namespace detail
{
template <typename KeyType>
struct ElementsAndOffsets
{
PaddedPODArray<KeyType> elements;
PaddedPODArray<UInt64> elements;
PaddedPODArray<IColumn::Offset> offsets;
};
template <typename T>
struct IsKeyValidFuncInterface
{
bool operator()(T key [[maybe_unused]]) { return false; }
bool operator()(UInt64 key [[maybe_unused]]) { return false; }
};
template <typename T>
struct GetParentKeyFuncInterface
{
std::optional<T> operator()(T key [[maybe_unused]]) { return {}; }
std::optional<UInt64> operator()(UInt64 key [[maybe_unused]]) { return {}; }
};
/** Calculate hierarchy for keys iterating the hierarchy from child to parent using get_parent_key_func provided by client.
@ -54,16 +94,16 @@ namespace detail
* Elements: [1, 2, 1, 3, 1, 4, 2, 1]
* Offsets: [1, 3, 5, 8, 8]
*/
template <typename KeyType, typename IsKeyValidFunc, typename GetParentKeyFunc>
ElementsAndOffsets<KeyType> getHierarchy(
const PaddedPODArray<KeyType> & keys,
const KeyType & hierarchy_null_value,
template <typename IsKeyValidFunc, typename GetParentKeyFunc>
ElementsAndOffsets getHierarchy(
const PaddedPODArray<UInt64> & keys,
const UInt64 & hierarchy_null_value,
IsKeyValidFunc && is_key_valid_func,
GetParentKeyFunc && get_parent_key_func)
{
size_t hierarchy_keys_size = keys.size();
PaddedPODArray<KeyType> elements;
PaddedPODArray<UInt64> elements;
elements.reserve(hierarchy_keys_size);
PaddedPODArray<IColumn::Offset> offsets;
@ -75,7 +115,7 @@ namespace detail
size_t array_element_offset;
};
HashMap<KeyType, OffsetInArray> already_processes_keys_to_offset;
HashMap<UInt64, OffsetInArray> already_processes_keys_to_offset;
already_processes_keys_to_offset.reserve(hierarchy_keys_size);
for (size_t i = 0; i < hierarchy_keys_size; ++i)
@ -123,7 +163,7 @@ namespace detail
elements.emplace_back(hierarchy_key);
++current_hierarchy_depth;
std::optional<KeyType> parent_key = std::forward<GetParentKeyFunc>(get_parent_key_func)(hierarchy_key);
std::optional<UInt64> parent_key = std::forward<GetParentKeyFunc>(get_parent_key_func)(hierarchy_key);
if (!parent_key.has_value())
break;
@ -134,7 +174,7 @@ namespace detail
offsets.emplace_back(elements.size());
}
ElementsAndOffsets<KeyType> result = {std::move(elements), std::move(offsets)};
ElementsAndOffsets result = {std::move(elements), std::move(offsets)};
return result;
}
@ -146,11 +186,11 @@ namespace detail
*
* Note: keys size must be equal to in_keys size.
*/
template <typename KeyType, typename IsKeyValidFunc, typename GetParentKeyFunc>
template <typename IsKeyValidFunc, typename GetParentKeyFunc>
PaddedPODArray<UInt8> getIsInHierarchy(
const PaddedPODArray<KeyType> & keys,
const PaddedPODArray<KeyType> & in_keys,
const KeyType & hierarchy_null_value,
const PaddedPODArray<UInt64> & keys,
const PaddedPODArray<UInt64> & in_keys,
const UInt64 & hierarchy_null_value,
IsKeyValidFunc && is_key_valid_func,
GetParentKeyFunc && get_parent_func)
{
@ -159,7 +199,7 @@ namespace detail
PaddedPODArray<UInt8> result;
result.resize_fill(keys.size());
detail::ElementsAndOffsets<KeyType> hierarchy = detail::getHierarchy(
detail::ElementsAndOffsets hierarchy = detail::getHierarchy(
keys,
hierarchy_null_value,
std::forward<IsKeyValidFunc>(is_key_valid_func),
@ -216,19 +256,22 @@ namespace detail
* Result: [1], [2, 3], [4], [], [];
* Offsets: [1, 3, 4, 4, 4];
*/
template <typename KeyType, typename Strategy>
ElementsAndOffsets<KeyType> getDescendants(
const PaddedPODArray<KeyType> & keys,
const HashMap<KeyType, PaddedPODArray<KeyType>> & parent_to_child,
template <typename Strategy>
ElementsAndOffsets getDescendants(
const PaddedPODArray<UInt64> & keys,
const DictionaryHierarchicalParentToChildIndex & parent_to_child_index,
Strategy strategy,
size_t & valid_keys)
{
auto & parent_to_children_keys_range = parent_to_child_index.parent_to_children_keys_range;
auto & children_keys = parent_to_child_index.keys;
/// If strategy is GetAllDescendantsStrategy we try to cache and later reuse previously calculated descendants.
/// If strategy is GetDescendantsAtSpecificLevelStrategy we do not use the cache.
size_t keys_size = keys.size();
valid_keys = 0;
PaddedPODArray<KeyType> descendants;
PaddedPODArray<UInt64> descendants;
descendants.reserve(keys_size);
PaddedPODArray<IColumn::Offset> descendants_offsets;
@ -241,18 +284,18 @@ namespace detail
};
static constexpr Int64 key_range_requires_update = -1;
HashMap<KeyType, Range> already_processed_keys_to_range [[maybe_unused]];
HashMap<UInt64, Range> already_processed_keys_to_range [[maybe_unused]];
if constexpr (std::is_same_v<Strategy, GetAllDescendantsStrategy>)
already_processed_keys_to_range.reserve(keys_size);
struct KeyAndDepth
{
KeyType key;
UInt64 key;
Int64 depth;
};
HashSet<KeyType> already_processed_keys_during_loop;
HashSet<UInt64> already_processed_keys_during_loop;
already_processed_keys_during_loop.reserve(keys_size);
PaddedPODArray<KeyAndDepth> next_keys_to_process_stack;
@ -262,9 +305,9 @@ namespace detail
for (size_t i = 0; i < keys_size; ++i)
{
const KeyType & requested_key = keys[i];
const UInt64 & requested_key = keys[i];
if (parent_to_child.find(requested_key) == nullptr)
if (parent_to_children_keys_range.find(requested_key) == nullptr)
{
descendants_offsets.emplace_back(descendants.size());
continue;
@ -282,7 +325,7 @@ namespace detail
{
KeyAndDepth key_to_process = next_keys_to_process_stack.back();
KeyType key = key_to_process.key;
UInt64 key = key_to_process.key;
Int64 depth = key_to_process.depth;
next_keys_to_process_stack.pop_back();
@ -329,7 +372,7 @@ namespace detail
}
}
const auto * it = parent_to_child.find(key);
const auto * it = parent_to_children_keys_range.find(key);
if (!it || depth >= DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH)
continue;
@ -352,15 +395,26 @@ namespace detail
++depth;
const auto & children = it->getMapped();
DictionaryHierarchicalParentToChildIndex::KeysRange children_range = it->getMapped();
for (auto child_key : children)
for (; children_range.start_index < children_range.end_index; ++children_range.start_index)
{
auto child_key = children_keys[children_range.start_index];
/// In case of GetAllDescendantsStrategy we add any descendant to result array
/// If strategy is GetDescendantsAtSpecificLevelStrategy we require depth == level
if (std::is_same_v<Strategy, GetAllDescendantsStrategy> || depth == level)
if constexpr (std::is_same_v<Strategy, GetAllDescendantsStrategy>)
descendants.emplace_back(child_key);
if constexpr (std::is_same_v<Strategy, GetDescendantsAtSpecificLevelStrategy>)
{
if (depth == level)
{
descendants.emplace_back(child_key);
continue;
}
}
next_keys_to_process_stack.emplace_back(KeyAndDepth{child_key, depth});
}
}
@ -370,24 +424,12 @@ namespace detail
descendants_offsets.emplace_back(descendants.size());
}
ElementsAndOffsets<KeyType> result = {std::move(descendants), std::move(descendants_offsets)};
ElementsAndOffsets result = {std::move(descendants), std::move(descendants_offsets)};
return result;
}
/// Converts ElementAndOffsets structure into ArrayColumn
template <typename KeyType>
ColumnPtr convertElementsAndOffsetsIntoArray(ElementsAndOffsets<KeyType> && elements_and_offsets)
{
    /// The elements become the nested data column of the array.
    auto nested_column = ColumnVector<KeyType>::create();
    nested_column->getData() = std::move(elements_and_offsets.elements);

    /// The offsets become the offsets column of the array.
    auto array_offsets_column = ColumnVector<IColumn::Offset>::create();
    array_offsets_column->getData() = std::move(elements_and_offsets.offsets);

    return ColumnArray::create(std::move(nested_column), std::move(array_offsets_column));
}
ColumnPtr convertElementsAndOffsetsIntoArray(ElementsAndOffsets && elements_and_offsets);
}
/// Returns hierarchy array column for keys
@ -432,26 +474,11 @@ ColumnUInt8::Ptr getKeysIsInHierarchyColumn(
/// Returns descendants array column for keys
///
/// @param valid_keys - number of keys that are valid in parent_to_child map
template <typename KeyType>
ColumnPtr getKeysDescendantsArray(
const PaddedPODArray<KeyType> & requested_keys,
const HashMap<KeyType, PaddedPODArray<KeyType>> & parent_to_child,
const PaddedPODArray<UInt64> & requested_keys,
const DictionaryHierarchicalParentToChildIndex & parent_to_child_index,
size_t level,
size_t & valid_keys)
{
if (level == 0)
{
detail::GetAllDescendantsStrategy strategy { .level = level };
auto elements_and_offsets = detail::getDescendants(requested_keys, parent_to_child, strategy, valid_keys);
return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets));
}
else
{
detail::GetDescendantsAtSpecificLevelStrategy strategy { .level = level };
auto elements_and_offsets = detail::getDescendants(requested_keys, parent_to_child, strategy, valid_keys);
return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets));
}
}
size_t & valid_keys);
/** Default getHierarchy implementation for dictionaries that does not have structure with child to parent representation.
* Implementation will build such structure with getColumn calls, and then getHierarchy for such structure.

View File

@ -24,6 +24,9 @@ namespace ErrorCodes
class IDictionary;
using DictionaryPtr = std::unique_ptr<IDictionary>;
class DictionaryHierarchicalParentToChildIndex;
using DictionaryHierarchicalParentToChildIndexPtr = std::shared_ptr<DictionaryHierarchicalParentToChildIndex>;
/** DictionaryKeyType provides IDictionary client information about
* which key type is supported by dictionary.
*
@ -228,10 +231,23 @@ public:
getDictionaryID().getNameForLogs());
}
/// Returns the parent-to-child index of the dictionary hierarchy.
/// This default implementation always throws NOT_IMPLEMENTED.
virtual DictionaryHierarchicalParentToChildIndexPtr getHierarchicalIndex() const
{
    throw Exception(
        ErrorCodes::NOT_IMPLEMENTED,
        "Method getHierarchicalIndex is not supported for {} dictionary.",
        getDictionaryID().getNameForLogs());
}
/// Bytes occupied by the hierarchical index; this default implementation reports zero.
virtual size_t getHierarchicalIndexBytesAllocated() const { return 0; }
virtual ColumnPtr getDescendants(
ColumnPtr key_column [[maybe_unused]],
const DataTypePtr & key_type [[maybe_unused]],
size_t level [[maybe_unused]]) const
size_t level [[maybe_unused]],
DictionaryHierarchicalParentToChildIndexPtr parent_to_child_index [[maybe_unused]]) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
"Method getDescendants is not supported for {} dictionary.",

View File

@ -16,6 +16,13 @@
#include <Storages/ExternalDataSourceConfiguration.h>
#include <Storages/MySQL/MySQLHelpers.h>
#include <Storages/MySQL/MySQLSettings.h>
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeString.h>
#include <IO/WriteBufferFromString.h>
#include <IO/WriteHelpers.h>
#include <Common/LocalDateTime.h>
#include <Common/logger_useful.h>
#include "readInvalidateQuery.h"
namespace DB
@ -118,15 +125,6 @@ void registerDictionarySourceMysql(DictionarySourceFactory & factory)
#if USE_MYSQL
# include <Columns/ColumnString.h>
# include <DataTypes/DataTypeString.h>
# include <IO/WriteBufferFromString.h>
# include <IO/WriteHelpers.h>
# include <Common/LocalDateTime.h>
# include <Common/logger_useful.h>
# include "readInvalidateQuery.h"
# include <mysqlxx/Exception.h>
# include <Core/Settings.h>
namespace DB
{

View File

@ -104,7 +104,7 @@ ColumnPtr RangeHashedDictionary<dictionary_key_type>::getColumn(
/// Cast range column to storage type
Columns modified_key_columns = key_columns;
auto range_storage_column = key_columns.back();
const ColumnPtr & range_storage_column = key_columns.back();
ColumnWithTypeAndName column_to_cast = {range_storage_column->convertToFullColumnIfConst(), key_types.back(), ""};
modified_key_columns.back() = castColumnAccurate(column_to_cast, dict_struct.range_min->type);
@ -314,7 +314,7 @@ ColumnUInt8::Ptr RangeHashedDictionary<dictionary_key_type>::hasKeys(const Colum
}
/// Cast range column to storage type
auto range_storage_column = key_columns.back();
const ColumnPtr & range_storage_column = key_columns.back();
ColumnWithTypeAndName column_to_cast = {range_storage_column->convertToFullColumnIfConst(), key_types.back(), ""};
auto range_column_updated = castColumnAccurate(column_to_cast, dict_struct.range_min->type);
auto key_columns_copy = key_columns;
@ -513,7 +513,7 @@ void RangeHashedDictionary<dictionary_key_type>::getItemsImpl(
size_t keys_found = 0;
auto range_column = key_columns.back();
const ColumnPtr & range_column = key_columns.back();
auto key_columns_copy = key_columns;
key_columns_copy.pop_back();
@ -984,7 +984,7 @@ Pipe RangeHashedDictionary<dictionary_key_type>::read(const Names & column_names
Columns result;
result.reserve(attribute_names_size);
auto key_column = key_columns.back();
const ColumnPtr & key_column = key_columns.back();
const auto * key_to_index_column = typeid_cast<const ColumnUInt64 *>(key_column.get());
if (!key_to_index_column)

View File

@ -290,6 +290,14 @@ void buildSingleAttribute(
attribute_element->appendChild(hierarchical_element);
}
if (dict_attr->bidirectional)
{
AutoPtr<Element> bidirectional_element(doc->createElement("bidirectional"));
AutoPtr<Text> bidirectional(doc->createTextNode("true"));
bidirectional_element->appendChild(bidirectional);
attribute_element->appendChild(bidirectional_element);
}
if (dict_attr->injective)
{
AutoPtr<Element> injective_element(doc->createElement("injective"));

View File

@ -151,13 +151,15 @@ TEST(HierarchyDictionariesUtils, getDescendants)
parent_to_child[1].emplace_back(3);
parent_to_child[2].emplace_back(4);
auto parent_to_child_index = std::make_shared<DictionaryHierarchicalParentToChildIndex>(parent_to_child);
PaddedPODArray<UInt64> keys = {0, 1, 2, 3, 4};
{
size_t keys_found;
auto result = DB::detail::getDescendants(
keys,
parent_to_child,
*parent_to_child_index,
DB::detail::GetAllDescendantsStrategy(),
keys_found);
@ -175,7 +177,7 @@ TEST(HierarchyDictionariesUtils, getDescendants)
size_t keys_found;
auto result = DB::detail::getDescendants(
keys,
parent_to_child,
*parent_to_child_index,
DB::detail::GetDescendantsAtSpecificLevelStrategy{1},
keys_found);
@ -195,13 +197,15 @@ TEST(HierarchyDictionariesUtils, getDescendants)
parent_to_child[1].emplace_back(2);
parent_to_child[2].emplace_back(1);
auto parent_to_child_index = std::make_shared<DictionaryHierarchicalParentToChildIndex>(parent_to_child);
PaddedPODArray<UInt64> keys = {1, 2, 3};
{
size_t keys_found;
auto result = DB::detail::getDescendants(
keys,
parent_to_child,
*parent_to_child_index,
DB::detail::GetAllDescendantsStrategy(),
keys_found);
@ -219,7 +223,7 @@ TEST(HierarchyDictionariesUtils, getDescendants)
size_t keys_found;
auto result = DB::detail::getDescendants(
keys,
parent_to_child,
*parent_to_child_index,
DB::detail::GetDescendantsAtSpecificLevelStrategy{1},
keys_found);

View File

@ -1,18 +0,0 @@
#pragma once
#include <Common/config.h>
#if USE_AZURE_BLOB_STORAGE
#include <Disks/IDiskRemote.h>
#include <azure/storage/blobs.hpp>
namespace DB
{
/// Returns a shared Azure blob container client.
/// Takes the configuration and the prefix of the configuration section to use;
/// presumably the client is built from the keys under `config_prefix` — confirm against the definition.
std::shared_ptr<Azure::Storage::Blobs::BlobContainerClient> getAzureBlobContainerClient(
const Poco::Util::AbstractConfiguration & config, const String & config_prefix);
}
#endif

View File

@ -1,168 +0,0 @@
#include <Disks/AzureBlobStorage/DiskAzureBlobStorage.h>
#if USE_AZURE_BLOB_STORAGE
#include <Disks/RemoteDisksCommon.h>
#include <Disks/IO/ReadBufferFromRemoteFSGather.h>
#include <Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/ReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/WriteIndirectBufferFromRemoteFS.h>
#include <Common/getRandomASCIIString.h>
namespace DB
{
namespace ErrorCodes
{
extern const int AZURE_BLOB_STORAGE_ERROR;
}
/// Stores the supplied values in the corresponding members without any validation.
DiskAzureBlobStorageSettings::DiskAzureBlobStorageSettings(
    UInt64 max_single_part_upload_size_,
    UInt64 min_bytes_for_seek_,
    int max_single_read_retries_,
    int max_single_download_retries_,
    int thread_pool_size_)
    : max_single_part_upload_size(max_single_part_upload_size_)
    , min_bytes_for_seek(min_bytes_for_seek_)
    , max_single_read_retries(max_single_read_retries_)
    , max_single_download_retries(max_single_download_retries_)
    , thread_pool_size(thread_pool_size_)
{
}
/// Creates an Azure Blob Storage disk.
///
/// @param name_ disk name, forwarded to IDiskRemote.
/// @param metadata_disk_ disk that keeps the local metadata files, forwarded to IDiskRemote.
/// @param blob_container_client_ client used for all blob operations of this disk.
/// @param settings_ initial settings; ownership is taken.
/// @param settings_getter_ callback used by applyNewSettings() to re-read the settings.
DiskAzureBlobStorage::DiskAzureBlobStorage(
    const String & name_,
    DiskPtr metadata_disk_,
    std::shared_ptr<Azure::Storage::Blobs::BlobContainerClient> blob_container_client_,
    SettingsPtr settings_,
    GetDiskSettings settings_getter_)
    /// The base class is initialized first, so `settings_` is still valid when its thread pool size is read.
    : IDiskRemote(name_, "", metadata_disk_, nullptr, "DiskAzureBlobStorage", settings_->thread_pool_size)
    /// By-value sink parameters are moved into the members to avoid an extra
    /// shared_ptr reference-count bump and a std::function copy.
    , blob_container_client(std::move(blob_container_client_))
    , current_settings(std::move(settings_))
    , settings_getter(std::move(settings_getter_))
{
}
/// Opens the file at `path` for reading.
///
/// The metadata of the file is read to find the remote objects that back it, and those objects are
/// exposed through a single gathering read buffer. Depending on `read_settings.remote_fs_method`
/// the result is either an asynchronous thread-pool reader or a synchronous reader wrapped into
/// SeekAvoidingReadBuffer. The two optional size arguments are not used.
std::unique_ptr<ReadBufferFromFileBase> DiskAzureBlobStorage::readFile(
    const String & path,
    const ReadSettings & read_settings,
    std::optional<size_t>,
    std::optional<size_t>) const
{
    /// One snapshot of the settings is used for the whole call, so that every value comes from the
    /// same settings version even if applyNewSettings() replaces them in the meantime.
    auto settings = current_settings.get();
    auto metadata = readMetadata(path);

    LOG_TEST(log, "Read from file by path: {}", backQuote(metadata_disk->getPath() + path));

    auto reader_impl = std::make_unique<ReadBufferFromAzureBlobStorageGather>(
        blob_container_client, metadata.remote_fs_root_path, metadata.remote_fs_objects,
        settings->max_single_read_retries, settings->max_single_download_retries, read_settings);

    if (read_settings.remote_fs_method == RemoteFSReadMethod::threadpool)
    {
        auto reader = getThreadPoolReader();
        return std::make_unique<AsynchronousReadIndirectBufferFromRemoteFS>(reader, read_settings, std::move(reader_impl));
    }
    else
    {
        auto buf = std::make_unique<ReadIndirectBufferFromRemoteFS>(std::move(reader_impl));
        return std::make_unique<SeekAvoidingReadBuffer>(std::move(buf), settings->min_bytes_for_seek);
    }
}
/// Creates a write buffer for the file at `path`.
/// The data is written to a new blob named `path` + "_" + 8 random ASCII characters. The callback
/// handed to the indirect buffer adds that blob, with the byte count it receives, to the file metadata.
std::unique_ptr<WriteBufferFromFileBase> DiskAzureBlobStorage::writeFile(
    const String & path,
    size_t buf_size,
    WriteMode mode,
    const WriteSettings &)
{
    /// NOTE: path contains the tmp_* prefix in the blob name
    auto blob_path = path + "_" + getRandomASCIIString(8);

    LOG_TRACE(log, "{} to file by path: {}. AzureBlob Storage path: {}",
        mode == WriteMode::Rewrite ? "Write" : "Append", backQuote(metadata_disk->getPath() + path), blob_path);

    const auto max_single_part_upload_size = current_settings.get()->max_single_part_upload_size;
    auto azure_buffer = std::make_unique<WriteBufferFromAzureBlobStorage>(
        blob_container_client, blob_path, max_single_part_upload_size, buf_size);

    auto create_metadata_callback = [this, path, mode, blob_path](size_t count)
    {
        auto add_blob = [blob_path, count](Metadata & metadata)
        {
            metadata.addObject(blob_path, count);
            return true;
        };
        readOrCreateUpdateAndStoreMetadata(path, mode, false, add_blob);
    };

    return std::make_unique<WriteIndirectBufferFromRemoteFS>(
        std::move(azure_buffer), std::move(create_metadata_callback), blob_path);
}
/// Reports the disk type as AzureBlobStorage.
DiskType DiskAzureBlobStorage::getType() const { return DiskType::AzureBlobStorage; }

/// This disk always reports itself as remote.
bool DiskAzureBlobStorage::isRemote() const { return true; }

/// This disk always reports support for zero-copy replication.
bool DiskAzureBlobStorage::supportZeroCopyReplication() const { return true; }
/// Returns true if a blob whose name is exactly `id` is present in the listing of the container.
/// The listing is requested with `id` as the prefix and a page size hint of one.
bool DiskAzureBlobStorage::checkUniqueId(const String & id) const
{
    Azure::Storage::Blobs::ListBlobsOptions blobs_list_options;
    blobs_list_options.Prefix = id;
    blobs_list_options.PageSizeHint = 1;

    auto blobs_list_response = blob_container_client->ListBlobs(blobs_list_options);

    /// Iterate over the response in place: copying the blob list is not needed for a read-only scan.
    const auto & blobs_list = blobs_list_response.Blobs;
    for (const auto & blob : blobs_list)
    {
        /// A prefix match is not enough, the name has to be equal to `id`.
        if (id == blob.Name)
            return true;
    }

    return false;
}
/// Deletes every blob listed in `paths`.
/// Throws AZURE_BLOB_STORAGE_ERROR when the service reports that a blob was not deleted.
/// An Azure storage exception is logged and then rethrown unchanged.
void DiskAzureBlobStorage::removeFromRemoteFS(const std::vector<String> & paths)
{
    for (const auto & path : paths)
    {
        try
        {
            const bool deleted = blob_container_client->DeleteBlob(path).Value.Deleted;
            if (deleted)
                continue;

            throw Exception(ErrorCodes::AZURE_BLOB_STORAGE_ERROR, "Failed to delete file in AzureBlob Storage: {}", path);
        }
        catch (const Azure::Storage::StorageException & e)
        {
            LOG_INFO(log, "Caught an error while deleting file {} : {}", path, e.Message);
            throw;
        }
    }
}
/// Re-reads the disk settings from the "storage_configuration.disks.<name>" section and installs them.
/// If the executor is an AsyncExecutor, its thread limit is updated from the new settings.
void DiskAzureBlobStorage::applyNewSettings(const Poco::Util::AbstractConfiguration & config, ContextPtr context, const String &, const DisksMap &)
{
    current_settings.set(settings_getter(config, "storage_configuration.disks." + name, context));

    auto * async_executor = dynamic_cast<AsyncExecutor *>(&getExecutor());
    if (async_executor)
        async_executor->setMaxThreads(current_settings.get()->thread_pool_size);
}
}
#endif

View File

@ -1,86 +0,0 @@
#pragma once
#include <Common/config.h>
#if USE_AZURE_BLOB_STORAGE
#include <Disks/IDiskRemote.h>
#include <IO/ReadBufferFromAzureBlobStorage.h>
#include <IO/WriteBufferFromAzureBlobStorage.h>
#include <IO/SeekAvoidingReadBuffer.h>
#include <azure/identity/managed_identity_credential.hpp>
#include <azure/storage/blobs.hpp>
namespace DB
{
/// Settings of an Azure Blob Storage disk.
struct DiskAzureBlobStorageSettings final
{
    /// All constructor parameters use the trailing-underscore convention, matching the definition.
    DiskAzureBlobStorageSettings(
        UInt64 max_single_part_upload_size_,
        UInt64 min_bytes_for_seek_,
        int max_single_read_retries_,
        int max_single_download_retries_,
        int thread_pool_size_);

    /// Passed to the Azure write buffer when a file is written.
    size_t max_single_part_upload_size; /// NOTE: on 32-bit machines it will be at most 4GB, but size_t is also used in BufferBase for offset
    /// Passed to SeekAvoidingReadBuffer for synchronous reads.
    UInt64 min_bytes_for_seek;
    /// Both retry limits are passed to the Azure gathering read buffer.
    size_t max_single_read_retries;
    size_t max_single_download_retries;
    /// Thread count given to the base disk and applied to the executor on settings reload.
    size_t thread_pool_size;
};
/// Remote disk backed by an Azure Blob Storage container.
/// File metadata is handled by the IDiskRemote base class on `metadata_disk_`; file contents are stored as blobs.
class DiskAzureBlobStorage final : public IDiskRemote
{
public:
using SettingsPtr = std::unique_ptr<DiskAzureBlobStorageSettings>;
/// Callback that produces fresh settings from a configuration, a configuration prefix and a context.
using GetDiskSettings = std::function<SettingsPtr(const Poco::Util::AbstractConfiguration &, const String, ContextPtr)>;
/// Takes ownership of `settings_`; `settings_getter_` is kept for later settings reloads.
DiskAzureBlobStorage(
const String & name_,
DiskPtr metadata_disk_,
std::shared_ptr<Azure::Storage::Blobs::BlobContainerClient> blob_container_client_,
SettingsPtr settings_,
GetDiskSettings settings_getter_);
/// Opens a file for reading; `read_hint` and `file_size` are part of the overridden interface.
std::unique_ptr<ReadBufferFromFileBase> readFile(
const String & path,
const ReadSettings & settings,
std::optional<size_t> read_hint,
std::optional<size_t> file_size) const override;
/// Opens a file for writing in Rewrite or Append mode.
std::unique_ptr<WriteBufferFromFileBase> writeFile(
const String & path,
size_t buf_size,
WriteMode mode,
const WriteSettings & settings) override;
DiskType getType() const override;
bool isRemote() const override;
bool supportZeroCopyReplication() const override;
/// Checks whether a blob named exactly `id` exists in the container.
bool checkUniqueId(const String & id) const override;
/// Deletes the given blobs from the container.
void removeFromRemoteFS(const std::vector<String> & paths) override;
/// Reloads the settings through `settings_getter`.
void applyNewSettings(const Poco::Util::AbstractConfiguration & config, ContextPtr context, const String &, const DisksMap &) override;
private:
/// client used to access the files in the Blob Storage cloud
std::shared_ptr<Azure::Storage::Blobs::BlobContainerClient> blob_container_client;
/// Current settings; replaced as a whole by applyNewSettings().
MultiVersion<DiskAzureBlobStorageSettings> current_settings;
/// Gets disk settings from context.
GetDiskSettings settings_getter;
};
}
#endif

View File

@ -211,9 +211,9 @@ void DiskDecorator::shutdown()
delegate->shutdown();
}
void DiskDecorator::startup()
void DiskDecorator::startup(ContextPtr context)
{
delegate->startup();
delegate->startup(context);
}
void DiskDecorator::applyNewSettings(const Poco::Util::AbstractConfiguration & config, ContextPtr context, const String & config_prefix, const DisksMap & map)

View File

@ -71,7 +71,7 @@ public:
void onFreeze(const String & path) override;
SyncGuardPtr getDirectorySyncGuard(const String & path) const override;
void shutdown() override;
void startup() override;
void startup(ContextPtr context) override;
void applyNewSettings(const Poco::Util::AbstractConfiguration & config, ContextPtr context, const String & config_prefix, const DisksMap & map) override;
String getCacheBasePath() const override { return delegate->getCacheBasePath(); }
std::vector<String> getRemotePaths(const String & path) const override { return delegate->getRemotePaths(path); }

View File

@ -494,7 +494,7 @@ DiskLocal::DiskLocal(
disk_checker = std::make_unique<DiskLocalCheckThread>(this, context, local_disk_check_period_ms);
}
void DiskLocal::startup()
void DiskLocal::startup(ContextPtr)
{
try
{
@ -682,7 +682,7 @@ void registerDiskLocal(DiskFactory & factory)
std::shared_ptr<IDisk> disk
= std::make_shared<DiskLocal>(name, path, keep_free_space_bytes, context, config.getUInt("local_disk_check_period_ms", 0));
disk->startup();
disk->startup(context);
return std::make_shared<DiskRestartProxy>(disk);
};
factory.registerDiskType("local", creator);

View File

@ -110,7 +110,7 @@ public:
bool isBroken() const override { return broken; }
void startup() override;
void startup(ContextPtr) override;
void shutdown() override;

View File

@ -5,6 +5,7 @@
namespace DB
{
namespace ErrorCodes
{
extern const int DEADLOCK_AVOIDED;
@ -329,7 +330,7 @@ void DiskRestartProxy::getRemotePathsRecursive(const String & path, std::vector<
return DiskDecorator::getRemotePathsRecursive(path, paths_map);
}
void DiskRestartProxy::restart()
void DiskRestartProxy::restart(ContextPtr context)
{
/// Speed up processing unhealthy requests.
DiskDecorator::shutdown();
@ -352,7 +353,7 @@ void DiskRestartProxy::restart()
LOG_INFO(log, "Restart lock acquired. Restarting disk {}", DiskDecorator::getName());
DiskDecorator::startup();
DiskDecorator::startup(context);
LOG_INFO(log, "Disk restarted {}", DiskDecorator::getName());
}

View File

@ -68,7 +68,7 @@ public:
std::vector<String> getRemotePaths(const String & path) const override;
void getRemotePathsRecursive(const String & path, std::vector<LocalPathWithRemotePaths> & paths_map) override;
void restart();
void restart(ContextPtr context);
private:
friend class RestartAwareReadBuffer;

View File

@ -9,8 +9,12 @@
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Disks/IDiskRemote.h>
#include <Disks/IDisk.h>
#include <Disks/ObjectStorages/IObjectStorage.h>
#include <IO/ReadBufferFromFile.h>
#include <Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/ReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/WriteIndirectBufferFromRemoteFS.h>
#include <Disks/IO/ReadBufferFromRemoteFSGather.h>
@ -173,7 +177,7 @@ std::unique_ptr<ReadBufferFromFileBase> DiskWebServer::readFile(const String & p
if (read_settings.remote_fs_method == RemoteFSReadMethod::threadpool)
{
auto reader = IDiskRemote::getThreadPoolReader();
auto reader = IObjectStorage::getThreadPoolReader();
return std::make_unique<AsynchronousReadIndirectBufferFromRemoteFS>(reader, read_settings, std::move(web_impl), min_bytes_for_seek);
}
else

View File

@ -1,10 +1,13 @@
#pragma once
#include <Disks/IDiskRemote.h>
#include <IO/WriteBufferFromFile.h>
#include <Core/UUID.h>
#include <set>
#include <Interpreters/Context_fwd.h>
#include <Disks/IDisk.h>
#include <IO/ReadBufferFromFile.h>
namespace DB
{

View File

@ -1,143 +0,0 @@
#include <Disks/HDFS/DiskHDFS.h>
#if USE_HDFS
#include <Disks/DiskLocal.h>
#include <Disks/RemoteDisksCommon.h>
#include <IO/SeekAvoidingReadBuffer.h>
#include <Storages/HDFS/WriteBufferFromHDFS.h>
#include <Storages/HDFS/HDFSCommon.h>
#include <Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/ReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/WriteIndirectBufferFromRemoteFS.h>
#include <Disks/IO/ReadBufferFromRemoteFSGather.h>
#include <boost/algorithm/string/predicate.hpp>
#include <Common/logger_useful.h>
#include <base/FnTraits.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int LOGICAL_ERROR;
}
/// Creates an HDFS disk rooted at `hdfs_root_path_`.
/// The HDFS builder and the filesystem handle are created here from the root path and `config_`.
DiskHDFS::DiskHDFS(
    const String & disk_name_,
    const String & hdfs_root_path_,
    SettingsPtr settings_,
    DiskPtr metadata_disk_,
    const Poco::Util::AbstractConfiguration & config_) :
    /// The base class is initialized first, while `settings_` has not been moved from yet.
    IDiskRemote(disk_name_, hdfs_root_path_, metadata_disk_, nullptr, "DiskHDFS", settings_->thread_pool_size),
    config(config_),
    hdfs_builder(createHDFSBuilder(hdfs_root_path_, config)),
    hdfs_fs(createHDFSFS(hdfs_builder.get())),
    settings(std::move(settings_))
{
}
/// Opens the file at `path` for reading.
/// The HDFS objects listed in the file metadata are read through one gathering buffer, which is
/// wrapped into an indirect buffer and then into SeekAvoidingReadBuffer.
/// The two optional size arguments are not used.
std::unique_ptr<ReadBufferFromFileBase> DiskHDFS::readFile(const String & path, const ReadSettings & read_settings, std::optional<size_t>, std::optional<size_t>) const
{
    auto metadata = readMetadata(path);

    LOG_TEST(log,
        "Read from file by path: {}. Existing HDFS objects: {}",
        backQuote(metadata_disk->getPath() + path), metadata.remote_fs_objects.size());

    auto hdfs_gather = std::make_unique<ReadBufferFromHDFSGather>(
        config, remote_fs_root_path, remote_fs_root_path, metadata.remote_fs_objects, read_settings);

    return std::make_unique<SeekAvoidingReadBuffer>(
        std::make_unique<ReadIndirectBufferFromRemoteFS>(std::move(hdfs_gather)),
        settings->min_bytes_for_seek);
}
/// Creates a write buffer for the file at `path`.
/// The data goes to a new, randomly named HDFS object under the disk root. The callback handed to
/// the indirect buffer adds that object, with the byte count it receives, to the file metadata.
std::unique_ptr<WriteBufferFromFileBase> DiskHDFS::writeFile(const String & path, size_t buf_size, WriteMode mode, const WriteSettings &)
{
    /// Path to store new HDFS object.
    std::string file_name = getRandomName();
    std::string hdfs_path = fs::path(remote_fs_root_path) / file_name;

    LOG_TRACE(log, "{} to file by path: {}. HDFS path: {}", mode == WriteMode::Rewrite ? "Write" : "Append",
        backQuote(metadata_disk->getPath() + path), hdfs_path);

    /// Single O_WRONLY in libhdfs adds O_TRUNC
    int open_flags = (mode == WriteMode::Rewrite) ? O_WRONLY : (O_WRONLY | O_APPEND);
    auto hdfs_buffer = std::make_unique<WriteBufferFromHDFS>(hdfs_path, config, settings->replication, buf_size, open_flags);

    auto create_metadata_callback = [this, path, mode, file_name](size_t count)
    {
        auto add_object = [file_name, count](Metadata & metadata)
        {
            metadata.addObject(file_name, count);
            return true;
        };
        readOrCreateUpdateAndStoreMetadata(path, mode, false, add_object);
    };

    return std::make_unique<WriteIndirectBufferFromRemoteFS>(
        std::move(hdfs_buffer), std::move(create_metadata_callback), hdfs_path);
}
void DiskHDFS::removeFromRemoteFS(const std::vector<String> & paths)
{
for (const auto & hdfs_path : paths)
{
const size_t begin_of_path = hdfs_path.find('/', hdfs_path.find("//") + 2);
/// Add path from root to file name
int res = hdfsDelete(hdfs_fs.get(), hdfs_path.substr(begin_of_path).c_str(), 0);
if (res == -1)
throw Exception(ErrorCodes::LOGICAL_ERROR, "HDFSDelete failed with path: " + hdfs_path);
}
}
/// Returns true when `hdfs_uri` starts with the disk root path and hdfsExists() returns 0 for its path part.
bool DiskHDFS::checkUniqueId(const String & hdfs_uri) const
{
    const bool belongs_to_disk = boost::algorithm::starts_with(hdfs_uri, remote_fs_root_path);
    if (!belongs_to_disk)
        return false;

    /// Take the part of the URI that starts at the first '/' found after "//".
    const size_t search_from = hdfs_uri.find("//") + 2;
    const size_t begin_of_path = hdfs_uri.find('/', search_from);
    const String remote_fs_object_path = hdfs_uri.substr(begin_of_path);

    return hdfsExists(hdfs_fs.get(), remote_fs_object_path.c_str()) == 0;
}
namespace
{

/// Builds the HDFS disk settings from the keys under `config_prefix`:
/// min_bytes_for_seek (default 1 MiB), thread_pool_size (default 16) and
/// objects_chunk_size_to_delete (default 1000). Replication is taken from `settings.hdfs_replication`.
std::unique_ptr<DiskHDFSSettings> getSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, const Settings & settings)
{
    const auto min_bytes_for_seek = config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024);
    const auto thread_pool_size = config.getInt(config_prefix + ".thread_pool_size", 16);
    const auto objects_chunk_size_to_delete = config.getInt(config_prefix + ".objects_chunk_size_to_delete", 1000);

    return std::make_unique<DiskHDFSSettings>(
        min_bytes_for_seek,
        thread_pool_size,
        objects_chunk_size_to_delete,
        settings.hdfs_replication);
}

}
/// Registers the "hdfs" disk type in the factory.
/// The creator reads the `endpoint` key, validates it (it must also end with '/'), prepares the
/// local metadata disk and constructs a DiskHDFS.
void registerDiskHDFS(DiskFactory & factory)
{
    auto create_hdfs_disk = [](
        const String & name,
        const Poco::Util::AbstractConfiguration & config,
        const String & config_prefix,
        ContextPtr context_,
        const DisksMap & /*map*/) -> DiskPtr
    {
        String uri{config.getString(config_prefix + ".endpoint")};
        checkHDFSURL(uri);

        const bool has_trailing_slash = uri.back() == '/';
        if (!has_trailing_slash)
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "HDFS path must ends with '/', but '{}' doesn't.", uri);

        auto metadata_disk = prepareForLocalMetadata(name, config, config_prefix, context_).second;
        auto disk_settings = getSettings(config, config_prefix, context_->getSettingsRef());

        return std::make_shared<DiskHDFS>(name, uri, std::move(disk_settings), metadata_disk, config);
    };

    factory.registerDiskType("hdfs", create_hdfs_disk);
}
}
#endif

View File

@ -1,84 +0,0 @@
#pragma once
#include <Common/config.h>
#if USE_HDFS
#include <Disks/IDiskRemote.h>
#include <Storages/HDFS/HDFSCommon.h>
#include <Core/UUID.h>
#include <memory>
namespace DB
{
/// Settings of an HDFS disk.
struct DiskHDFSSettings
{
    /// Seek threshold in bytes, handed to the seek-avoiding read buffer.
    size_t min_bytes_for_seek;
    /// Thread count handed to the base remote disk.
    int thread_pool_size;
    int objects_chunk_size_to_delete;
    /// Replication value handed to the HDFS write buffer.
    int replication;

    /// `min_bytes_for_seek_` is taken as size_t, the type of the member it initializes.
    /// With the previous `int` parameter a 64-bit configuration value above INT_MAX
    /// was narrowed on the way in and stored as a wrong threshold.
    DiskHDFSSettings(
        size_t min_bytes_for_seek_,
        int thread_pool_size_,
        int objects_chunk_size_to_delete_,
        int replication_)
        : min_bytes_for_seek(min_bytes_for_seek_)
        , thread_pool_size(thread_pool_size_)
        , objects_chunk_size_to_delete(objects_chunk_size_to_delete_)
        , replication(replication_) {}
};
/**
* Storage for persisting data in HDFS and metadata on the local disk.
* Files are represented by file in local filesystem (clickhouse_root/disks/disk_name/path/to/file)
* that contains HDFS object key with actual data.
*/
class DiskHDFS final : public IDiskRemote
{
public:
using SettingsPtr = std::unique_ptr<DiskHDFSSettings>;
/// Takes ownership of `settings_`. `config_` is stored by reference (see the `config` member),
/// so it has to stay alive for as long as the disk is used.
DiskHDFS(
const String & disk_name_,
const String & hdfs_root_path_,
SettingsPtr settings_,
DiskPtr metadata_disk_,
const Poco::Util::AbstractConfiguration & config_);
DiskType getType() const override { return DiskType::HDFS; }
bool isRemote() const override { return true; }
bool supportZeroCopyReplication() const override { return true; }
/// Opens a file for reading; `read_hint` and `file_size` are part of the overridden interface.
std::unique_ptr<ReadBufferFromFileBase> readFile(
const String & path,
const ReadSettings & settings,
std::optional<size_t> read_hint,
std::optional<size_t> file_size) const override;
/// Opens a file for writing in Rewrite or Append mode.
std::unique_ptr<WriteBufferFromFileBase> writeFile(const String & path, size_t buf_size, WriteMode mode, const WriteSettings & settings) override;
/// Deletes the given objects from HDFS.
void removeFromRemoteFS(const std::vector<String> & paths) override;
/// Checks that the file exists and ClickHouse has access to it.
/// Overridden in remote disks.
/// Required for a remote disk to ensure that a replica has access to data written by another node.
bool checkUniqueId(const String & hdfs_uri) const override;
private:
/// Name of a new HDFS object: a freshly generated UUID (v4) converted to string.
String getRandomName() { return toString(UUIDHelpers::generateV4()); }
/// Non-owning reference to the configuration passed to the constructor.
const Poco::Util::AbstractConfiguration & config;
/// NOTE: `hdfs_builder` is initialized from `config` and `hdfs_fs` from `hdfs_builder`, so the declaration order of these members matters.
HDFSBuilderWrapper hdfs_builder;
HDFSFSPtr hdfs_fs;
SettingsPtr settings;
};
}
#endif

View File

@ -10,6 +10,8 @@
#include <Disks/DiskType.h>
#include <IO/ReadSettings.h>
#include <IO/WriteSettings.h>
#include <Disks/ObjectStorages/IObjectStorage.h>
#include <Disks/WriteMode.h>
#include <memory>
#include <mutex>
@ -48,14 +50,6 @@ class ReadBufferFromFileBase;
class WriteBufferFromFileBase;
class MMappedFileCache;
/**
 * Selects how a file is opened for writing.
 */
enum class WriteMode
{
    Rewrite,
    Append,
};
/**
* Provide interface for reservation.
@ -289,14 +283,14 @@ public:
/// Whether the disk is read-only. This default implementation always returns false.
virtual bool isReadOnly() const { return false; }
/// Check if disk is broken. Broken disks will have 0 space and cannot be used.
/// This default implementation always returns false.
virtual bool isBroken() const { return false; }
/// Invoked when Global Context is shutdown.
/// The default implementation does nothing.
virtual void shutdown() {}
/// Performs action on disk startup.
/// Both overloads below do nothing by default.
virtual void startup() {}
virtual void startup(ContextPtr) {}
/// Return a unique string for the file; overridden in IDiskRemote.
/// Required to distinguish different copies of the same part on a remote disk.

View File

@ -1,708 +0,0 @@
#include <Disks/IDiskRemote.h>
#include "Disks/DiskFactory.h"
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromFile.h>
#include <IO/WriteHelpers.h>
#include <Common/createHardLink.h>
#include <Common/quoteString.h>
#include <Common/logger_useful.h>
#include <Common/checkStackSize.h>
#include <boost/algorithm/string.hpp>
#include <Common/filesystemHelpers.h>
#include <Disks/IO/ThreadPoolRemoteFSReader.h>
#include <Common/FileCache.h>
namespace DB
{
/// Error codes referenced in this file. They are only declared here (`extern`);
/// their definitions are not part of this file.
namespace ErrorCodes
{
extern const int INCORRECT_DISK_INDEX;
extern const int UNKNOWN_FORMAT;
extern const int FILE_ALREADY_EXISTS;
extern const int PATH_ACCESS_DENIED;
extern const int FILE_DOESNT_EXIST;
extern const int BAD_FILE_TYPE;
}
/// Builds a Metadata object for `metadata_file_path_` and fills it by calling load().
IDiskRemote::Metadata IDiskRemote::Metadata::readMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_)
{
    Metadata loaded(remote_fs_root_path_, metadata_disk_, metadata_file_path_);
    loaded.load();
    return loaded;
}
/// Builds a new Metadata object (nothing is loaded) and immediately writes it with save(sync).
IDiskRemote::Metadata IDiskRemote::Metadata::createAndStoreMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync)
{
    Metadata fresh(remote_fs_root_path_, metadata_disk_, metadata_file_path_);
    fresh.save(sync);
    return fresh;
}
/// Loads the metadata, passes it to `updater`, and saves it back only when `updater` returns true.
IDiskRemote::Metadata IDiskRemote::Metadata::readUpdateAndStoreMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, IDiskRemote::MetadataUpdater updater)
{
    Metadata metadata(remote_fs_root_path_, metadata_disk_, metadata_file_path_);
    metadata.load();

    const bool needs_save = updater(metadata);
    if (needs_save)
        metadata.save(sync);

    return metadata;
}
/// Builds a new Metadata object (nothing is loaded), applies `updater` to it and always saves it;
/// the value returned by `updater` is not consulted.
IDiskRemote::Metadata IDiskRemote::Metadata::createUpdateAndStoreMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, IDiskRemote::MetadataUpdater updater)
{
    Metadata fresh(remote_fs_root_path_, metadata_disk_, metadata_file_path_);
    updater(fresh);
    fresh.save(sync);
    return fresh;
}
/// Loads the metadata, applies `updater` (saving when it returns true), then removes the
/// metadata file from the metadata disk. The in-memory object is returned to the caller.
IDiskRemote::Metadata IDiskRemote::Metadata::readUpdateStoreMetadataAndRemove(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, IDiskRemote::MetadataUpdater updater)
{
    Metadata metadata(remote_fs_root_path_, metadata_disk_, metadata_file_path_);
    metadata.load();

    const bool needs_save = updater(metadata);
    if (needs_save)
        metadata.save(sync);

    metadata_disk_->removeFile(metadata_file_path_);
    return metadata;
}
/// Creates and stores new metadata when `overwrite` is set or the metadata file is absent.
/// Otherwise reads the existing metadata and returns it, throwing PATH_ACCESS_DENIED
/// if that metadata is marked read-only.
IDiskRemote::Metadata IDiskRemote::Metadata::createAndStoreMetadataIfNotExists(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, bool overwrite)
{
    const bool must_create = overwrite || !metadata_disk_->exists(metadata_file_path_);
    if (must_create)
        return createAndStoreMetadata(remote_fs_root_path_, metadata_disk_, metadata_file_path_, sync);

    auto existing = readMetadata(remote_fs_root_path_, metadata_disk_, metadata_file_path_);
    if (existing.read_only)
        throw Exception("File is read-only: " + metadata_file_path_, ErrorCodes::PATH_ACCESS_DENIED);

    return existing;
}
void IDiskRemote::Metadata::load()
{
const ReadSettings read_settings;
auto buf = metadata_disk->readFile(metadata_file_path, read_settings, 1024); /* reasonable buffer size for small file */
UInt32 version;
readIntText(version, *buf);
if (version < VERSION_ABSOLUTE_PATHS || version > VERSION_READ_ONLY_FLAG)
throw Exception(
ErrorCodes::UNKNOWN_FORMAT,
"Unknown metadata file version. Path: {}. Version: {}. Maximum expected version: {}",
metadata_disk->getPath() + metadata_file_path, toString(version), toString(VERSION_READ_ONLY_FLAG));
assertChar('\n', *buf);
UInt32 remote_fs_objects_count;
readIntText(remote_fs_objects_count, *buf);
assertChar('\t', *buf);
readIntText(total_size, *buf);
assertChar('\n', *buf);
remote_fs_objects.resize(remote_fs_objects_count);
for (size_t i = 0; i < remote_fs_objects_count; ++i)
{
String remote_fs_object_path;
size_t remote_fs_object_size;
readIntText(remote_fs_object_size, *buf);
assertChar('\t', *buf);
readEscapedString(remote_fs_object_path, *buf);
if (version == VERSION_ABSOLUTE_PATHS)
{
if (!remote_fs_object_path.starts_with(remote_fs_root_path))
throw Exception(ErrorCodes::UNKNOWN_FORMAT,
"Path in metadata does not correspond to root path. Path: {}, root path: {}, disk path: {}",
remote_fs_object_path, remote_fs_root_path, metadata_disk->getPath());
remote_fs_object_path = remote_fs_object_path.substr(remote_fs_root_path.size());
}
assertChar('\n', *buf);
remote_fs_objects[i].relative_path = remote_fs_object_path;
remote_fs_objects[i].bytes_size = remote_fs_object_size;
}
readIntText(ref_count, *buf);
assertChar('\n', *buf);
if (version >= VERSION_READ_ONLY_FLAG)
{
readBoolText(read_only, *buf);
assertChar('\n', *buf);
}
}
/// Only remembers the remote root path, the metadata file path and the metadata disk.
/// Nothing is read from or written to disk here: loading and saving are done separately
/// through load() / save(), as the static factory functions of Metadata do.
IDiskRemote::Metadata::Metadata(
const String & remote_fs_root_path_,
DiskPtr metadata_disk_,
const String & metadata_file_path_)
: remote_fs_root_path(remote_fs_root_path_)
, metadata_file_path(metadata_file_path_)
, metadata_disk(metadata_disk_)
{
}
/// Appends an object entry (`path`, `size`) to the object list and adds `size` to the total size.
void IDiskRemote::Metadata::addObject(const String & path, size_t size)
{
    total_size += size;
    remote_fs_objects.emplace_back(path, size);
}
/// Serializes this metadata into `buf` as text. The version written is always
/// VERSION_RELATIVE_PATHS, followed by the object count and total size, one line per object,
/// the reference count and the read-only flag. The buffer is finalized, and additionally
/// synced when `sync` is set.
void IDiskRemote::Metadata::saveToBuffer(WriteBuffer & buf, bool sync)
{
    /// Header: format version.
    writeIntText(VERSION_RELATIVE_PATHS, buf);
    writeChar('\n', buf);

    /// Number of objects and their total size.
    writeIntText(remote_fs_objects.size(), buf);
    writeChar('\t', buf);
    writeIntText(total_size, buf);
    writeChar('\n', buf);

    /// One "<size>\t<escaped path>" line per object.
    for (const auto & [object_path, object_size] : remote_fs_objects)
    {
        writeIntText(object_size, buf);
        writeChar('\t', buf);
        writeEscapedString(object_path, buf);
        writeChar('\n', buf);
    }

    writeIntText(ref_count, buf);
    writeChar('\n', buf);

    writeBoolText(read_only, buf);
    writeChar('\n', buf);

    buf.finalize();
    if (sync)
        buf.sync();
}
/// Writes the metadata to its file on the metadata disk.
/// Fsync metadata file if 'sync' flag is set.
void IDiskRemote::Metadata::save(bool sync)
{
    auto out = metadata_disk->writeFile(metadata_file_path, 1024);
    saveToBuffer(*out, sync);
}
std::string IDiskRemote::Metadata::serializeToString()
{
WriteBufferFromOwnString write_buf;
saveToBuffer(write_buf, false);
return write_buf.str();
}
/// Reads the metadata of `path` without taking any lock itself; the unnamed shared_lock
/// parameter is not used inside the function.
IDiskRemote::Metadata IDiskRemote::readMetadataUnlocked(const String & path, std::shared_lock<std::shared_mutex> &) const
{
    auto metadata = Metadata::readMetadata(remote_fs_root_path, metadata_disk, path);
    return metadata;
}
/// Reads the metadata of `path` while holding a shared lock on `metadata_mutex`.
IDiskRemote::Metadata IDiskRemote::readMetadata(const String & path) const
{
    std::shared_lock shared_guard(metadata_mutex);
    return readMetadataUnlocked(path, shared_guard);
}
/// Read-modify-write of the metadata of `path`, performed under an exclusive lock on `metadata_mutex`.
IDiskRemote::Metadata IDiskRemote::readUpdateAndStoreMetadata(const String & path, bool sync, IDiskRemote::MetadataUpdater updater)
{
    std::unique_lock exclusive_guard(metadata_mutex);
    return Metadata::readUpdateAndStoreMetadata(remote_fs_root_path, metadata_disk, path, sync, updater);
}
/// Loads, updates and then removes the metadata file of `path`, all under an exclusive lock
/// on `metadata_mutex`.
IDiskRemote::Metadata IDiskRemote::readUpdateStoreMetadataAndRemove(const String & path, bool sync, IDiskRemote::MetadataUpdater updater)
{
    std::unique_lock exclusive_guard(metadata_mutex);
    return Metadata::readUpdateStoreMetadataAndRemove(remote_fs_root_path, metadata_disk, path, sync, updater);
}
/// Creates new metadata for `path` (when `mode` is WriteMode::Rewrite or no metadata file exists yet)
/// or loads the existing one, applies `updater` and stores the result.
///  - create path: the metadata is always saved, whatever `updater` returns;
///  - read path: the metadata is saved only when `updater` returns true.
/// The exclusive lock on `metadata_mutex` is held for the whole operation, including the existence
/// check, so that the read-modify-write path is serialized against other metadata writers in the
/// same way as the member readUpdateAndStoreMetadata(). The static Metadata helpers called below
/// do not take `metadata_mutex` themselves.
IDiskRemote::Metadata IDiskRemote::readOrCreateUpdateAndStoreMetadata(const String & path, WriteMode mode, bool sync, IDiskRemote::MetadataUpdater updater)
{
    std::unique_lock lock(metadata_mutex);

    if (mode == WriteMode::Rewrite || !metadata_disk->exists(path))
        return Metadata::createUpdateAndStoreMetadata(remote_fs_root_path, metadata_disk, path, sync, updater);

    return Metadata::readUpdateAndStoreMetadata(remote_fs_root_path, metadata_disk, path, sync, updater);
}
/// Creates and stores empty metadata for `path` using this disk's root path and metadata disk.
IDiskRemote::Metadata IDiskRemote::createAndStoreMetadata(const String & path, bool sync)
{
    auto created = Metadata::createAndStoreMetadata(remote_fs_root_path, metadata_disk, path, sync);
    return created;
}
/// Creates metadata for `path`, applies `updater` and stores it, using this disk's root path
/// and metadata disk.
IDiskRemote::Metadata IDiskRemote::createUpdateAndStoreMetadata(const String & path, bool sync, IDiskRemote::MetadataUpdater updater)
{
    auto created = Metadata::createUpdateAndStoreMetadata(remote_fs_root_path, metadata_disk, path, sync, updater);
    return created;
}
std::unordered_map<String, String> IDiskRemote::getSerializedMetadata(const std::vector<std::string> & file_paths) const
{
std::unordered_map<String, String> metadatas;
std::shared_lock lock(metadata_mutex);
for (const auto & path : file_paths)
{
IDiskRemote::Metadata metadata = readMetadataUnlocked(path, lock);
metadata.ref_count = 0;
metadatas[path] = metadata.serializeToString();
}
return metadatas;
}
/// Removes the metadata file at `path`.
/// When its reference count is 0, the remote paths of all its objects are appended to
/// `paths_to_remove` (and evicted from the cache when one is configured); otherwise the
/// reference count is decremented and saved before the file is removed.
/// Throws FILE_DOESNT_EXIST / BAD_FILE_TYPE when `path` is missing or not a regular file.
/// A metadata file that fails with UNKNOWN_FORMAT is removed forcibly with a warning.
void IDiskRemote::removeMetadata(const String & path, std::vector<String> & paths_to_remove)
{
    LOG_TRACE(log, "Remove file by path: {}", backQuote(metadata_disk->getPath() + path));

    if (!metadata_disk->exists(path))
        throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Metadata path '{}' doesn't exist", path);

    if (!metadata_disk->isFile(path))
        throw Exception(ErrorCodes::BAD_FILE_TYPE, "Path '{}' is not a regular file", path);

    try
    {
        auto metadata_updater = [&paths_to_remove, this] (Metadata & metadata)
        {
            /// Still referenced: decrement number of references, save metadata and delete hardlink.
            if (metadata.ref_count != 0)
            {
                --metadata.ref_count;
                return true;
            }

            /// No references left: collect the remote objects for deletion, nothing to save.
            for (const auto & [object_path, _] : metadata.remote_fs_objects)
            {
                paths_to_remove.push_back(remote_fs_root_path + object_path);

                if (cache)
                {
                    auto cache_key = cache->hash(object_path);
                    cache->remove(cache_key);
                }
            }

            return false;
        };

        readUpdateStoreMetadataAndRemove(path, false, metadata_updater);
        /// If there is no references - delete content from remote FS.
    }
    catch (const Exception & e)
    {
        if (e.code() != ErrorCodes::UNKNOWN_FORMAT)
            throw;

        /// If it's impossible to read meta - just remove it from FS.
        LOG_WARNING(log,
            "Metadata file {} can't be read by reason: {}. Removing it forcibly.",
            backQuote(path), e.nested() ? e.nested()->message() : e.message());
        metadata_disk->removeFile(path);
    }
}
/// Removes the metadata at `path`: a single file is handled by removeMetadata(), a directory
/// is walked recursively and then removed. Remote paths to delete are collected per metadata
/// file path in `paths_to_remove`.
void IDiskRemote::removeMetadataRecursive(const String & path, std::unordered_map<String, std::vector<String>> & paths_to_remove)
{
    checkStackSize(); /// This is needed to prevent stack overflow in case of cyclic symlinks.

    if (metadata_disk->isFile(path))
    {
        removeMetadata(path, paths_to_remove[path]);
        return;
    }

    for (auto entry = iterateDirectory(path); entry->isValid(); entry->next())
        removeMetadataRecursive(entry->path(), paths_to_remove);

    metadata_disk->removeDirectory(path);
}
std::vector<String> IDiskRemote::getRemotePaths(const String & local_path) const
{
auto metadata = readMetadata(local_path);
std::vector<String> remote_paths;
for (const auto & [remote_path, _] : metadata.remote_fs_objects)
remote_paths.push_back(fs::path(metadata.remote_fs_root_path) / remote_path);
return remote_paths;
}
void IDiskRemote::getRemotePathsRecursive(const String & local_path, std::vector<LocalPathWithRemotePaths> & paths_map)
{
/// Protect against concurrent delition of files (for example because of a merge).
if (metadata_disk->isFile(local_path))
{
try
{
paths_map.emplace_back(local_path, getRemotePaths(local_path));
}
catch (const Exception & e)
{
if (e.code() == ErrorCodes::FILE_DOESNT_EXIST)
return;
throw;
}
}
else
{
DiskDirectoryIteratorPtr it;
try
{
it = iterateDirectory(local_path);
}
catch (const fs::filesystem_error & e)
{
if (e.code() == std::errc::no_such_file_or_directory)
return;
throw;
}
for (; it->isValid(); it->next())
IDiskRemote::getRemotePathsRecursive(fs::path(local_path) / it->name(), paths_map);
}
}
/// Returns the single disk of this reservation; any index other than 0 throws INCORRECT_DISK_INDEX.
DiskPtr DiskRemoteReservation::getDisk(size_t i) const
{
    const bool is_first = (i == 0);
    if (!is_first)
        throw Exception("Can't use i != 0 with single disk reservation", ErrorCodes::INCORRECT_DISK_INDEX);

    return disk;
}
/// Changes the size of this reservation to `new_size`, adjusting the disk's reserved byte
/// counter under the disk's reservation mutex.
void DiskRemoteReservation::update(UInt64 new_size)
{
    std::lock_guard guard(disk->reservation_mutex);

    /// Take the old amount out of the disk counter, then account for the new one.
    disk->reserved_bytes -= size;
    size = new_size;
    disk->reserved_bytes += size;
}
/// Releases the reservation: subtracts its size from the disk's reserved bytes and decrements
/// the reservation count. Inconsistent counters are logged instead of underflowing, and no
/// exception leaves the destructor.
DiskRemoteReservation::~DiskRemoteReservation()
{
    try
    {
        std::lock_guard guard(disk->reservation_mutex);

        if (disk->reserved_bytes >= size)
        {
            disk->reserved_bytes -= size;
        }
        else
        {
            disk->reserved_bytes = 0;
            LOG_ERROR(disk->log, "Unbalanced reservations size for disk '{}'.", disk->getName());
        }

        if (disk->reservation_count != 0)
            --disk->reservation_count;
        else
            LOG_ERROR(disk->log, "Unbalanced reservation count for disk '{}'.", disk->getName());
    }
    catch (...)
    {
        tryLogCurrentException(__PRETTY_FUNCTION__);
    }
}
/// Initializes the common state of a remote disk.
/// The IDisk base receives a new AsyncExecutor built from `log_name_` and `thread_pool_size`;
/// the logger is obtained by `log_name_`; the remaining arguments are stored in the members
/// of the same name. The constructor body itself does nothing.
IDiskRemote::IDiskRemote(
const String & name_,
const String & remote_fs_root_path_,
DiskPtr metadata_disk_,
FileCachePtr cache_,
const String & log_name_,
size_t thread_pool_size)
: IDisk(std::make_unique<AsyncExecutor>(log_name_, thread_pool_size))
, log(&Poco::Logger::get(log_name_))
, name(name_)
, remote_fs_root_path(remote_fs_root_path_)
, metadata_disk(metadata_disk_)
, cache(cache_)
{
}
/// Returns the base path of the cache, or an empty string when no cache is configured.
String IDiskRemote::getCacheBasePath() const
{
    if (!cache)
        return "";

    return cache->getBasePath();
}
/// A path exists on this disk when it exists on the metadata disk.
bool IDiskRemote::exists(const String & path) const
{
    const bool present = metadata_disk->exists(path);
    return present;
}
/// Answers whether `path` is a file by asking the metadata disk.
bool IDiskRemote::isFile(const String & path) const
{
    const bool is_regular_file = metadata_disk->isFile(path);
    return is_regular_file;
}
/// Creates a file by storing new metadata for `path` (without sync).
void IDiskRemote::createFile(const String & path)
{
    constexpr bool sync = false;
    createAndStoreMetadata(path, sync);
}
/// Returns the total size recorded in the metadata of `path`.
size_t IDiskRemote::getFileSize(const String & path) const
{
    const auto file_metadata = readMetadata(path);
    return file_metadata.total_size;
}
/// Moves the metadata file from `from_path` to `to_path`.
/// Throws FILE_ALREADY_EXISTS when the destination is already present.
void IDiskRemote::moveFile(const String & from_path, const String & to_path)
{
    const bool destination_taken = exists(to_path);
    if (destination_taken)
        throw Exception("File already exists: " + to_path, ErrorCodes::FILE_ALREADY_EXISTS);

    metadata_disk->moveFile(from_path, to_path);
}
void IDiskRemote::replaceFile(const String & from_path, const String & to_path)
{
if (exists(to_path))
{
const String tmp_path = to_path + ".old";
moveFile(to_path, tmp_path);
moveFile(from_path, to_path);
removeFile(tmp_path);
}
else
moveFile(from_path, to_path);
}
/// Removes the metadata of `path` and, unless `delete_metadata_only` is set, the remote
/// objects that removeMetadata() reported for deletion.
void IDiskRemote::removeSharedFile(const String & path, bool delete_metadata_only)
{
    std::vector<String> remote_objects;
    removeMetadata(path, remote_objects);

    if (delete_metadata_only)
        return;

    removeFromRemoteFS(remote_objects);
}
/// Same as removing a shared file, but does nothing when `path` is absent on the metadata disk.
void IDiskRemote::removeSharedFileIfExists(const String & path, bool delete_metadata_only)
{
    std::vector<String> remote_objects;

    if (!metadata_disk->exists(path))
        return;

    removeMetadata(path, remote_objects);

    if (!delete_metadata_only)
        removeFromRemoteFS(remote_objects);
}
void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only)
{
std::unordered_map<String, std::vector<String>> paths_to_remove;
for (const auto & file : files)
{
bool skip = file.if_exists && !metadata_disk->exists(file.path);
if (!skip)
removeMetadata(file.path, paths_to_remove[file.path]);
}
if (!keep_all_batch_data)
{
std::vector<String> remove_from_remote;
for (auto && [path, remote_paths] : paths_to_remove)
{
if (!file_names_remove_metadata_only.contains(fs::path(path).filename()))
remove_from_remote.insert(remove_from_remote.end(), remote_paths.begin(), remote_paths.end());
}
removeFromRemoteFS(remove_from_remote);
}
}
void IDiskRemote::removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only)
{
std::unordered_map<String, std::vector<String>> paths_to_remove;
removeMetadataRecursive(path, paths_to_remove);
if (!keep_all_batch_data)
{
std::vector<String> remove_from_remote;
for (auto && [local_path, remote_paths] : paths_to_remove)
{
if (!file_names_remove_metadata_only.contains(fs::path(local_path).filename()))
remove_from_remote.insert(remove_from_remote.end(), remote_paths.begin(), remote_paths.end());
}
removeFromRemoteFS(remove_from_remote);
}
}
/// Marks the file at `path` as read-only by setting the flag in its metadata and saving it.
void IDiskRemote::setReadOnly(const String & path)
{
    /// We should store read only flag inside metadata file (instead of using FS flag),
    /// because we modify metadata file when create hard-links from it.
    auto mark_read_only = [] (Metadata & metadata)
    {
        metadata.read_only = true;
        return true;
    };

    readUpdateAndStoreMetadata(path, false, mark_read_only);
}
/// Answers whether `path` is a directory by asking the metadata disk.
bool IDiskRemote::isDirectory(const String & path) const
{
    const bool is_dir = metadata_disk->isDirectory(path);
    return is_dir;
}
/// Forwards directory creation to the metadata disk.
void IDiskRemote::createDirectory(const String & path)
{
    metadata_disk->createDirectory(path);
}
/// Forwards createDirectories() to the metadata disk.
void IDiskRemote::createDirectories(const String & path)
{
    metadata_disk->createDirectories(path);
}
/// Removes every entry of the directory `path` that is a file; other entries are left untouched.
void IDiskRemote::clearDirectory(const String & path)
{
    for (auto entry = iterateDirectory(path); entry->isValid(); entry->next())
    {
        if (!isFile(entry->path()))
            continue;

        removeFile(entry->path());
    }
}
/// Forwards directory removal to the metadata disk.
void IDiskRemote::removeDirectory(const String & path)
{
    metadata_disk->removeDirectory(path);
}
/// Returns the metadata disk's directory iterator for `path`.
DiskDirectoryIteratorPtr IDiskRemote::iterateDirectory(const String & path)
{
    auto dir_iterator = metadata_disk->iterateDirectory(path);
    return dir_iterator;
}
/// Appends the name of every entry of the directory `path` to `file_names`.
void IDiskRemote::listFiles(const String & path, std::vector<String> & file_names)
{
    auto entry = iterateDirectory(path);
    while (entry->isValid())
    {
        file_names.push_back(entry->name());
        entry->next();
    }
}
/// Sets the modification time of `path` on the metadata disk.
void IDiskRemote::setLastModified(const String & path, const Poco::Timestamp & timestamp)
{
    metadata_disk->setLastModified(path, timestamp);
}
/// Returns the modification time of `path` as reported by the metadata disk.
Poco::Timestamp IDiskRemote::getLastModified(const String & path)
{
    auto modified_at = metadata_disk->getLastModified(path);
    return modified_at;
}
/// Creates a hard link `dst_path` to `src_path`: the reference count stored in the source
/// metadata is incremented and saved first, then the metadata file itself is hard-linked.
void IDiskRemote::createHardLink(const String & src_path, const String & dst_path)
{
    auto add_reference = [] (Metadata & metadata)
    {
        metadata.ref_count++;
        return true;
    };
    readUpdateAndStoreMetadata(src_path, false, add_reference);

    /// Create FS hardlink to metadata file.
    metadata_disk->createHardLink(src_path, dst_path);
}
/// Tries to reserve `bytes` on this disk. Returns an empty pointer when tryReserve() fails,
/// otherwise a DiskRemoteReservation holding this disk, the reserved size and the
/// unreserved space reported by tryReserve().
ReservationPtr IDiskRemote::reserve(UInt64 bytes)
{
    auto space_left = tryReserve(bytes);
    if (!space_left)
        return {};

    auto self = std::static_pointer_cast<IDiskRemote>(shared_from_this());
    return std::make_unique<DiskRemoteReservation>(self, bytes, *space_left);
}
/// Attempts to reserve `bytes` under the reservation mutex.
/// Returns the unreserved space remaining after the reservation, or an empty optional when
/// there is not enough unreserved space. A zero-byte request always succeeds and only
/// increments the reservation count.
std::optional<UInt64> IDiskRemote::tryReserve(UInt64 bytes)
{
    std::lock_guard guard(reservation_mutex);

    auto available = getAvailableSpace();
    /// Clamp so that the subtraction cannot go below zero.
    UInt64 unreserved = available - std::min(available, reserved_bytes);

    if (bytes == 0)
    {
        LOG_TRACE(log, "Reserving 0 bytes on remote_fs disk {}", backQuote(name));
        ++reservation_count;
        return {unreserved};
    }

    if (unreserved < bytes)
        return {};

    LOG_TRACE(log, "Reserving {} on disk {}, having unreserved {}.",
        ReadableSize(bytes), backQuote(name), ReadableSize(unreserved));
    ++reservation_count;
    reserved_bytes += bytes;
    return {unreserved - bytes};
}
/// Returns the remote root path joined with the relative path of the first object listed in
/// the metadata of `path`, or an empty string when the metadata lists no objects.
String IDiskRemote::getUniqueId(const String & path) const
{
    LOG_TRACE(log, "Remote path: {}, Path: {}", remote_fs_root_path, path);

    auto file_metadata = readMetadata(path);
    if (file_metadata.remote_fs_objects.empty())
        return String{};

    return file_metadata.remote_fs_root_path + file_metadata.remote_fs_objects[0].relative_path;
}
/// Returns the reader held in a function-local static: a ThreadPoolRemoteFSReader
/// constructed with a pool size of 50 and a queue size of 1000000.
AsynchronousReaderPtr IDiskRemote::getThreadPoolReader()
{
    constexpr size_t reader_pool_size = 50;
    constexpr size_t reader_queue_size = 1000000;

    static AsynchronousReaderPtr shared_reader
        = std::make_shared<ThreadPoolRemoteFSReader>(reader_pool_size, reader_queue_size);
    return shared_reader;
}
/// Returns the reference count stored in the metadata of `path`.
UInt32 IDiskRemote::getRefCount(const String & path) const
{
    const auto file_metadata = readMetadata(path);
    return file_metadata.ref_count;
}
/// Returns the ThreadPool held in a function-local static, constructed with
/// (100, 100, 1000000) as its arguments.
ThreadPool & IDiskRemote::getThreadPoolWriter()
{
    constexpr size_t writer_pool_size = 100;
    constexpr size_t writer_queue_size = 1000000;

    static ThreadPool shared_writer(writer_pool_size, writer_pool_size, writer_queue_size);
    return shared_writer;
}
}

Some files were not shown because too many files have changed in this diff Show More