Merge branch 'ClickHouse:master' into hive_style_partitioning

Commit e646713122 by Yarik Briukhovetskyi, 2024-07-07 21:55:39 +02:00, committed by GitHub.
244 changed files with 5457 additions and 10530 deletions

View File

@ -40,8 +40,6 @@ Every month we get together with the community (users, contributors, customers,
Keep an eye out for upcoming meetups and events around the world. Somewhere else you want us to be? Please feel free to reach out to tyler `<at>` clickhouse `<dot>` com. You can also peruse [ClickHouse Events](https://clickhouse.com/company/news-events) for a list of all upcoming trainings, meetups, speaking engagements, etc.
-* [AWS Summit in DC](https://clickhouse.com/company/events/2024-06-aws-summit-dc) - Jun 26
-* [ClickHouse Meetup in Amsterdam](https://www.meetup.com/clickhouse-netherlands-user-group/events/300781068/) - Jun 27
* [ClickHouse Meetup in Paris](https://www.meetup.com/clickhouse-france-user-group/events/300783448/) - Jul 9
* [ClickHouse Cloud - Live Update Call](https://clickhouse.com/company/events/202407-cloud-update-live) - Jul 9
* [ClickHouse Meetup @ Ramp - New York City](https://www.meetup.com/clickhouse-new-york-user-group/events/300595845/) - Jul 9

View File

@ -84,5 +84,5 @@ if (CMAKE_CROSSCOMPILING)
message (FATAL_ERROR "Trying to cross-compile to unsupported system: ${CMAKE_SYSTEM_NAME}!")
endif ()
-message (STATUS "Cross-compiling for target: ${CMAKE_CXX_COMPILE_TARGET}")
+message (STATUS "Cross-compiling for target: ${CMAKE_CXX_COMPILER_TARGET}")
endif ()

contrib/orc vendored

@ -1 +1 @@
-Subproject commit 947cebaf9432d708253ac08dc3012daa6b4ede6f
+Subproject commit bcc025c09828c556f54cfbdf83a66b9acae7d17f

contrib/s2geometry vendored

@ -1 +1 @@
-Subproject commit 0146e2d1355828f8f633cb050948250ad7406c57
+Subproject commit 6522a40338d58752c2a4227a3fc2bc4107c73e43

View File

@ -1,7 +1,6 @@
option(ENABLE_S2_GEOMETRY "Enable S2 Geometry" ${ENABLE_LIBRARIES})
-# ARCH_S390X broke upstream, it can be re-enabled once https://github.com/google/s2geometry/pull/372 is merged
-if (NOT ENABLE_S2_GEOMETRY OR ARCH_S390X)
+if (NOT ENABLE_S2_GEOMETRY)
message(STATUS "Not using S2 Geometry")
return()
endif()

View File

@ -34,7 +34,7 @@ RUN arch=${TARGETARCH:-amd64} \
# lts / testing / prestable / etc
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}"
-ARG VERSION="24.6.1.4423"
+ARG VERSION="24.6.2.17"
ARG PACKAGES="clickhouse-keeper"
ARG DIRECT_DOWNLOAD_URLS=""

docker/reqgenerator.py Normal file
View File

@ -0,0 +1,47 @@
#!/usr/bin/env python3
# To run this script you must install docker and the pipdeptree python package
#
import subprocess
import os
import sys


def build_docker_deps(image_name, imagedir):
    # Freeze the image's Python environment into <imagedir>/requirements.txt
    cmd = f"""docker run --entrypoint "/bin/bash" {image_name} -c "pip install pipdeptree 2>/dev/null 1>/dev/null && pipdeptree --freeze --warn silence | sed 's/ \+//g' | sort | uniq" > {imagedir}/requirements.txt"""
    subprocess.check_call(cmd, shell=True)


def check_docker_file_install_with_pip(filepath):
    # Returns (image_name, True) if the Dockerfile installs packages via pip
    image_name = None
    with open(filepath, "r") as f:
        for line in f:
            if "docker build" in line:
                arr = line.split(" ")
                if len(arr) > 4:
                    image_name = arr[4]
            if "pip3 install" in line or "pip install" in line:
                return image_name, True
    return image_name, False


def process_affected_images(images_dir):
    for root, _dirs, files in os.walk(images_dir):
        for f in files:
            if f == "Dockerfile":
                docker_file_path = os.path.join(root, f)
                print("Checking image on path", docker_file_path)
                image_name, has_pip = check_docker_file_install_with_pip(
                    docker_file_path
                )
                if has_pip:
                    print("Found pip in", image_name)
                    try:
                        build_docker_deps(image_name, root)
                    except Exception as ex:
                        print(ex)
                else:
                    print("Pip not found in", docker_file_path)


process_affected_images(sys.argv[1])

View File

@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \
# lts / testing / prestable / etc
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}"
-ARG VERSION="24.6.1.4423"
+ARG VERSION="24.6.2.17"
ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
ARG DIRECT_DOWNLOAD_URLS=""

View File

@ -28,7 +28,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb ${REPO_CHANNEL} main"
-ARG VERSION="24.6.1.4423"
+ARG VERSION="24.6.2.17"
ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
#docker-official-library:off

View File

@ -19,10 +19,7 @@ RUN apt-get update \
odbcinst \
psmisc \
python3 \
-python3-lxml \
python3-pip \
-python3-requests \
-python3-termcolor \
unixodbc \
pv \
jq \
@ -31,7 +28,8 @@ RUN apt-get update \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/*
-RUN pip3 install numpy==1.26.3 scipy==1.12.0 pandas==1.5.3 Jinja2==3.1.3
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r /requirements.txt
# This symlink is required by gcc to find the lld linker
RUN ln -s /usr/bin/lld-${LLVM_VERSION} /usr/bin/ld.lld
@ -39,6 +37,10 @@ RUN ln -s /usr/bin/lld-${LLVM_VERSION} /usr/bin/ld.lld
# https://salsa.debian.org/pkg-llvm-team/llvm-toolchain/-/commit/992e52c0b156a5ba9c6a8a54f8c4857ddd3d371d
RUN sed -i '/_IMPORT_CHECK_FILES_FOR_\(mlir-\|llvm-bolt\|merge-fdata\|MLIR\)/ {s|^|#|}' /usr/lib/llvm-${LLVM_VERSION}/lib/cmake/llvm/LLVMExports-*.cmake
# LLVM changes paths for compiler-rt libraries. For some reason clang-18.1.8 cannot catch up libraries from default install path.
# It's very dirty workaround, better to build compiler and LLVM ourself and use it. Details: https://github.com/llvm/llvm-project/issues/95792
RUN test ! -d /usr/lib/llvm-18/lib/clang/18/lib/x86_64-pc-linux-gnu || ln -s /usr/lib/llvm-18/lib/clang/18/lib/x86_64-pc-linux-gnu /usr/lib/llvm-18/lib/clang/18/lib/x86_64-unknown-linux-gnu
ARG CCACHE_VERSION=4.6.1
RUN mkdir /tmp/ccache \
&& cd /tmp/ccache \

View File

@ -0,0 +1,41 @@
Jinja2==3.1.3
MarkupSafe==2.1.5
PyJWT==2.3.0
PyYAML==6.0.1
Pygments==2.11.2
SecretStorage==3.3.1
blinker==1.4
certifi==2020.6.20
chardet==4.0.0
cryptography==3.4.8
dbus-python==1.2.18
distro==1.7.0
httplib2==0.20.2
idna==3.3
importlib-metadata==4.6.4
jeepney==0.7.1
keyring==23.5.0
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
lxml==4.8.0
more-itertools==8.10.0
numpy==1.26.3
oauthlib==3.2.0
packaging==24.1
pandas==1.5.3
pip==24.1.1
pipdeptree==2.23.0
pyparsing==2.4.7
python-apt==2.4.0+ubuntu3
python-dateutil==2.9.0.post0
pytz==2024.1
requests==2.32.3
scipy==1.12.0
setuptools==59.6.0
six==1.16.0
termcolor==1.1.0
urllib3==1.26.5
wadllib==1.3.6
wheel==0.37.1
zipp==1.0.0

View File

@ -84,6 +84,8 @@ function start_server
echo "ClickHouse server pid '$server_pid' started and responded" echo "ClickHouse server pid '$server_pid' started and responded"
} }
export -f start_server
function clone_root function clone_root
{ {
[ "$UID" -eq 0 ] && git config --global --add safe.directory "$FASTTEST_SOURCE" [ "$UID" -eq 0 ] && git config --global --add safe.directory "$FASTTEST_SOURCE"
@ -254,6 +256,19 @@ function configure
rm -f "$FASTTEST_DATA/config.d/secure_ports.xml" rm -f "$FASTTEST_DATA/config.d/secure_ports.xml"
} }
function timeout_with_logging() {
local exit_code=0
timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
if [[ "${exit_code}" -eq "124" ]]
then
echo "The command 'timeout ${*}' has been killed by timeout"
fi
return $exit_code
}
function run_tests function run_tests
{ {
clickhouse-server --version clickhouse-server --version
@ -292,6 +307,8 @@ function run_tests
clickhouse stop --pid-path "$FASTTEST_DATA"
}
+export -f run_tests
case "$stage" in
"")
ls -la
@ -315,7 +332,7 @@ case "$stage" in
configure 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/install_log.txt"
;&
"run_tests")
-run_tests
+timeout_with_logging 35m bash -c run_tests ||:
/process_functional_tests_result.py --in-results-dir "$FASTTEST_OUTPUT/" \
--out-results-file "$FASTTEST_OUTPUT/test_results.tsv" \
--out-status-file "$FASTTEST_OUTPUT/check_status.tsv" || echo -e "failure\tCannot parse results" > "$FASTTEST_OUTPUT/check_status.tsv"

View File

@ -31,7 +31,8 @@ RUN apt-get update \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/*
-RUN pip3 install Jinja2
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r /requirements.txt
COPY * /

View File

@ -0,0 +1,27 @@
blinker==1.4
cryptography==3.4.8
dbus-python==1.2.18
distro==1.7.0
httplib2==0.20.2
importlib-metadata==4.6.4
jeepney==0.7.1
Jinja2==3.1.4
keyring==23.5.0
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
MarkupSafe==2.1.5
more-itertools==8.10.0
oauthlib==3.2.0
packaging==24.1
pip==24.1.1
pipdeptree==2.23.0
PyJWT==2.3.0
pyparsing==2.4.7
python-apt==2.4.0+ubuntu3
SecretStorage==3.3.1
setuptools==59.6.0
six==1.16.0
wadllib==1.3.6
wheel==0.37.1
zipp==1.0.0

View File

@ -33,7 +33,8 @@ RUN apt-get update \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/*
-RUN pip3 install pycurl
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r requirements.txt && rm -rf /root/.cache/pip
# Architecture of the image when BuildKit/buildx is used
ARG TARGETARCH

View File

@ -0,0 +1,26 @@
blinker==1.4
cryptography==3.4.8
dbus-python==1.2.18
distro==1.7.0
httplib2==0.20.2
importlib-metadata==4.6.4
jeepney==0.7.1
keyring==23.5.0
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
more-itertools==8.10.0
oauthlib==3.2.0
packaging==24.1
pip==24.1.1
pipdeptree==2.23.0
pycurl==7.45.3
PyJWT==2.3.0
pyparsing==2.4.7
python-apt==2.4.0+ubuntu3
SecretStorage==3.3.1
setuptools==59.6.0
six==1.16.0
wadllib==1.3.6
wheel==0.37.1
zipp==1.0.0

View File

@ -2,4 +2,5 @@
# Helper docker container to run python bottle apps
FROM python:3
-RUN python -m pip install bottle
+COPY requirements.txt /
+RUN python -m pip install --no-cache-dir -r requirements.txt

View File

@ -0,0 +1,6 @@
bottle==0.12.25
packaging==24.1
pip==23.2.1
pipdeptree==2.23.0
setuptools==69.0.3
wheel==0.42.0

View File

@ -26,7 +26,6 @@ RUN apt-get update \
libicu-dev \
bsdutils \
curl \
-python3-pika \
liblua5.1-dev \
luajit \
libssl-dev \
@ -61,49 +60,8 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - \
# kazoo 2.10.0 is broken
# https://s3.amazonaws.com/clickhouse-test-reports/59337/524625a1d2f4cc608a3f1059e3df2c30f353a649/integration_tests__asan__analyzer__[5_6].html
+COPY requirements.txt /
+RUN python3 -m pip install --no-cache-dir -r requirements.txt
-RUN python3 -m pip install --no-cache-dir \
-    PyMySQL==1.1.0 \
asyncio==3.4.3 \
avro==1.10.2 \
azure-storage-blob==12.19.0 \
boto3==1.34.24 \
cassandra-driver==3.29.0 \
confluent-kafka==2.3.0 \
delta-spark==2.3.0 \
dict2xml==1.7.4 \
dicttoxml==1.7.16 \
docker==6.1.3 \
docker-compose==1.29.2 \
grpcio==1.60.0 \
grpcio-tools==1.60.0 \
kafka-python==2.0.2 \
lz4==4.3.3 \
minio==7.2.3 \
nats-py==2.6.0 \
protobuf==4.25.2 \
kazoo==2.9.0 \
psycopg2-binary==2.9.6 \
pyhdfs==0.3.1 \
pymongo==3.11.0 \
pyspark==3.3.2 \
pytest==7.4.4 \
pytest-order==1.0.0 \
pytest-random==0.2 \
pytest-repeat==0.9.3 \
pytest-timeout==2.2.0 \
pytest-xdist==3.5.0 \
pytest-reportlog==0.4.0 \
pytz==2023.3.post1 \
pyyaml==5.3.1 \
redis==5.0.1 \
requests-kerberos==0.14.0 \
tzlocal==2.1 \
retry==0.9.2 \
bs4==0.0.2 \
lxml==5.1.0 \
urllib3==2.0.7 \
jwcrypto==1.5.6
# bs4, lxml are for cloud tests, do not delete
# Hudi supports only spark 3.3.*, not 3.4
RUN curl -fsSL -O https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz \

View File

@ -0,0 +1,113 @@
PyHDFS==0.3.1
PyJWT==2.3.0
PyMySQL==1.1.0
PyNaCl==1.5.0
PyYAML==5.3.1
SecretStorage==3.3.1
argon2-cffi-bindings==21.2.0
argon2-cffi==23.1.0
async-timeout==4.0.3
asyncio==3.4.3
attrs==23.2.0
avro==1.10.2
azure-core==1.30.1
azure-storage-blob==12.19.0
bcrypt==4.1.3
beautifulsoup4==4.12.3
blinker==1.4
boto3==1.34.24
botocore==1.34.101
bs4==0.0.2
cassandra-driver==3.29.0
certifi==2024.2.2
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
confluent-kafka==2.3.0
cryptography==3.4.8
dbus-python==1.2.18
decorator==5.1.1
delta-spark==2.3.0
dict2xml==1.7.4
dicttoxml==1.7.16
distro-info==1.1+ubuntu0.2
distro==1.7.0
docker-compose==1.29.2
docker==6.1.3
dockerpty==0.4.1
docopt==0.6.2
exceptiongroup==1.2.1
execnet==2.1.1
geomet==0.2.1.post1
grpcio-tools==1.60.0
grpcio==1.60.0
gssapi==1.8.3
httplib2==0.20.2
idna==3.7
importlib-metadata==4.6.4
iniconfig==2.0.0
isodate==0.6.1
jeepney==0.7.1
jmespath==1.0.1
jsonschema==3.2.0
jwcrypto==1.5.6
kafka-python==2.0.2
kazoo==2.9.0
keyring==23.5.0
krb5==0.5.1
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
lxml==5.1.0
lz4==4.3.3
minio==7.2.3
more-itertools==8.10.0
nats-py==2.6.0
oauthlib==3.2.0
packaging==24.0
paramiko==3.4.0
pika==1.2.0
pip==24.1.1
pipdeptree==2.23.0
pluggy==1.5.0
protobuf==4.25.2
psycopg2-binary==2.9.6
py4j==0.10.9.5
py==1.11.0
pycparser==2.22
pycryptodome==3.20.0
pymongo==3.11.0
pyparsing==2.4.7
pyrsistent==0.20.0
pyspark==3.3.2
pyspnego==0.10.2
pytest-order==1.0.0
pytest-random==0.2
pytest-repeat==0.9.3
pytest-reportlog==0.4.0
pytest-timeout==2.2.0
pytest-xdist==3.5.0
pytest==7.4.4
python-apt==2.4.0+ubuntu3
python-dateutil==2.9.0.post0
python-dotenv==0.21.1
pytz==2023.3.post1
redis==5.0.1
requests-kerberos==0.14.0
requests==2.31.0
retry==0.9.2
s3transfer==0.10.1
setuptools==59.6.0
simplejson==3.19.2
six==1.16.0
soupsieve==2.5
texttable==1.7.0
tomli==2.0.1
typing_extensions==4.11.0
tzlocal==2.1
unattended-upgrades==0.1
urllib3==2.0.7
wadllib==1.3.6
websocket-client==0.59.0
wheel==0.37.1
zipp==1.0.0

View File

@ -1,3 +1,4 @@
# docker build -t clickhouse/libfuzzer .
ARG FROM_TAG=latest
FROM clickhouse/test-base:$FROM_TAG
@ -29,7 +30,8 @@ RUN apt-get update \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/*
-RUN pip3 install Jinja2
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r /requirements.txt
COPY * /

View File

@ -0,0 +1,27 @@
blinker==1.4
cryptography==3.4.8
dbus-python==1.2.18
distro==1.7.0
httplib2==0.20.2
importlib-metadata==4.6.4
jeepney==0.7.1
Jinja2==3.1.4
keyring==23.5.0
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
MarkupSafe==2.1.5
more-itertools==8.10.0
oauthlib==3.2.0
packaging==24.1
pip==24.1.1
pipdeptree==2.23.0
PyJWT==2.3.0
pyparsing==2.4.7
python-apt==2.4.0+ubuntu3
SecretStorage==3.3.1
setuptools==59.6.0
six==1.16.0
wadllib==1.3.6
wheel==0.37.1
zipp==1.0.0

View File

@ -23,7 +23,6 @@ RUN apt-get update \
python3 \
python3-dev \
python3-pip \
-python3-setuptools \
rsync \
tree \
tzdata \
@ -33,12 +32,14 @@ RUN apt-get update \
cargo \
ripgrep \
zstd \
-&& pip3 --no-cache-dir install 'clickhouse-driver==0.2.1' scipy \
&& apt-get purge --yes python3-dev g++ \
&& apt-get autoremove --yes \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/*
+COPY requirements.txt /
+RUN pip3 --no-cache-dir install -r requirements.txt
COPY run.sh /
CMD ["bash", "/run.sh"]

View File

@ -0,0 +1,32 @@
blinker==1.4
clickhouse-driver==0.2.7
cryptography==3.4.8
dbus-python==1.2.18
distro==1.7.0
httplib2==0.20.2
importlib-metadata==4.6.4
jeepney==0.7.1
keyring==23.5.0
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
more-itertools==8.10.0
numpy==1.26.3
oauthlib==3.2.0
packaging==24.1
pip==24.1.1
pipdeptree==2.23.0
Pygments==2.11.2
PyJWT==2.3.0
pyparsing==2.4.7
python-apt==2.4.0+ubuntu3
pytz==2023.4
PyYAML==6.0.1
scipy==1.12.0
SecretStorage==3.3.1
setuptools==59.6.0
six==1.16.0
tzlocal==2.1
wadllib==1.3.6
wheel==0.37.1
zipp==1.0.0

View File

@ -18,11 +18,8 @@ RUN apt-get update --yes \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/*
-RUN pip3 install \
-    numpy \
-    pyodbc \
-    deepdiff \
-    sqlglot
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r /requirements.txt
ARG odbc_driver_url="https://github.com/ClickHouse/clickhouse-odbc/releases/download/v1.1.6.20200320/clickhouse-odbc-1.1.6-Linux.tar.gz"

View File

@ -0,0 +1,30 @@
blinker==1.4
cryptography==3.4.8
dbus-python==1.2.18
deepdiff==7.0.1
distro==1.7.0
httplib2==0.20.2
importlib-metadata==4.6.4
jeepney==0.7.1
keyring==23.5.0
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
more-itertools==8.10.0
numpy==1.26.4
oauthlib==3.2.0
ordered-set==4.1.0
packaging==24.1
pip==24.1.1
pipdeptree==2.23.0
PyJWT==2.3.0
pyodbc==5.1.0
pyparsing==2.4.7
python-apt==2.4.0+ubuntu3
SecretStorage==3.3.1
setuptools==59.6.0
six==1.16.0
sqlglot==23.16.0
wadllib==1.3.6
wheel==0.37.1
zipp==1.0.0

View File

@ -14,9 +14,8 @@ RUN apt-get update --yes \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/*
-RUN pip3 install \
-    pyyaml \
-    clickhouse-driver
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r /requirements.txt
ARG sqltest_repo="https://github.com/elliotchance/sqltest/"

View File

@ -0,0 +1,29 @@
blinker==1.4
clickhouse-driver==0.2.7
cryptography==3.4.8
dbus-python==1.2.18
distro==1.7.0
httplib2==0.20.2
importlib-metadata==4.6.4
jeepney==0.7.1
keyring==23.5.0
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
more-itertools==8.10.0
oauthlib==3.2.0
packaging==24.1
pip==24.1.1
pipdeptree==2.23.0
PyJWT==2.3.0
pyparsing==2.4.7
python-apt==2.4.0+ubuntu3
pytz==2024.1
PyYAML==6.0.1
SecretStorage==3.3.1
setuptools==59.6.0
six==1.16.0
tzlocal==5.2
wadllib==1.3.6
wheel==0.37.1
zipp==1.0.0

View File

@ -6,7 +6,6 @@ FROM clickhouse/stateless-test:$FROM_TAG
RUN apt-get update -y \
&& env DEBIAN_FRONTEND=noninteractive \
apt-get install --yes --no-install-recommends \
-python3-requests \
nodejs \
npm \
&& apt-get clean \

View File

@ -25,10 +25,7 @@ RUN apt-get update -y \
openssl \
postgresql-client \
python3 \
-python3-lxml \
python3-pip \
-python3-requests \
-python3-termcolor \
qemu-user-static \
sqlite3 \
sudo \
@ -51,7 +48,8 @@ RUN curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v${PR
&& unzip protoc-${PROTOC_VERSION}-linux-x86_64.zip -d /usr/local \
&& rm protoc-${PROTOC_VERSION}-linux-x86_64.zip
-RUN pip3 install numpy==1.26.3 scipy==1.12.0 pandas==1.5.3 Jinja2==3.1.3 pyarrow==15.0.0
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r /requirements.txt
RUN mkdir -p /tmp/clickhouse-odbc-tmp \
&& cd /tmp/clickhouse-odbc-tmp \

View File

@ -0,0 +1,51 @@
awscli==1.22.34
blinker==1.4
botocore==1.23.34
certifi==2020.6.20
chardet==4.0.0
colorama==0.4.4
cryptography==3.4.8
dbus-python==1.2.18
distro==1.7.0
docutils==0.17.1
gyp==0.1
httplib2==0.20.2
idna==3.3
importlib-metadata==4.6.4
jeepney==0.7.1
Jinja2==3.1.3
jmespath==0.10.0
keyring==23.5.0
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
lxml==4.8.0
MarkupSafe==2.1.5
more-itertools==8.10.0
numpy==1.26.3
oauthlib==3.2.0
packaging==24.1
pandas==1.5.3
pip==24.1.1
pipdeptree==2.23.0
pyarrow==15.0.0
pyasn1==0.4.8
PyJWT==2.3.0
pyparsing==2.4.7
python-apt==2.4.0+ubuntu3
python-dateutil==2.8.1
pytz==2024.1
PyYAML==6.0.1
requests==2.32.3
roman==3.3
rsa==4.8
s3transfer==0.5.0
scipy==1.12.0
SecretStorage==3.3.1
setuptools==59.6.0
six==1.16.0
termcolor==1.1.0
urllib3==1.26.5
wadllib==1.3.6
wheel==0.37.1
zipp==1.0.0

View File

@ -6,6 +6,9 @@ source /setup_export_logs.sh
# fail on errors, verbose and export all env variables
set -e -x -a
+MAX_RUN_TIME=${MAX_RUN_TIME:-10800}
+MAX_RUN_TIME=$((MAX_RUN_TIME == 0 ? 10800 : MAX_RUN_TIME))
# Choose random timezone for this test run.
#
# NOTE: that clickhouse-test will randomize session_timezone by itself as well
@ -262,14 +265,17 @@ function run_tests()
export -f run_tests
+# This should be enough to setup job and collect artifacts
+TIMEOUT=$((MAX_RUN_TIME - 300))
if [ "$NUM_TRIES" -gt "1" ]; then
# We don't run tests with Ordinary database in PRs, only in master.
# So run new/changed tests with Ordinary at least once in flaky check.
-timeout_with_logging "$MAX_RUN_TIME" bash -c 'NUM_TRIES=1; USE_DATABASE_ORDINARY=1; run_tests' \
+timeout_with_logging "$TIMEOUT" bash -c 'NUM_TRIES=1; USE_DATABASE_ORDINARY=1; run_tests' \
| sed 's/All tests have finished//' | sed 's/No tests were run//' ||:
fi
-timeout_with_logging "$MAX_RUN_TIME" bash -c run_tests ||:
+timeout_with_logging "$TIMEOUT" bash -c run_tests ||:
echo "Files in current directory"
ls -la ./

View File

@ -38,7 +38,7 @@ function fn_exists() {
function timeout_with_logging() {
local exit_code=0
-timeout "${@}" || exit_code="${?}"
+timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
if [[ "${exit_code}" -eq "124" ]]
then

View File

@ -23,22 +23,8 @@ RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \
&& rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/*
# python-magic is the same version as in Ubuntu 22.04
+COPY requirements.txt /
+RUN pip3 install --no-cache-dir -r requirements.txt
-RUN pip3 install \
-    PyGithub \
black==23.12.0 \
boto3 \
codespell==2.2.1 \
mypy==1.8.0 \
pylint==3.1.0 \
python-magic==0.4.24 \
flake8==4.0.1 \
requests \
thefuzz \
tqdm==4.66.4 \
types-requests \
unidiff \
jwt \
&& rm -rf /root/.cache/pip
RUN echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && locale-gen en_US.UTF-8 RUN echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && locale-gen en_US.UTF-8
ENV LC_ALL en_US.UTF-8 ENV LC_ALL en_US.UTF-8

View File

@ -0,0 +1,58 @@
aiohttp==3.9.5
aiosignal==1.3.1
astroid==3.1.0
async-timeout==4.0.3
attrs==23.2.0
black==23.12.0
boto3==1.34.131
botocore==1.34.131
certifi==2024.6.2
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
codespell==2.2.1
cryptography==42.0.8
Deprecated==1.2.14
dill==0.3.8
flake8==4.0.1
frozenlist==1.4.1
idna==3.7
isort==5.13.2
jmespath==1.0.1
jwt==1.3.1
mccabe==0.6.1
multidict==6.0.5
mypy==1.8.0
mypy-extensions==1.0.0
packaging==24.1
pathspec==0.9.0
pip==24.1.1
pipdeptree==2.23.0
platformdirs==4.2.2
pycodestyle==2.8.0
pycparser==2.22
pyflakes==2.4.0
PyGithub==2.3.0
PyJWT==2.8.0
pylint==3.1.0
PyNaCl==1.5.0
python-dateutil==2.9.0.post0
python-magic==0.4.24
PyYAML==6.0.1
rapidfuzz==3.9.3
requests==2.32.3
s3transfer==0.10.1
setuptools==59.6.0
six==1.16.0
thefuzz==0.22.1
tomli==2.0.1
tomlkit==0.12.5
tqdm==4.66.4
types-requests==2.32.0.20240622
typing_extensions==4.12.2
unidiff==0.7.5
urllib3==2.2.2
wheel==0.37.1
wrapt==1.16.0
yamllint==1.26.3
yarl==1.9.4

View File

@ -0,0 +1,26 @@
---
sidebar_position: 1
sidebar_label: 2024
---
# 2024 Changelog
### ClickHouse release v24.6.2.17-stable (5710a8b5c0c) FIXME as compared to v24.6.1.4423-stable (dcced7c8478)
#### New Feature
* Backported in [#66002](https://github.com/ClickHouse/ClickHouse/issues/66002): Add AzureQueue storage. [#65458](https://github.com/ClickHouse/ClickHouse/pull/65458) ([Kseniia Sumarokova](https://github.com/kssenii)).
#### Improvement
* Backported in [#65898](https://github.com/ClickHouse/ClickHouse/issues/65898): Respect cgroup CPU limit in Keeper. [#65819](https://github.com/ClickHouse/ClickHouse/pull/65819) ([Antonio Andelic](https://github.com/antonio2368)).
#### Bug Fix (user-visible misbehavior in an official stable release)
* Backported in [#65935](https://github.com/ClickHouse/ClickHouse/issues/65935): For queries that read from `PostgreSQL`, cancel the internal `PostgreSQL` query if the ClickHouse query is finished. Otherwise, `ClickHouse` query cannot be canceled until the internal `PostgreSQL` query is finished. [#65771](https://github.com/ClickHouse/ClickHouse/pull/65771) ([Maksim Kita](https://github.com/kitaisreal)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Backported in [#65907](https://github.com/ClickHouse/ClickHouse/issues/65907): Fix bug with session closing in Keeper. [#65735](https://github.com/ClickHouse/ClickHouse/pull/65735) ([Antonio Andelic](https://github.com/antonio2368)).
* Backported in [#65962](https://github.com/ClickHouse/ClickHouse/issues/65962): Add missing workload identity changes. [#65848](https://github.com/ClickHouse/ClickHouse/pull/65848) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
* Backported in [#66033](https://github.com/ClickHouse/ClickHouse/issues/66033): Follow up to [#65046](https://github.com/ClickHouse/ClickHouse/issues/65046). [#65928](https://github.com/ClickHouse/ClickHouse/pull/65928) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Backported in [#66076](https://github.com/ClickHouse/ClickHouse/issues/66076): Fix support of non-const scale arguments in rounding functions. [#65983](https://github.com/ClickHouse/ClickHouse/pull/65983) ([Mikhail Gorshkov](https://github.com/mgorshkov)).
* Backported in [#66017](https://github.com/ClickHouse/ClickHouse/issues/66017): Fix race in s3queue. [#65986](https://github.com/ClickHouse/ClickHouse/pull/65986) ([Kseniia Sumarokova](https://github.com/kssenii)).

View File

@ -84,6 +84,7 @@ The BACKUP and RESTORE statements take a list of DATABASE and TABLE names, a des
- [`compression_method`](/docs/en/sql-reference/statements/create/table.md/#column-compression-codecs) and compression_level
- `password` for the file on disk
- `base_backup`: the destination of the previous backup of this source. For example, `Disk('backups', '1.zip')`
+- `use_same_s3_credentials_for_base_backup`: whether base backup to S3 should inherit credentials from the query. Only works with `S3`.
- `structure_only`: if enabled, allows to only backup or restore the CREATE statements without the data of tables
- `storage_policy`: storage policy for the tables being restored. See [Using Multiple Block Devices for Data Storage](../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes). This setting is only applicable to the `RESTORE` command. The specified storage policy applies only to tables with an engine from the `MergeTree` family.
- `s3_storage_class`: the storage class used for S3 backup. For example, `STANDARD`
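To make the `use_same_s3_credentials_for_base_backup` entry above concrete, here is a minimal sketch of an incremental backup to S3; the bucket URL, credentials, and table name are placeholders, not values from this PR.

```sql
-- Full backup (illustrative destination and credentials).
BACKUP TABLE test.hits TO S3('https://s3.amazonaws.com/my-bucket/day1/', '<key-id>', '<secret>');

-- Incremental backup on top of it; the base backup reuses the credentials supplied in this query.
BACKUP TABLE test.hits
    TO S3('https://s3.amazonaws.com/my-bucket/day2/', '<key-id>', '<secret>')
    SETTINGS base_backup = S3('https://s3.amazonaws.com/my-bucket/day1/'),
             use_same_s3_credentials_for_base_backup = 1;
```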

View File

@ -974,6 +974,13 @@ Default value: false
- [exclude_deleted_rows_for_part_size_in_merge](#exclude_deleted_rows_for_part_size_in_merge) setting
+## use_compact_variant_discriminators_serialization {#use_compact_variant_discriminators_serialization}
+Enables compact mode for binary serialization of discriminators in Variant data type.
+This mode allows to use significantly less memory for storing discriminators in parts when there is mostly one variant or a lot of NULL values.
+Default value: true
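The setting documented above is a table-level MergeTree setting; a sketch of toggling it at table creation follows (the table name and the experimental-type session setting are assumptions, not part of the diff).

```sql
-- Variant is still experimental, so enabling it in the session is assumed here.
SET allow_experimental_variant_type = 1;

CREATE TABLE variant_demo
(
    v Variant(UInt64, String)
)
ENGINE = MergeTree
ORDER BY tuple()
SETTINGS use_compact_variant_discriminators_serialization = 1;
```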
## merge_workload
Used to regulate how resources are utilized and shared between merges and other workloads. Specified value is used as `workload` setting value for background merges of this table. If not specified (empty string), then server setting `merge_workload` is used instead.

View File

@ -0,0 +1,37 @@
---
slug: /en/sql-reference/aggregate-functions/reference/aggthrow
sidebar_position: 101
---
# aggThrow
This function can be used for the purpose of testing exception safety. It will throw an exception on creation with the specified probability.
**Syntax**
```sql
aggThrow(throw_prob)
```
**Arguments**
- `throw_prob` — Probability to throw on creation. [Float64](../../data-types/float.md).
**Returned value**
- An exception: `Code: 503. DB::Exception: Aggregate function aggThrow has thrown exception successfully`.
**Example**
Query:
```sql
SELECT number % 2 AS even, aggThrow(number) FROM numbers(10) GROUP BY even;
```
Result:
```response
Received exception:
Code: 503. DB::Exception: Aggregate function aggThrow has thrown exception successfully: While executing AggregatingTransform. (AGGREGATE_FUNCTION_THROW)
```

View File

@ -43,6 +43,7 @@ Standard aggregate functions:
ClickHouse-specific aggregate functions:
+- [aggThrow](../reference/aggthrow.md)
- [analysisOfVariance](../reference/analysis_of_variance.md)
- [any](../reference/any_respect_nulls.md)
- [anyHeavy](../reference/anyheavy.md)

View File

@ -83,7 +83,57 @@ Result:
```
## makeDate32
-Like [makeDate](#makedate) but produces a [Date32](../data-types/date32.md).
+Creates a date of type [Date32](../../sql-reference/data-types/date32.md) from a year, month, day (or optionally a year and a day).
**Syntax**
```sql
makeDate32(year, [month,] day)
```
**Arguments**
- `year` — Year. [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md).
- `month` — Month (optional). [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md).
- `day` — Day. [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md).
:::note
If `month` is omitted then `day` should take a value between `1` and `365`, otherwise it should take a value between `1` and `31`.
:::
**Returned values**
- A date created from the arguments. [Date32](../../sql-reference/data-types/date32.md).
**Examples**
Create a date from a year, month, and day:
Query:
```sql
SELECT makeDate32(2024, 1, 1);
```
Result:
```response
2024-01-01
```
Create a Date from a year and day of year:
Query:
``` sql
SELECT makeDate32(2024, 100);
```
Result:
```response
2024-04-09
```
## makeDateTime
@ -125,12 +175,38 @@ Result:
## makeDateTime64
-Like [makeDateTime](#makedatetime) but produces a [DateTime64](../data-types/datetime64.md).
+Creates a [DateTime64](../../sql-reference/data-types/datetime64.md) data type value from its components: year, month, day, hour, minute, second. With optional sub-second precision.
**Syntax**
```sql
makeDateTime64(year, month, day, hour, minute, second[, precision])
```
**Arguments**
- `year` — Year (0-9999). [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md).
- `month` — Month (1-12). [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md).
- `day` — Day (1-31). [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md).
- `hour` — Hour (0-23). [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md).
- `minute` — Minute (0-59). [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md).
- `second` — Second (0-59). [Integer](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md).
- `precision` — Optional precision of the sub-second component (0-9). [Integer](../../sql-reference/data-types/int-uint.md).
**Returned value**
- A date and time created from the supplied arguments. [DateTime64](../../sql-reference/data-types/datetime64.md).
**Example**
``` sql
-makeDateTime64(year, month, day, hour, minute, second[, fraction[, precision[, timezone]]])
+SELECT makeDateTime64(2023, 5, 15, 10, 30, 45, 779, 5);
```
```response
┌─makeDateTime64(2023, 5, 15, 10, 30, 45, 779, 5)─┐
│ 2023-05-15 10:30:45.00779 │
└─────────────────────────────────────────────────┘
```
## timestamp

View File

@ -86,7 +86,7 @@ Returns the fully qualified domain name of the ClickHouse server.
fqdn();
```
-This function is case-insensitive.
+Aliases: `fullHostName`, 'FQDN'.
**Returned value**
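A quick illustrative query for the alias noted above (the exact output depends on the host the server runs on):

```sql
-- fqdn() and its documented alias return the same value.
SELECT fqdn(), fullHostName();
```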

View File

@ -6,41 +6,119 @@ sidebar_label: Time Window
# Time Window Functions
-Time window functions return the inclusive lower and exclusive upper bound of the corresponding window. The functions for working with WindowView are listed below:
+Time window functions return the inclusive lower and exclusive upper bound of the corresponding window. The functions for working with [WindowView](../statements/create/view.md/#window-view-experimental) are listed below:
## tumble
A tumbling time window assigns records to non-overlapping, continuous windows with a fixed duration (`interval`).
+**Syntax**
``` sql
tumble(time_attr, interval [, timezone])
```
**Arguments**
-- `time_attr` - Date and time. [DateTime](../data-types/datetime.md) data type.
+- `time_attr` — Date and time. [DateTime](../data-types/datetime.md).
-- `interval` - Window interval in [Interval](../data-types/special-data-types/interval.md) data type.
+- `interval` — Window interval in [Interval](../data-types/special-data-types/interval.md).
- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (optional).
**Returned values**
-- The inclusive lower and exclusive upper bound of the corresponding tumbling window. [Tuple](../data-types/tuple.md)([DateTime](../data-types/datetime.md), [DateTime](../data-types/datetime.md))`.
+- The inclusive lower and exclusive upper bound of the corresponding tumbling window. [Tuple](../data-types/tuple.md)([DateTime](../data-types/datetime.md), [DateTime](../data-types/datetime.md)).
**Example**
Query:
``` sql
-SELECT tumble(now(), toIntervalDay('1'))
+SELECT tumble(now(), toIntervalDay('1'));
```
Result:
``` text
┌─tumble(now(), toIntervalDay('1'))─────────────┐
-['2020-01-01 00:00:00','2020-01-02 00:00:00']
+('2024-07-04 00:00:00','2024-07-05 00:00:00')
└───────────────────────────────────────────────┘
```
## tumbleStart
Returns the inclusive lower bound of the corresponding [tumbling window](#tumble).
**Syntax**
``` sql
tumbleStart(time_attr, interval [, timezone]);
```
**Arguments**
- `time_attr` — Date and time. [DateTime](../data-types/datetime.md).
- `interval` — Window interval in [Interval](../data-types/special-data-types/interval.md).
- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (optional).
The parameters above can also be passed to the function as a [tuple](../data-types/tuple.md).
**Returned values**
- The inclusive lower bound of the corresponding tumbling window. [DateTime](../data-types/datetime.md), [Tuple](../data-types/tuple.md) or [UInt32](../data-types/int-uint.md).
**Example**
Query:
```sql
SELECT tumbleStart(now(), toIntervalDay('1'));
```
Result:
```response
┌─tumbleStart(now(), toIntervalDay('1'))─┐
│ 2024-07-04 00:00:00 │
└────────────────────────────────────────┘
```
## tumbleEnd
Returns the exclusive upper bound of the corresponding [tumbling window](#tumble).
**Syntax**
``` sql
tumbleEnd(time_attr, interval [, timezone]);
```
**Arguments**
- `time_attr` — Date and time. [DateTime](../data-types/datetime.md).
- `interval` — Window interval in [Interval](../data-types/special-data-types/interval.md).
- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (optional).
The parameters above can also be passed to the function as a [tuple](../data-types/tuple.md).
**Returned values**
- The inclusive lower bound of the corresponding tumbling window. [DateTime](../data-types/datetime.md), [Tuple](../data-types/tuple.md) or [UInt32](../data-types/int-uint.md).
**Example**
Query:
```sql
SELECT tumbleEnd(now(), toIntervalDay('1'));
```
Result:
```response
┌─tumbleEnd(now(), toIntervalDay('1'))─┐
│ 2024-07-05 00:00:00 │
└──────────────────────────────────────┘
```
## hop
A hopping time window has a fixed duration (`window_interval`) and hops by a specified hop interval (`hop_interval`). If the `hop_interval` is smaller than the `window_interval`, hopping windows are overlapping. Thus, records can be assigned to multiple windows.
@ -51,65 +129,118 @@ hop(time_attr, hop_interval, window_interval [, timezone])
**Arguments**
-- `time_attr` - Date and time. [DateTime](../data-types/datetime.md) data type.
+- `time_attr` — Date and time. [DateTime](../data-types/datetime.md).
-- `hop_interval` - Hop interval in [Interval](../data-types/special-data-types/interval.md) data type. Should be a positive number.
+- `hop_interval` — Positive Hop interval. [Interval](../data-types/special-data-types/interval.md).
-- `window_interval` - Window interval in [Interval](../data-types/special-data-types/interval.md) data type. Should be a positive number.
+- `window_interval` — Positive Window interval. [Interval](../data-types/special-data-types/interval.md).
- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (optional).
**Returned values**
-- The inclusive lower and exclusive upper bound of the corresponding hopping window. Since one record can be assigned to multiple hop windows, the function only returns the bound of the **first** window when hop function is used **without** `WINDOW VIEW`. [Tuple](../data-types/tuple.md)([DateTime](../data-types/datetime.md), [DateTime](../data-types/datetime.md))`.
+- The inclusive lower and exclusive upper bound of the corresponding hopping window. [Tuple](../data-types/tuple.md)([DateTime](../data-types/datetime.md), [DateTime](../data-types/datetime.md))`.
+:::note
+Since one record can be assigned to multiple hop windows, the function only returns the bound of the **first** window when hop function is used **without** `WINDOW VIEW`.
+:::
**Example**
Query:
``` sql
-SELECT hop(now(), INTERVAL '1' SECOND, INTERVAL '2' SECOND)
+SELECT hop(now(), INTERVAL '1' DAY, INTERVAL '2' DAY);
```
Result:
``` text
-┌─hop(now(), toIntervalSecond('1'), toIntervalSecond('2'))──┐
-│ ('2020-01-14 16:58:22','2020-01-14 16:58:24') │
-└───────────────────────────────────────────────────────────┘
+┌─hop(now(), toIntervalDay('1'), toIntervalDay('2'))─┐
+│ ('2024-07-03 00:00:00','2024-07-05 00:00:00') │
+└────────────────────────────────────────────────────┘
-```
-## tumbleStart
-Returns the inclusive lower bound of the corresponding tumbling window.
-``` sql
-tumbleStart(bounds_tuple);
-tumbleStart(time_attr, interval [, timezone]);
-```
-## tumbleEnd
-Returns the exclusive upper bound of the corresponding tumbling window.
-``` sql
-tumbleEnd(bounds_tuple);
-tumbleEnd(time_attr, interval [, timezone]);
```
## hopStart
-Returns the inclusive lower bound of the corresponding hopping window.
+Returns the inclusive lower bound of the corresponding [hopping window](#hop).
+**Syntax**
``` sql
-hopStart(bounds_tuple);
hopStart(time_attr, hop_interval, window_interval [, timezone]);
```
**Arguments**
- `time_attr` — Date and time. [DateTime](../data-types/datetime.md).
- `hop_interval` — Positive Hop interval. [Interval](../data-types/special-data-types/interval.md).
- `window_interval` — Positive Window interval. [Interval](../data-types/special-data-types/interval.md).
- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (optional).
The parameters above can also be passed to the function as a [tuple](../data-types/tuple.md).
**Returned values**
- The inclusive lower bound of the corresponding hopping window. [DateTime](../data-types/datetime.md), [Tuple](../data-types/tuple.md) or [UInt32](../data-types/int-uint.md).
:::note
Since one record can be assigned to multiple hop windows, the function only returns the bound of the **first** window when hop function is used **without** `WINDOW VIEW`.
:::
**Example**
Query:
``` sql
SELECT hopStart(now(), INTERVAL '1' DAY, INTERVAL '2' DAY);
```
Result:
``` text
┌─hopStart(now(), toIntervalDay('1'), toIntervalDay('2'))─┐
│ 2024-07-03 00:00:00 │
└─────────────────────────────────────────────────────────┘
```
## hopEnd
-Returns the exclusive upper bound of the corresponding hopping window.
+Returns the exclusive upper bound of the corresponding [hopping window](#hop).
+**Syntax**
``` sql
-hopEnd(bounds_tuple);
hopEnd(time_attr, hop_interval, window_interval [, timezone]);
```
**Arguments**
- `time_attr` — Date and time. [DateTime](../data-types/datetime.md).
- `hop_interval` — Positive Hop interval. [Interval](../data-types/special-data-types/interval.md).
- `window_interval` — Positive Window interval. [Interval](../data-types/special-data-types/interval.md).
- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) (optional).
The parameters above can also be passed to the function as a [tuple](../data-types/tuple.md).
**Returned values**
- The exclusive upper bound of the corresponding hopping window. [DateTime](../data-types/datetime.md), [Tuple](../data-types/tuple.md) or [UInt32](../data-types/int-uint.md).
:::note
Since one record can be assigned to multiple hop windows, the function only returns the bound of the **first** window when hop function is used **without** `WINDOW VIEW`.
:::
**Example**
Query:
``` sql
SELECT hopEnd(now(), INTERVAL '1' DAY, INTERVAL '2' DAY);
```
Result:
``` text
┌─hopEnd(now(), toIntervalDay('1'), toIntervalDay('2'))─┐
│ 2024-07-05 00:00:00 │
└───────────────────────────────────────────────────────┘
```
## Related content

View File

@ -23,6 +23,7 @@ ClickHouse supports the standard grammar for defining windows and window functio
| `GROUPS` frame | ❌ |
| Calculating aggregate functions over a frame (`sum(value) over (order by time)`) | ✅ (All aggregate functions are supported) |
| `rank()`, `dense_rank()`, `row_number()` | ✅ |
+| `percent_rank()` | ✅ Efficiently computes the relative standing of a value within a partition in a dataset. This function effectively replaces the more verbose and computationally intensive manual SQL calculation expressed as `ifNull((rank() OVER(PARTITION BY x ORDER BY y) - 1) / nullif(count(1) OVER(PARTITION BY x) - 1, 0), 0)`|
| `lag/lead(value, offset)` | ❌ <br/> You can use one of the following workarounds:<br/> 1) `any(value) over (.... rows between <offset> preceding and <offset> preceding)`, or `following` for `lead` <br/> 2) `lagInFrame/leadInFrame`, which are analogous, but respect the window frame. To get behavior identical to `lag/lead`, use `rows between unbounded preceding and unbounded following` |
| ntile(buckets) | ✅ <br/> Specify window like, (partition by x order by y rows between unbounded preceding and unrounded following). |
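To illustrate the `percent_rank()` row above, a minimal query sketch; the data comes from `numbers()` purely for demonstration.

```sql
-- Relative standing of each value within its partition, as a value in [0, 1].
SELECT
    number % 3 AS bucket,
    number AS value,
    percent_rank() OVER (PARTITION BY bucket ORDER BY value) AS pr
FROM numbers(9)
ORDER BY bucket, value;
```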

View File

@ -626,6 +626,28 @@ static void initializeAzureSDKLogger(
#endif
}
#if defined(SANITIZER)
static std::vector<String> getSanitizerNames()
{
std::vector<String> names;
#if defined(ADDRESS_SANITIZER)
names.push_back("address");
#endif
#if defined(THREAD_SANITIZER)
names.push_back("thread");
#endif
#if defined(MEMORY_SANITIZER)
names.push_back("memory");
#endif
#if defined(UNDEFINED_BEHAVIOR_SANITIZER)
names.push_back("undefined behavior");
#endif
return names;
}
#endif
int Server::main(const std::vector<std::string> & /*args*/)
try
{
@ -716,7 +738,17 @@ try
global_context->addWarningMessage("ThreadFuzzer is enabled. Application will run slowly and unstable.");
#if defined(SANITIZER)
-global_context->addWarningMessage("Server was built with sanitizer. It will work slowly.");
+auto sanitizers = getSanitizerNames();
String log_message;
if (sanitizers.empty())
log_message = "sanitizer";
else if (sanitizers.size() == 1)
log_message = fmt::format("{} sanitizer", sanitizers.front());
else
log_message = fmt::format("sanitizers ({})", fmt::join(sanitizers, ", "));
global_context->addWarningMessage(fmt::format("Server was built with {}. It will work slowly.", log_message));
#endif
#if defined(SANITIZE_COVERAGE) || WITH_COVERAGE

View File

@ -1093,10 +1093,4 @@ void ColumnObject::finalize()
checkObjectHasNoAmbiguosPaths(getKeys());
}
-void ColumnObject::updateHashFast(SipHash & hash) const
-{
-    for (const auto & entry : subcolumns)
-        for (auto & part : entry->data.data)
-            part->updateHashFast(hash);
-}
}

View File

@ -242,7 +242,7 @@ public:
const char * skipSerializedInArena(const char *) const override { throwMustBeConcrete(); }
void updateHashWithValue(size_t, SipHash &) const override { throwMustBeConcrete(); }
void updateWeakHash32(WeakHash32 &) const override { throwMustBeConcrete(); }
-void updateHashFast(SipHash & hash) const override;
+void updateHashFast(SipHash &) const override { throwMustBeConcrete(); }
void expand(const Filter &, bool) override { throwMustBeConcrete(); }
bool hasEqualValues() const override { throwMustBeConcrete(); }
size_t byteSizeAt(size_t) const override { throwMustBeConcrete(); }

View File

@ -711,7 +711,13 @@ void ColumnTuple::takeDynamicStructureFromSourceColumns(const Columns & source_c
ColumnPtr ColumnTuple::compress() const
{
if (columns.empty())
-    return Ptr();
+    {
+        return ColumnCompressed::create(size(), 0,
+            [n = column_length]
+            {
+                return ColumnTuple::create(n);
+            });
+    }
size_t byte_size = 0;
Columns compressed;

View File

@ -5,7 +5,7 @@
namespace DB
{
-static void inline hexStringDecode(const char * pos, const char * end, char *& out, size_t word_size = 2)
+static void inline hexStringDecode(const char * pos, const char * end, char *& out, size_t word_size)
{
if ((end - pos) & 1)
{
@ -23,7 +23,7 @@ static void inline hexStringDecode(const char * pos, const char * end, char *& o
++out;
}
-static void inline binStringDecode(const char * pos, const char * end, char *& out)
+static void inline binStringDecode(const char * pos, const char * end, char *& out, size_t word_size)
{
if (pos == end)
{
@ -53,7 +53,7 @@ static void inline binStringDecode(const char * pos, const char * end, char *& o
++out;
}
-assert((end - pos) % 8 == 0);
+chassert((end - pos) % word_size == 0);
while (end - pos != 0)
{

View File

@ -1,184 +0,0 @@
#pragma once
#include <base/defines.h>
#include <Common/Exception.h>
#include <algorithm>
#include <memory>
#include <typeindex>
#include <vector>
#include <string>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
/* This is a collection of objects derived from ItemBase.
* Collection contains no more than one instance for each derived type.
* The derived type is used to access the instance.
*/
template<class ItemBase>
class CollectionOfDerivedItems
{
public:
using Self = CollectionOfDerivedItems<ItemBase>;
using ItemPtr = std::shared_ptr<ItemBase>;
private:
struct Rec
{
std::type_index type_idx;
ItemPtr ptr;
bool operator<(const Rec & other) const
{
return type_idx < other.type_idx;
}
bool operator<(const std::type_index & value) const
{
return type_idx < value;
}
bool operator==(const Rec & other) const
{
return type_idx == other.type_idx;
}
};
using Records = std::vector<Rec>;
public:
void swap(Self & other) noexcept
{
records.swap(other.records);
}
void clear()
{
records.clear();
}
bool empty() const
{
return records.empty();
}
size_t size() const
{
return records.size();
}
Self clone() const
{
Self result;
result.records.reserve(records.size());
for (const auto & rec : records)
result.records.emplace_back(rec.type_idx, rec.ptr->clone());
return result;
}
void append(Self && other)
{
auto middle_idx = records.size();
std::move(other.records.begin(), other.records.end(), std::back_inserter(records));
std::inplace_merge(records.begin(), records.begin() + middle_idx, records.end());
chassert(isUniqTypes());
}
template <class T>
void add(std::shared_ptr<T> info)
{
static_assert(std::is_base_of_v<ItemBase, T>, "Template parameter must inherit items base class");
return addImpl(std::type_index(typeid(T)), std::move(info));
}
template <class T>
std::shared_ptr<T> get() const
{
static_assert(std::is_base_of_v<ItemBase, T>, "Template parameter must inherit items base class");
auto it = getImpl(std::type_index(typeid(T)));
if (it == records.cend())
return nullptr;
auto cast = std::dynamic_pointer_cast<T>(it->ptr);
chassert(cast);
return cast;
}
template <class T>
std::shared_ptr<T> extract()
{
static_assert(std::is_base_of_v<ItemBase, T>, "Template parameter must inherit items base class");
auto it = getImpl(std::type_index(typeid(T)));
if (it == records.cend())
return nullptr;
auto cast = std::dynamic_pointer_cast<T>(it->ptr);
chassert(cast);
records.erase(it);
return cast;
}
std::string debug() const
{
std::string result;
for (auto & rec : records)
{
result.append(rec.type_idx.name());
result.append(" ");
}
return result;
}
private:
bool isUniqTypes() const
{
auto uniq_it = std::adjacent_find(records.begin(), records.end());
return uniq_it == records.end();
}
void addImpl(std::type_index type_idx, ItemPtr item)
{
auto it = std::lower_bound(records.begin(), records.end(), type_idx);
if (it == records.end())
{
records.emplace_back(type_idx, item);
return;
}
if (it->type_idx == type_idx)
throw Exception(ErrorCodes::LOGICAL_ERROR, "inserted items must be unique by their type, type {} is inserted twice", type_idx.name());
records.emplace(it, type_idx, item);
chassert(isUniqTypes());
}
Records::const_iterator getImpl(std::type_index type_idx) const
{
auto it = std::lower_bound(records.cbegin(), records.cend(), type_idx);
if (it == records.cend())
return records.cend();
if (it->type_idx != type_idx)
return records.cend();
return it;
}
Records records;
};
}
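For illustration, a minimal standalone sketch of the pattern this header implemented: a type-indexed collection that holds at most one item per derived type, kept sorted by std::type_index. The names (ItemBase, Collection, FooInfo, BarInfo) and the simplified error handling are illustrative assumptions, not the ClickHouse classes themselves.

#include <algorithm>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <typeindex>
#include <vector>

// Minimal stand-in: items are kept sorted by std::type_index,
// and at most one instance per derived type is allowed.
struct ItemBase { virtual ~ItemBase() = default; };

class Collection
{
    struct Rec
    {
        std::type_index type_idx;
        std::shared_ptr<ItemBase> ptr;
    };
    std::vector<Rec> records; // kept sorted by type_idx

public:
    template <class T>
    void add(std::shared_ptr<T> item)
    {
        std::type_index idx(typeid(T));
        auto it = std::lower_bound(records.begin(), records.end(), idx,
            [](const Rec & rec, const std::type_index & value) { return rec.type_idx < value; });
        if (it != records.end() && it->type_idx == idx)
            throw std::logic_error("items must be unique by type");
        records.insert(it, Rec{idx, std::move(item)});
    }

    template <class T>
    std::shared_ptr<T> get() const
    {
        std::type_index idx(typeid(T));
        auto it = std::lower_bound(records.begin(), records.end(), idx,
            [](const Rec & rec, const std::type_index & value) { return rec.type_idx < value; });
        if (it == records.end() || it->type_idx != idx)
            return nullptr;
        return std::dynamic_pointer_cast<T>(it->ptr);
    }
};

struct FooInfo : ItemBase { int value = 42; };
struct BarInfo : ItemBase { std::string name = "bar"; };

int main()
{
    Collection c;
    c.add(std::make_shared<FooInfo>());
    c.add(std::make_shared<BarInfo>());
    std::cout << c.get<FooInfo>()->value << ' ' << c.get<BarInfo>()->name << '\n'; // prints: 42 bar
}

Keeping the records sorted by type index means lookup and insertion are both a single binary search, which is why the collection rejects duplicate types at insert time.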


@ -36,7 +36,7 @@ class IColumn;
M(Dialect, dialect, Dialect::clickhouse, "Which dialect will be used to parse query", 0)\ M(Dialect, dialect, Dialect::clickhouse, "Which dialect will be used to parse query", 0)\
M(UInt64, min_compress_block_size, 65536, "The actual size of the block to compress, if the uncompressed data less than max_compress_block_size is no less than this value and no less than the volume of data for one mark.", 0) \ M(UInt64, min_compress_block_size, 65536, "The actual size of the block to compress, if the uncompressed data less than max_compress_block_size is no less than this value and no less than the volume of data for one mark.", 0) \
M(UInt64, max_compress_block_size, 1048576, "The maximum size of blocks of uncompressed data before compressing for writing to a table.", 0) \ M(UInt64, max_compress_block_size, 1048576, "The maximum size of blocks of uncompressed data before compressing for writing to a table.", 0) \
M(UInt64, max_block_size, DEFAULT_BLOCK_SIZE, "Maximum block size in rows for reading", 0) \ M(UInt64, max_block_size, DEFAULT_BLOCK_SIZE, "Maximum block size for reading", 0) \
M(UInt64, max_insert_block_size, DEFAULT_INSERT_BLOCK_SIZE, "The maximum block size for insertion, if we control the creation of blocks for insertion.", 0) \ M(UInt64, max_insert_block_size, DEFAULT_INSERT_BLOCK_SIZE, "The maximum block size for insertion, if we control the creation of blocks for insertion.", 0) \
M(UInt64, min_insert_block_size_rows, DEFAULT_INSERT_BLOCK_SIZE, "Squash blocks passed to INSERT query to specified size in rows, if blocks are not big enough.", 0) \ M(UInt64, min_insert_block_size_rows, DEFAULT_INSERT_BLOCK_SIZE, "Squash blocks passed to INSERT query to specified size in rows, if blocks are not big enough.", 0) \
M(UInt64, min_insert_block_size_bytes, (DEFAULT_INSERT_BLOCK_SIZE * 256), "Squash blocks passed to INSERT query to specified size in bytes, if blocks are not big enough.", 0) \ M(UInt64, min_insert_block_size_bytes, (DEFAULT_INSERT_BLOCK_SIZE * 256), "Squash blocks passed to INSERT query to specified size in bytes, if blocks are not big enough.", 0) \
@ -634,8 +634,9 @@ class IColumn;
M(Bool, optimize_time_filter_with_preimage, true, "Optimize Date and DateTime predicates by converting functions into equivalent comparisons without conversions (e.g. toYear(col) = 2023 -> col >= '2023-01-01' AND col <= '2023-12-31')", 0) \ M(Bool, optimize_time_filter_with_preimage, true, "Optimize Date and DateTime predicates by converting functions into equivalent comparisons without conversions (e.g. toYear(col) = 2023 -> col >= '2023-01-01' AND col <= '2023-12-31')", 0) \
M(Bool, normalize_function_names, true, "Normalize function names to their canonical names", 0) \ M(Bool, normalize_function_names, true, "Normalize function names to their canonical names", 0) \
M(Bool, enable_early_constant_folding, true, "Enable query optimization where we analyze function and subqueries results and rewrite query if there are constants there", 0) \ M(Bool, enable_early_constant_folding, true, "Enable query optimization where we analyze function and subqueries results and rewrite query if there are constants there", 0) \
M(Bool, deduplicate_blocks_in_dependent_materialized_views, false, "Should deduplicate blocks for materialized views. Use true to always deduplicate in dependent tables.", 0) \ M(Bool, deduplicate_blocks_in_dependent_materialized_views, false, "Should deduplicate blocks for materialized views if the block is not a duplicate for the table. Use true to always deduplicate in dependent tables.", 0) \
M(Bool, throw_if_deduplication_in_dependent_materialized_views_enabled_with_async_insert, true, "Throw exception on INSERT query when the setting `deduplicate_blocks_in_dependent_materialized_views` is enabled along with `async_insert`. It guarantees correctness, because these features can't work together.", 0) \ M(Bool, throw_if_deduplication_in_dependent_materialized_views_enabled_with_async_insert, true, "Throw exception on INSERT query when the setting `deduplicate_blocks_in_dependent_materialized_views` is enabled along with `async_insert`. It guarantees correctness, because these features can't work together.", 0) \
M(Bool, update_insert_deduplication_token_in_dependent_materialized_views, false, "Should update insert deduplication token with table identifier during insert in dependent materialized views.", 0) \
M(Bool, materialized_views_ignore_errors, false, "Allows to ignore errors for MATERIALIZED VIEW, and deliver original block to the table regardless of MVs", 0) \ M(Bool, materialized_views_ignore_errors, false, "Allows to ignore errors for MATERIALIZED VIEW, and deliver original block to the table regardless of MVs", 0) \
M(Bool, ignore_materialized_views_with_dropped_target_table, false, "Ignore MVs with dropped target table during pushing to views", 0) \ M(Bool, ignore_materialized_views_with_dropped_target_table, false, "Ignore MVs with dropped target table during pushing to views", 0) \
M(Bool, allow_experimental_refreshable_materialized_view, false, "Allow refreshable materialized views (CREATE MATERIALIZED VIEW <name> REFRESH ...).", 0) \ M(Bool, allow_experimental_refreshable_materialized_view, false, "Allow refreshable materialized views (CREATE MATERIALIZED VIEW <name> REFRESH ...).", 0) \
@ -953,7 +954,6 @@ class IColumn;
#define OBSOLETE_SETTINGS(M, ALIAS) \ #define OBSOLETE_SETTINGS(M, ALIAS) \
/** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. */ \ /** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. */ \
MAKE_OBSOLETE(M, Bool, update_insert_deduplication_token_in_dependent_materialized_views, 1) \
MAKE_OBSOLETE(M, UInt64, max_memory_usage_for_all_queries, 0) \ MAKE_OBSOLETE(M, UInt64, max_memory_usage_for_all_queries, 0) \
MAKE_OBSOLETE(M, UInt64, multiple_joins_rewriter_version, 0) \ MAKE_OBSOLETE(M, UInt64, multiple_joins_rewriter_version, 0) \
MAKE_OBSOLETE(M, Bool, enable_debug_queries, false) \ MAKE_OBSOLETE(M, Bool, enable_debug_queries, false) \


@ -2,9 +2,11 @@
#include <DataTypes/Serializations/SerializationDynamic.h> #include <DataTypes/Serializations/SerializationDynamic.h>
#include <DataTypes/Serializations/SerializationDynamicElement.h> #include <DataTypes/Serializations/SerializationDynamicElement.h>
#include <DataTypes/Serializations/SerializationVariantElement.h> #include <DataTypes/Serializations/SerializationVariantElement.h>
#include <DataTypes/Serializations/SerializationVariantElementNullMap.h>
#include <DataTypes/DataTypeFactory.h> #include <DataTypes/DataTypeFactory.h>
#include <DataTypes/NestedUtils.h> #include <DataTypes/NestedUtils.h>
#include <DataTypes/DataTypeNullable.h> #include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnDynamic.h> #include <Columns/ColumnDynamic.h>
#include <Columns/ColumnVariant.h> #include <Columns/ColumnVariant.h>
#include <Core/Field.h> #include <Core/Field.h>
@ -110,28 +112,58 @@ std::unique_ptr<IDataType::SubstreamData> DataTypeDynamic::getDynamicSubcolumnDa
} }
/// Extract nested subcolumn of requested dynamic subcolumn if needed. /// Extract nested subcolumn of requested dynamic subcolumn if needed.
if (!subcolumn_nested_name.empty()) /// If requested subcolumn is null map, it's processed separately as there is no Nullable type yet.
bool is_null_map_subcolumn = subcolumn_nested_name == "null";
if (is_null_map_subcolumn)
{
res->type = std::make_shared<DataTypeUInt8>();
}
else if (!subcolumn_nested_name.empty())
{ {
res = getSubcolumnData(subcolumn_nested_name, *res, throw_if_null); res = getSubcolumnData(subcolumn_nested_name, *res, throw_if_null);
if (!res) if (!res)
return nullptr; return nullptr;
} }
res->serialization = std::make_shared<SerializationDynamicElement>(res->serialization, subcolumn_type->getName()); res->serialization = std::make_shared<SerializationDynamicElement>(res->serialization, subcolumn_type->getName(), is_null_map_subcolumn);
res->type = makeNullableOrLowCardinalityNullableSafe(res->type); /// Make resulting subcolumn Nullable only if type subcolumn can be inside Nullable or can be LowCardinality(Nullable()).
bool make_subcolumn_nullable = subcolumn_type->canBeInsideNullable() || subcolumn_type->lowCardinality();
if (!is_null_map_subcolumn && make_subcolumn_nullable)
res->type = makeNullableOrLowCardinalityNullableSafe(res->type);
if (data.column) if (data.column)
{ {
if (discriminator) if (discriminator)
{ {
/// Provided Dynamic column has subcolumn of this type, we should use VariantSubcolumnCreator to /// Provided Dynamic column has subcolumn of this type, we should use VariantSubcolumnCreator/VariantNullMapSubcolumnCreator to
/// create full subcolumn from variant according to discriminators. /// create full subcolumn from variant according to discriminators.
const auto & variant_column = assert_cast<const ColumnDynamic &>(*data.column).getVariantColumn(); const auto & variant_column = assert_cast<const ColumnDynamic &>(*data.column).getVariantColumn();
auto creator = SerializationVariantElement::VariantSubcolumnCreator(variant_column.getLocalDiscriminatorsPtr(), "", *discriminator, variant_column.localDiscriminatorByGlobal(*discriminator)); std::unique_ptr<ISerialization::ISubcolumnCreator> creator;
res->column = creator.create(res->column); if (is_null_map_subcolumn)
creator = std::make_unique<SerializationVariantElementNullMap::VariantNullMapSubcolumnCreator>(
variant_column.getLocalDiscriminatorsPtr(),
"",
*discriminator,
variant_column.localDiscriminatorByGlobal(*discriminator));
else
creator = std::make_unique<SerializationVariantElement::VariantSubcolumnCreator>(
variant_column.getLocalDiscriminatorsPtr(),
"",
*discriminator,
variant_column.localDiscriminatorByGlobal(*discriminator),
make_subcolumn_nullable);
res->column = creator->create(res->column);
}
/// Provided Dynamic column doesn't have subcolumn of this type, just create column filled with default values.
else if (is_null_map_subcolumn)
{
/// Fill null map with 1 when there is no such Dynamic subcolumn.
auto column = ColumnUInt8::create();
assert_cast<ColumnUInt8 &>(*column).getData().resize_fill(data.column->size(), 1);
res->column = std::move(column);
} }
else else
{ {
/// Provided Dynamic column doesn't have subcolumn of this type, just create column filled with default values.
auto column = res->type->createColumn(); auto column = res->type->createColumn();
column->insertManyDefaults(data.column->size()); column->insertManyDefaults(data.column->size());
res->column = std::move(column); res->column = std::move(column);


@ -173,7 +173,7 @@ bool IDataType::hasDynamicSubcolumns() const
auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()); auto data = SubstreamData(getDefaultSerialization()).withType(getPtr());
auto callback = [&](const SubstreamPath &, const String &, const SubstreamData & subcolumn_data) auto callback = [&](const SubstreamPath &, const String &, const SubstreamData & subcolumn_data)
{ {
has_dynamic_subcolumns |= subcolumn_data.type->hasDynamicSubcolumnsData(); has_dynamic_subcolumns |= subcolumn_data.type && subcolumn_data.type->hasDynamicSubcolumnsData();
}; };
forEachSubcolumn(callback, data); forEachSubcolumn(callback, data);
return has_dynamic_subcolumns; return has_dynamic_subcolumns;


@ -64,6 +64,9 @@ String ISerialization::Substream::toString() const
if (type == VariantElement) if (type == VariantElement)
return fmt::format("VariantElement({})", variant_element_name); return fmt::format("VariantElement({})", variant_element_name);
if (type == VariantElementNullMap)
return fmt::format("VariantElementNullMap({}.null)", variant_element_name);
return String(magic_enum::enum_name(type)); return String(magic_enum::enum_name(type));
} }
@ -195,6 +198,8 @@ String getNameForSubstreamPath(
stream_name += ".variant_offsets"; stream_name += ".variant_offsets";
else if (it->type == Substream::VariantElement) else if (it->type == Substream::VariantElement)
stream_name += "." + it->variant_element_name; stream_name += "." + it->variant_element_name;
else if (it->type == Substream::VariantElementNullMap)
stream_name += "." + it->variant_element_name + ".null";
else if (it->type == SubstreamType::DynamicStructure) else if (it->type == SubstreamType::DynamicStructure)
stream_name += ".dynamic_structure"; stream_name += ".dynamic_structure";
} }
@ -395,7 +400,8 @@ bool ISerialization::hasSubcolumnForPath(const SubstreamPath & path, size_t pref
return path[last_elem].type == Substream::NullMap return path[last_elem].type == Substream::NullMap
|| path[last_elem].type == Substream::TupleElement || path[last_elem].type == Substream::TupleElement
|| path[last_elem].type == Substream::ArraySizes || path[last_elem].type == Substream::ArraySizes
|| path[last_elem].type == Substream::VariantElement; || path[last_elem].type == Substream::VariantElement
|| path[last_elem].type == Substream::VariantElementNullMap;
} }
ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath & path, size_t prefix_len) ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath & path, size_t prefix_len)


@ -184,6 +184,7 @@ public:
VariantOffsets, VariantOffsets,
VariantElements, VariantElements,
VariantElement, VariantElement,
VariantElementNullMap,
DynamicData, DynamicData,
DynamicStructure, DynamicStructure,
@ -256,6 +257,8 @@ public:
bool position_independent_encoding = true; bool position_independent_encoding = true;
bool use_compact_variant_discriminators_serialization = false;
enum class DynamicStatisticsMode enum class DynamicStatisticsMode
{ {
NONE, /// Don't write statistics. NONE, /// Don't write statistics.
@ -434,6 +437,9 @@ protected:
template <typename State, typename StatePtr> template <typename State, typename StatePtr>
State * checkAndGetState(const StatePtr & state) const; State * checkAndGetState(const StatePtr & state) const;
template <typename State, typename StatePtr>
static State * checkAndGetState(const StatePtr & state, const ISerialization * serialization);
[[noreturn]] void throwUnexpectedDataAfterParsedValue(IColumn & column, ReadBuffer & istr, const FormatSettings &, const String & type_name) const; [[noreturn]] void throwUnexpectedDataAfterParsedValue(IColumn & column, ReadBuffer & istr, const FormatSettings &, const String & type_name) const;
}; };
@ -444,10 +450,16 @@ using SubstreamType = ISerialization::Substream::Type;
template <typename State, typename StatePtr> template <typename State, typename StatePtr>
State * ISerialization::checkAndGetState(const StatePtr & state) const State * ISerialization::checkAndGetState(const StatePtr & state) const
{
return checkAndGetState<State, StatePtr>(state, this);
}
template <typename State, typename StatePtr>
State * ISerialization::checkAndGetState(const StatePtr & state, const ISerialization * serialization)
{ {
if (!state) if (!state)
throw Exception(ErrorCodes::LOGICAL_ERROR, throw Exception(ErrorCodes::LOGICAL_ERROR,
"Got empty state for {}", demangle(typeid(*this).name())); "Got empty state for {}", demangle(typeid(*serialization).name()));
auto * state_concrete = typeid_cast<State *>(state.get()); auto * state_concrete = typeid_cast<State *>(state.get());
if (!state_concrete) if (!state_concrete)
@ -455,7 +467,7 @@ State * ISerialization::checkAndGetState(const StatePtr & state) const
auto & state_ref = *state; auto & state_ref = *state;
throw Exception(ErrorCodes::LOGICAL_ERROR, throw Exception(ErrorCodes::LOGICAL_ERROR,
"Invalid State for {}. Expected: {}, got {}", "Invalid State for {}. Expected: {}, got {}",
demangle(typeid(*this).name()), demangle(typeid(*serialization).name()),
demangle(typeid(State).name()), demangle(typeid(State).name()),
demangle(typeid(state_ref).name())); demangle(typeid(state_ref).name()));
} }


@ -1,5 +1,6 @@
#include <DataTypes/Serializations/SerializationDynamicElement.h> #include <DataTypes/Serializations/SerializationDynamicElement.h>
#include <DataTypes/Serializations/SerializationVariantElement.h> #include <DataTypes/Serializations/SerializationVariantElement.h>
#include <DataTypes/Serializations/SerializationVariantElementNullMap.h>
#include <DataTypes/Serializations/SerializationDynamic.h> #include <DataTypes/Serializations/SerializationDynamic.h>
#include <DataTypes/DataTypeVariant.h> #include <DataTypes/DataTypeVariant.h>
#include <DataTypes/DataTypeFactory.h> #include <DataTypes/DataTypeFactory.h>
@ -77,7 +78,10 @@ void SerializationDynamicElement::deserializeBinaryBulkStatePrefix(
if (auto global_discr = assert_cast<const DataTypeVariant &>(*variant_type).tryGetVariantDiscriminator(dynamic_element_name)) if (auto global_discr = assert_cast<const DataTypeVariant &>(*variant_type).tryGetVariantDiscriminator(dynamic_element_name))
{ {
settings.path.push_back(Substream::DynamicData); settings.path.push_back(Substream::DynamicData);
dynamic_element_state->variant_serialization = std::make_shared<SerializationVariantElement>(nested_serialization, dynamic_element_name, *global_discr); if (is_null_map_subcolumn)
dynamic_element_state->variant_serialization = std::make_shared<SerializationVariantElementNullMap>(dynamic_element_name, *global_discr);
else
dynamic_element_state->variant_serialization = std::make_shared<SerializationVariantElement>(nested_serialization, dynamic_element_name, *global_discr);
dynamic_element_state->variant_serialization->deserializeBinaryBulkStatePrefix(settings, dynamic_element_state->variant_element_state, cache); dynamic_element_state->variant_serialization->deserializeBinaryBulkStatePrefix(settings, dynamic_element_state->variant_element_state, cache);
settings.path.pop_back(); settings.path.pop_back();
} }
@ -98,7 +102,16 @@ void SerializationDynamicElement::deserializeBinaryBulkWithMultipleStreams(
SubstreamsCache * cache) const SubstreamsCache * cache) const
{ {
if (!state) if (!state)
{
if (is_null_map_subcolumn)
{
auto mutable_column = result_column->assumeMutable();
auto & data = assert_cast<ColumnUInt8 &>(*mutable_column).getData();
data.resize_fill(data.size() + limit, 1);
}
return; return;
}
auto * dynamic_element_state = checkAndGetState<DeserializeBinaryBulkStateDynamicElement>(state); auto * dynamic_element_state = checkAndGetState<DeserializeBinaryBulkStateDynamicElement>(state);
@ -108,6 +121,12 @@ void SerializationDynamicElement::deserializeBinaryBulkWithMultipleStreams(
dynamic_element_state->variant_serialization->deserializeBinaryBulkWithMultipleStreams(result_column, limit, settings, dynamic_element_state->variant_element_state, cache); dynamic_element_state->variant_serialization->deserializeBinaryBulkWithMultipleStreams(result_column, limit, settings, dynamic_element_state->variant_element_state, cache);
settings.path.pop_back(); settings.path.pop_back();
} }
else if (is_null_map_subcolumn)
{
auto mutable_column = result_column->assumeMutable();
auto & data = assert_cast<ColumnUInt8 &>(*mutable_column).getData();
data.resize_fill(data.size() + limit, 1);
}
else else
{ {
auto mutable_column = result_column->assumeMutable(); auto mutable_column = result_column->assumeMutable();


@ -13,11 +13,11 @@ private:
/// To be able to deserialize Dynamic element as a subcolumn /// To be able to deserialize Dynamic element as a subcolumn
/// we need its type name and global discriminator. /// we need its type name and global discriminator.
String dynamic_element_name; String dynamic_element_name;
bool is_null_map_subcolumn;
public: public:
SerializationDynamicElement(const SerializationPtr & nested_, const String & dynamic_element_name_) SerializationDynamicElement(const SerializationPtr & nested_, const String & dynamic_element_name_, bool is_null_map_subcolumn_ = false)
: SerializationWrapper(nested_) : SerializationWrapper(nested_), dynamic_element_name(dynamic_element_name_), is_null_map_subcolumn(is_null_map_subcolumn_)
, dynamic_element_name(dynamic_element_name_)
{ {
} }


@ -1,5 +1,6 @@
#include <DataTypes/Serializations/SerializationVariant.h> #include <DataTypes/Serializations/SerializationVariant.h>
#include <DataTypes/Serializations/SerializationVariantElement.h> #include <DataTypes/Serializations/SerializationVariantElement.h>
#include <DataTypes/Serializations/SerializationVariantElementNullMap.h>
#include <DataTypes/Serializations/SerializationNumber.h> #include <DataTypes/Serializations/SerializationNumber.h>
#include <DataTypes/Serializations/SerializationNullable.h> #include <DataTypes/Serializations/SerializationNullable.h>
#include <DataTypes/Serializations/SerializationNamed.h> #include <DataTypes/Serializations/SerializationNamed.h>
@ -30,12 +31,18 @@ namespace ErrorCodes
struct SerializeBinaryBulkStateVariant : public ISerialization::SerializeBinaryBulkState struct SerializeBinaryBulkStateVariant : public ISerialization::SerializeBinaryBulkState
{ {
std::vector<ISerialization::SerializeBinaryBulkStatePtr> states; explicit SerializeBinaryBulkStateVariant(UInt64 mode) : discriminators_mode(mode)
{
}
SerializationVariant::DiscriminatorsSerializationMode discriminators_mode;
std::vector<ISerialization::SerializeBinaryBulkStatePtr> variant_states;
}; };
struct DeserializeBinaryBulkStateVariant : public ISerialization::DeserializeBinaryBulkState struct DeserializeBinaryBulkStateVariant : public ISerialization::DeserializeBinaryBulkState
{ {
std::vector<ISerialization::DeserializeBinaryBulkStatePtr> states; ISerialization::DeserializeBinaryBulkStatePtr discriminators_state;
std::vector<ISerialization::DeserializeBinaryBulkStatePtr> variant_states;
}; };
void SerializationVariant::enumerateStreams( void SerializationVariant::enumerateStreams(
@ -65,13 +72,19 @@ void SerializationVariant::enumerateStreams(
for (size_t i = 0; i < variants.size(); ++i) for (size_t i = 0; i < variants.size(); ++i)
{ {
settings.path.back().creator = std::make_shared<SerializationVariantElement::VariantSubcolumnCreator>(local_discriminators, variant_names[i], i, column_variant ? column_variant->localDiscriminatorByGlobal(i) : i); DataTypePtr type = type_variant ? type_variant->getVariant(i) : nullptr;
settings.path.back().creator = std::make_shared<SerializationVariantElement::VariantSubcolumnCreator>(
local_discriminators,
variant_names[i],
i,
column_variant ? column_variant->localDiscriminatorByGlobal(i) : i,
!type || type->canBeInsideNullable() || type->lowCardinality());
auto variant_data = SubstreamData(variants[i]) auto variant_data = SubstreamData(variants[i])
.withType(type_variant ? type_variant->getVariant(i) : nullptr) .withType(type)
.withColumn(column_variant ? column_variant->getVariantPtrByGlobalDiscriminator(i) : nullptr) .withColumn(column_variant ? column_variant->getVariantPtrByGlobalDiscriminator(i) : nullptr)
.withSerializationInfo(data.serialization_info) .withSerializationInfo(data.serialization_info)
.withDeserializeState(variant_deserialize_state ? variant_deserialize_state->states[i] : nullptr); .withDeserializeState(variant_deserialize_state ? variant_deserialize_state->variant_states[i] : nullptr);
addVariantElementToPath(settings.path, i); addVariantElementToPath(settings.path, i);
settings.path.back().data = variant_data; settings.path.back().data = variant_data;
@ -79,6 +92,24 @@ void SerializationVariant::enumerateStreams(
settings.path.pop_back(); settings.path.pop_back();
} }
    /// Variant subcolumns like variant.Type have type Nullable(Type), so we want to support reading the null map subcolumn from them: variant.Type.null.
    /// The Nullable column is created during deserialization of a variant subcolumn according to the discriminators, so we don't have an actual Nullable
    /// serialization with a null map subcolumn. To be able to read the null map subcolumn from the variant subcolumn we use the special serialization
    /// SerializationVariantElementNullMap.
auto null_map_data = SubstreamData(std::make_shared<SerializationNumber<UInt8>>())
.withType(type_variant ? std::make_shared<DataTypeUInt8>() : nullptr)
.withColumn(column_variant ? ColumnUInt8::create() : nullptr);
for (size_t i = 0; i < variants.size(); ++i)
{
settings.path.back().creator = std::make_shared<SerializationVariantElementNullMap::VariantNullMapSubcolumnCreator>(local_discriminators, variant_names[i], i, column_variant ? column_variant->localDiscriminatorByGlobal(i) : i);
settings.path.push_back(Substream::VariantElementNullMap);
settings.path.back().variant_element_name = variant_names[i];
settings.path.back().data = null_map_data;
callback(settings.path);
settings.path.pop_back();
}
settings.path.pop_back(); settings.path.pop_back();
} }
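As a rough illustration of what the null map subcolumn creator above produces, here is a hypothetical standalone sketch, not the actual ColumnVariant code; the sentinel value 255 for the NULL discriminator and the flat vector layout are assumptions. The idea is that the ".null" subcolumn of a variant subcolumn is 1 exactly where the row's discriminator differs from that variant's discriminator.

#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    // Assumption: 255 plays the role of the NULL discriminator sentinel.
    const uint8_t NULL_DISCRIMINATOR = 255;
    // Per-row discriminators of a Variant column (which variant each row holds).
    std::vector<uint8_t> discriminators{0, 1, NULL_DISCRIMINATOR, 0, 1};
    // Build the ".null" subcolumn for the variant with discriminator 0:
    // a row is NULL for this subcolumn whenever it holds a different variant or NULL.
    uint8_t wanted = 0;

    std::vector<uint8_t> null_map;
    null_map.reserve(discriminators.size());
    for (uint8_t d : discriminators)
        null_map.push_back(d == wanted ? 0 : 1);

    for (uint8_t b : null_map)
        std::cout << int(b) << ' '; // prints: 0 1 1 0 1
    std::cout << '\n';
}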
@ -87,17 +118,26 @@ void SerializationVariant::serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings, SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const SerializeBinaryBulkStatePtr & state) const
{ {
const ColumnVariant & col = assert_cast<const ColumnVariant &>(column); settings.path.push_back(Substream::VariantDiscriminators);
auto * discriminators_stream = settings.getter(settings.path);
settings.path.pop_back();
auto variant_state = std::make_shared<SerializeBinaryBulkStateVariant>(); if (!discriminators_stream)
variant_state->states.resize(variants.size()); throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for VariantDiscriminators in SerializationVariant::serializeBinaryBulkStatePrefix");
UInt64 mode = settings.use_compact_variant_discriminators_serialization ? DiscriminatorsSerializationMode::COMPACT : DiscriminatorsSerializationMode::BASIC;
writeBinaryLittleEndian(mode, *discriminators_stream);
const ColumnVariant & col = assert_cast<const ColumnVariant &>(column);
auto variant_state = std::make_shared<SerializeBinaryBulkStateVariant>(mode);
variant_state->variant_states.resize(variants.size());
settings.path.push_back(Substream::VariantElements); settings.path.push_back(Substream::VariantElements);
for (size_t i = 0; i < variants.size(); ++i) for (size_t i = 0; i < variants.size(); ++i)
{ {
addVariantElementToPath(settings.path, i); addVariantElementToPath(settings.path, i);
variants[i]->serializeBinaryBulkStatePrefix(col.getVariantByGlobalDiscriminator(i), settings, variant_state->states[i]); variants[i]->serializeBinaryBulkStatePrefix(col.getVariantByGlobalDiscriminator(i), settings, variant_state->variant_states[i]);
settings.path.pop_back(); settings.path.pop_back();
} }
@ -116,7 +156,7 @@ void SerializationVariant::serializeBinaryBulkStateSuffix(
for (size_t i = 0; i < variants.size(); ++i) for (size_t i = 0; i < variants.size(); ++i)
{ {
addVariantElementToPath(settings.path, i); addVariantElementToPath(settings.path, i);
variants[i]->serializeBinaryBulkStateSuffix(settings, variant_state->states[i]); variants[i]->serializeBinaryBulkStateSuffix(settings, variant_state->variant_states[i]);
settings.path.pop_back(); settings.path.pop_back();
} }
settings.path.pop_back(); settings.path.pop_back();
@ -128,14 +168,19 @@ void SerializationVariant::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkStatePtr & state, DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const SubstreamsDeserializeStatesCache * cache) const
{ {
DeserializeBinaryBulkStatePtr discriminators_state = deserializeDiscriminatorsStatePrefix(settings, cache);
if (!discriminators_state)
return;
auto variant_state = std::make_shared<DeserializeBinaryBulkStateVariant>(); auto variant_state = std::make_shared<DeserializeBinaryBulkStateVariant>();
variant_state->states.resize(variants.size()); variant_state->discriminators_state = discriminators_state;
variant_state->variant_states.resize(variants.size());
settings.path.push_back(Substream::VariantElements); settings.path.push_back(Substream::VariantElements);
for (size_t i = 0; i < variants.size(); ++i) for (size_t i = 0; i < variants.size(); ++i)
{ {
addVariantElementToPath(settings.path, i); addVariantElementToPath(settings.path, i);
variants[i]->deserializeBinaryBulkStatePrefix(settings, variant_state->states[i], cache); variants[i]->deserializeBinaryBulkStatePrefix(settings, variant_state->variant_states[i], cache);
settings.path.pop_back(); settings.path.pop_back();
} }
@ -143,6 +188,29 @@ void SerializationVariant::deserializeBinaryBulkStatePrefix(
state = std::move(variant_state); state = std::move(variant_state);
} }
ISerialization::DeserializeBinaryBulkStatePtr SerializationVariant::deserializeDiscriminatorsStatePrefix(
DeserializeBinaryBulkSettings & settings,
SubstreamsDeserializeStatesCache * cache)
{
settings.path.push_back(Substream::VariantDiscriminators);
DeserializeBinaryBulkStatePtr discriminators_state = nullptr;
if (auto cached_state = getFromSubstreamsDeserializeStatesCache(cache, settings.path))
{
discriminators_state = cached_state;
}
else if (auto * discriminators_stream = settings.getter(settings.path))
{
UInt64 mode;
readBinaryLittleEndian(mode, *discriminators_stream);
discriminators_state = std::make_shared<DeserializeBinaryBulkStateVariantDiscriminators>(mode);
addToSubstreamsDeserializeStatesCache(cache, settings.path, discriminators_state);
}
settings.path.pop_back();
return discriminators_state;
}
void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics( void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVariantStatistics(
const IColumn & column, const IColumn & column,
@ -165,13 +233,71 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian
auto * variant_state = checkAndGetState<SerializeBinaryBulkStateVariant>(state); auto * variant_state = checkAndGetState<SerializeBinaryBulkStateVariant>(state);
/// If offset = 0 and limit == col.size() or we have only NULLs, we don't need to calculate /// Don't write anything if column is empty.
if (limit == 0)
return;
/// Write number of rows in this granule in compact mode.
if (variant_state->discriminators_mode.value == DiscriminatorsSerializationMode::COMPACT)
writeVarUInt(UInt64(limit), *discriminators_stream);
    /// If the column has only one non-empty discriminator and no NULLs, we don't need to
    /// calculate limits for variants and can use the provided offset/limit directly.
if (auto non_empty_local_discr = col.getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls())
{
auto non_empty_global_discr = col.globalDiscriminatorByLocal(*non_empty_local_discr);
/// In compact mode write the format of the granule and single non-empty discriminator.
if (variant_state->discriminators_mode.value == DiscriminatorsSerializationMode::COMPACT)
{
writeBinaryLittleEndian(UInt8(CompactDiscriminatorsGranuleFormat::COMPACT), *discriminators_stream);
writeBinaryLittleEndian(non_empty_global_discr, *discriminators_stream);
}
/// For basic mode just serialize this discriminator limit times.
else
{
for (size_t i = 0; i < limit; ++i)
writeBinaryLittleEndian(non_empty_global_discr, *discriminators_stream);
}
settings.path.push_back(Substream::VariantElements);
addVariantElementToPath(settings.path, non_empty_global_discr);
/// We can use the same offset/limit as for whole Variant column
variants[non_empty_global_discr]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(non_empty_global_discr), offset, limit, settings, variant_state->variant_states[non_empty_global_discr]);
variants_statistics[variant_names[non_empty_global_discr]] += limit;
settings.path.pop_back();
settings.path.pop_back();
return;
}
/// If column has only NULLs, just serialize NULL discriminators.
else if (col.hasOnlyNulls())
{
/// In compact mode write single NULL_DISCRIMINATOR.
if (variant_state->discriminators_mode.value == DiscriminatorsSerializationMode::COMPACT)
{
writeBinaryLittleEndian(UInt8(CompactDiscriminatorsGranuleFormat::COMPACT), *discriminators_stream);
writeBinaryLittleEndian(ColumnVariant::NULL_DISCRIMINATOR, *discriminators_stream);
}
/// In basic mode write NULL_DISCRIMINATOR limit times.
else
{
for (size_t i = 0; i < limit; ++i)
writeBinaryLittleEndian(ColumnVariant::NULL_DISCRIMINATOR, *discriminators_stream);
}
return;
}
/// If offset = 0 and limit == col.size() we don't need to calculate
/// offsets and limits for variants and need to just serialize whole columns. /// offsets and limits for variants and need to just serialize whole columns.
if ((offset == 0 && limit == col.size()) || col.hasOnlyNulls()) if ((offset == 0 && limit == col.size()))
{ {
/// First, serialize discriminators. /// First, serialize discriminators.
/// If we have only NULLs or local and global discriminators are the same, just serialize the column as is. /// Here we are sure that column contains different discriminators, use plain granule format in compact mode.
if (col.hasOnlyNulls() || col.hasGlobalVariantsOrder()) if (variant_state->discriminators_mode.value == DiscriminatorsSerializationMode::COMPACT)
writeBinaryLittleEndian(UInt8(CompactDiscriminatorsGranuleFormat::PLAIN), *discriminators_stream);
/// If local and global discriminators are the same, just serialize the column as is.
if (col.hasGlobalVariantsOrder())
{ {
SerializationNumber<ColumnVariant::Discriminator>().serializeBinaryBulk(col.getLocalDiscriminatorsColumn(), *discriminators_stream, offset, limit); SerializationNumber<ColumnVariant::Discriminator>().serializeBinaryBulk(col.getLocalDiscriminatorsColumn(), *discriminators_stream, offset, limit);
} }
@ -188,7 +314,7 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian
for (size_t i = 0; i != variants.size(); ++i) for (size_t i = 0; i != variants.size(); ++i)
{ {
addVariantElementToPath(settings.path, i); addVariantElementToPath(settings.path, i);
variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), 0, 0, settings, variant_state->states[i]); variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), 0, 0, settings, variant_state->variant_states[i]);
variants_statistics[variant_names[i]] += col.getVariantByGlobalDiscriminator(i).size(); variants_statistics[variant_names[i]] += col.getVariantByGlobalDiscriminator(i).size();
settings.path.pop_back(); settings.path.pop_back();
} }
@ -196,36 +322,16 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian
return; return;
} }
/// If we have only one non empty variant and no NULLs, we can use the same limit offset for this variant.
if (auto non_empty_local_discr = col.getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls())
{
/// First, serialize discriminators.
/// We know that all discriminators are the same, so we just need to serialize this discriminator limit times.
auto non_empty_global_discr = col.globalDiscriminatorByLocal(*non_empty_local_discr);
for (size_t i = 0; i != limit; ++i)
writeBinaryLittleEndian(non_empty_global_discr, *discriminators_stream);
/// Second, serialize non-empty variant (other variants are empty and we can skip their serialization).
settings.path.push_back(Substream::VariantElements);
addVariantElementToPath(settings.path, non_empty_global_discr);
/// We can use the same offset/limit as for whole Variant column
variants[non_empty_global_discr]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(non_empty_global_discr), offset, limit, settings, variant_state->states[non_empty_global_discr]);
variants_statistics[variant_names[non_empty_global_discr]] += limit;
settings.path.pop_back();
settings.path.pop_back();
return;
}
/// In general case we should iterate through local discriminators in range [offset, offset + limit] to serialize global discriminators and calculate offset/limit pair for each variant. /// In general case we should iterate through local discriminators in range [offset, offset + limit] to serialize global discriminators and calculate offset/limit pair for each variant.
const auto & local_discriminators = col.getLocalDiscriminators(); const auto & local_discriminators = col.getLocalDiscriminators();
const auto & offsets = col.getOffsets(); const auto & offsets = col.getOffsets();
std::vector<std::pair<size_t, size_t>> variant_offsets_and_limits(variants.size(), {0, 0}); std::vector<std::pair<size_t, size_t>> variant_offsets_and_limits(variants.size(), {0, 0});
size_t end = offset + limit; size_t end = offset + limit;
size_t num_non_empty_variants_in_range = 0;
ColumnVariant::Discriminator last_non_empty_variant_discr = 0;
for (size_t i = offset; i < end; ++i) for (size_t i = offset; i < end; ++i)
{ {
auto global_discr = col.globalDiscriminatorByLocal(local_discriminators[i]); auto global_discr = col.globalDiscriminatorByLocal(local_discriminators[i]);
writeBinaryLittleEndian(global_discr, *discriminators_stream);
if (global_discr != ColumnVariant::NULL_DISCRIMINATOR) if (global_discr != ColumnVariant::NULL_DISCRIMINATOR)
{ {
/// If we see this discriminator for the first time, update offset /// If we see this discriminator for the first time, update offset
@ -233,9 +339,38 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian
variant_offsets_and_limits[global_discr].first = offsets[i]; variant_offsets_and_limits[global_discr].first = offsets[i];
/// Update limit for this discriminator. /// Update limit for this discriminator.
++variant_offsets_and_limits[global_discr].second; ++variant_offsets_and_limits[global_discr].second;
++num_non_empty_variants_in_range;
last_non_empty_variant_discr = global_discr;
} }
} }
/// In basic mode just serialize discriminators as is row by row.
if (variant_state->discriminators_mode.value == DiscriminatorsSerializationMode::BASIC)
{
for (size_t i = offset; i < end; ++i)
writeBinaryLittleEndian(col.globalDiscriminatorByLocal(local_discriminators[i]), *discriminators_stream);
}
/// In compact mode check if we have the same discriminator for all rows in this granule.
/// First, check if all values in granule are NULLs.
else if (num_non_empty_variants_in_range == 0)
{
writeBinaryLittleEndian(UInt8(CompactDiscriminatorsGranuleFormat::COMPACT), *discriminators_stream);
writeBinaryLittleEndian(ColumnVariant::NULL_DISCRIMINATOR, *discriminators_stream);
}
/// Then, check if there is only 1 variant and no NULLs in this granule.
else if (num_non_empty_variants_in_range == 1 && variant_offsets_and_limits[last_non_empty_variant_discr].second == limit)
{
writeBinaryLittleEndian(UInt8(CompactDiscriminatorsGranuleFormat::COMPACT), *discriminators_stream);
writeBinaryLittleEndian(last_non_empty_variant_discr, *discriminators_stream);
}
/// Otherwise there are different discriminators in this granule.
else
{
writeBinaryLittleEndian(UInt8(CompactDiscriminatorsGranuleFormat::PLAIN), *discriminators_stream);
for (size_t i = offset; i < end; ++i)
writeBinaryLittleEndian(col.globalDiscriminatorByLocal(local_discriminators[i]), *discriminators_stream);
}
/// Serialize variants in global order. /// Serialize variants in global order.
settings.path.push_back(Substream::VariantElements); settings.path.push_back(Substream::VariantElements);
for (size_t i = 0; i != variants.size(); ++i) for (size_t i = 0; i != variants.size(); ++i)
@ -249,7 +384,7 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreamsAndUpdateVarian
variant_offsets_and_limits[i].first, variant_offsets_and_limits[i].first,
variant_offsets_and_limits[i].second, variant_offsets_and_limits[i].second,
settings, settings,
variant_state->states[i]); variant_state->variant_states[i]);
variants_statistics[variant_names[i]] += variant_offsets_and_limits[i].second; variants_statistics[variant_names[i]] += variant_offsets_and_limits[i].second;
settings.path.pop_back(); settings.path.pop_back();
} }
@ -284,39 +419,68 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams(
/// First, deserialize discriminators. /// First, deserialize discriminators.
settings.path.push_back(Substream::VariantDiscriminators); settings.path.push_back(Substream::VariantDiscriminators);
DeserializeBinaryBulkStateVariant * variant_state = nullptr;
std::vector<size_t> variant_limits;
if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path)) if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path))
{ {
variant_state = checkAndGetState<DeserializeBinaryBulkStateVariant>(state);
col.getLocalDiscriminatorsPtr() = cached_discriminators; col.getLocalDiscriminatorsPtr() = cached_discriminators;
} }
else else if (auto * discriminators_stream = settings.getter(settings.path))
{ {
auto * discriminators_stream = settings.getter(settings.path); variant_state = checkAndGetState<DeserializeBinaryBulkStateVariant>(state);
if (!discriminators_stream) auto * discriminators_state = checkAndGetState<DeserializeBinaryBulkStateVariantDiscriminators>(variant_state->discriminators_state);
return;
/// Deserialize discriminators according to serialization mode.
if (discriminators_state->mode.value == DiscriminatorsSerializationMode::BASIC)
SerializationNumber<ColumnVariant::Discriminator>().deserializeBinaryBulk(*col.getLocalDiscriminatorsPtr()->assumeMutable(), *discriminators_stream, limit, 0);
else
variant_limits = deserializeCompactDiscriminators(col.getLocalDiscriminatorsPtr(), limit, discriminators_stream, settings.continuous_reading, *discriminators_state);
SerializationNumber<ColumnVariant::Discriminator>().deserializeBinaryBulk(*col.getLocalDiscriminatorsPtr()->assumeMutable(), *discriminators_stream, limit, 0);
addToSubstreamsCache(cache, settings.path, col.getLocalDiscriminatorsPtr()); addToSubstreamsCache(cache, settings.path, col.getLocalDiscriminatorsPtr());
} }
/// It may happen that there is no such stream, in this case just do nothing.
else
{
settings.path.pop_back();
return;
}
settings.path.pop_back(); settings.path.pop_back();
/// Second, calculate limits for each variant by iterating through new discriminators. /// Second, calculate limits for each variant by iterating through new discriminators
std::vector<size_t> variant_limits(variants.size(), 0); /// if we didn't do it during discriminators deserialization.
auto & discriminators_data = col.getLocalDiscriminators(); if (variant_limits.empty())
size_t discriminators_offset = discriminators_data.size() - limit;
for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i)
{ {
ColumnVariant::Discriminator discr = discriminators_data[i]; variant_limits.resize(variants.size(), 0);
if (discr != ColumnVariant::NULL_DISCRIMINATOR) auto & discriminators_data = col.getLocalDiscriminators();
++variant_limits[discr];
        /// We may actually read fewer than `limit` discriminators, and we cannot determine the actual number of read rows
        /// from the discriminators column because it could have been taken from the substreams cache. We need the actual number of read
        /// rows to fill offsets correctly later if they are not in the cache. We can determine whether the offsets column is in the cache
        /// by comparing its size with the discriminators column size (they should be the same when offsets are in the cache).
        /// If offsets are not in the cache, we can use their size to determine the actual number of read rows.
size_t num_new_discriminators = limit;
size_t offsets_size = col.getOffsetsPtr()->size();
if (discriminators_data.size() > offsets_size)
num_new_discriminators = discriminators_data.size() - offsets_size;
size_t discriminators_offset = discriminators_data.size() - num_new_discriminators;
for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i)
{
ColumnVariant::Discriminator discr = discriminators_data[i];
if (discr != ColumnVariant::NULL_DISCRIMINATOR)
++variant_limits[discr];
}
} }
/// Now we can deserialize variants according to their limits. /// Now we can deserialize variants according to their limits.
auto * variant_state = checkAndGetState<DeserializeBinaryBulkStateVariant>(state);
settings.path.push_back(Substream::VariantElements); settings.path.push_back(Substream::VariantElements);
for (size_t i = 0; i != variants.size(); ++i) for (size_t i = 0; i != variants.size(); ++i)
{ {
addVariantElementToPath(settings.path, i); addVariantElementToPath(settings.path, i);
variants[i]->deserializeBinaryBulkWithMultipleStreams(col.getVariantPtrByLocalDiscriminator(i), variant_limits[i], settings, variant_state->states[i], cache); variants[i]->deserializeBinaryBulkWithMultipleStreams(col.getVariantPtrByLocalDiscriminator(i), variant_limits[i], settings, variant_state->variant_states[i], cache);
settings.path.pop_back(); settings.path.pop_back();
} }
settings.path.pop_back(); settings.path.pop_back();
@ -336,20 +500,49 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams(
} }
else else
{ {
auto & offsets = col.getOffsets();
offsets.reserve(offsets.size() + limit);
std::vector<size_t> variant_offsets; std::vector<size_t> variant_offsets;
variant_offsets.reserve(variants.size()); variant_offsets.reserve(variants.size());
size_t num_non_empty_variants = 0;
ColumnVariant::Discriminator last_non_empty_discr = 0;
for (size_t i = 0; i != variants.size(); ++i) for (size_t i = 0; i != variants.size(); ++i)
variant_offsets.push_back(col.getVariantByLocalDiscriminator(i).size() - variant_limits[i]);
for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i)
{ {
ColumnVariant::Discriminator discr = discriminators_data[i]; if (variant_limits[i])
if (discr == ColumnVariant::NULL_DISCRIMINATOR) {
offsets.emplace_back(); ++num_non_empty_variants;
else last_non_empty_discr = i;
offsets.push_back(variant_offsets[discr]++); }
variant_offsets.push_back(col.getVariantByLocalDiscriminator(i).size() - variant_limits[i]);
}
auto & discriminators_data = col.getLocalDiscriminators();
auto & offsets = col.getOffsets();
size_t num_new_offsets = discriminators_data.size() - offsets.size();
offsets.reserve(offsets.size() + num_new_offsets);
        /// If only NULLs were read, fill offsets with 0.
if (num_non_empty_variants == 0)
{
offsets.resize_fill(discriminators_data.size(), 0);
}
        /// If only one variant and no NULLs were read, fill offsets with sequential offsets of this variant.
else if (num_non_empty_variants == 1 && variant_limits[last_non_empty_discr] == num_new_offsets)
{
size_t first_offset = col.getVariantByLocalDiscriminator(last_non_empty_discr).size() - num_new_offsets;
for (size_t i = 0; i != num_new_offsets; ++i)
offsets.push_back(first_offset + i);
}
/// Otherwise iterate through discriminators and fill offsets accordingly.
else
{
size_t start = offsets.size();
for (size_t i = start; i != discriminators_data.size(); ++i)
{
ColumnVariant::Discriminator discr = discriminators_data[i];
if (discr == ColumnVariant::NULL_DISCRIMINATOR)
offsets.emplace_back();
else
offsets.push_back(variant_offsets[discr]++);
}
} }
addToSubstreamsCache(cache, settings.path, col.getOffsetsPtr()); addToSubstreamsCache(cache, settings.path, col.getOffsetsPtr());
@ -357,6 +550,72 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams(
settings.path.pop_back(); settings.path.pop_back();
} }
std::vector<size_t> SerializationVariant::deserializeCompactDiscriminators(
DB::ColumnPtr & discriminators_column,
size_t limit,
ReadBuffer * stream,
bool continuous_reading,
DeserializeBinaryBulkStateVariantDiscriminators & state) const
{
auto & discriminators = assert_cast<ColumnVariant::ColumnDiscriminators &>(*discriminators_column->assumeMutable());
auto & discriminators_data = discriminators.getData();
/// Reset state if we are reading from the start of the granule and not from the previous position in the file.
if (!continuous_reading)
state.remaining_rows_in_granule = 0;
/// Calculate limits for variants during discriminators deserialization.
std::vector<size_t> variant_limits(variants.size(), 0);
while (limit)
{
/// If we read all rows from current granule, start reading the next one.
if (state.remaining_rows_in_granule == 0)
{
if (stream->eof())
return variant_limits;
readDiscriminatorsGranuleStart(state, stream);
}
size_t limit_in_granule = std::min(limit, state.remaining_rows_in_granule);
if (state.granule_format == CompactDiscriminatorsGranuleFormat::COMPACT)
{
auto & data = discriminators.getData();
data.resize_fill(data.size() + limit_in_granule, state.compact_discr);
if (state.compact_discr != ColumnVariant::NULL_DISCRIMINATOR)
variant_limits[state.compact_discr] += limit_in_granule;
}
else
{
SerializationNumber<ColumnVariant::Discriminator>().deserializeBinaryBulk(discriminators, *stream, limit_in_granule, 0);
size_t start = discriminators_data.size() - limit_in_granule;
for (size_t i = start; i != discriminators_data.size(); ++i)
{
ColumnVariant::Discriminator discr = discriminators_data[i];
if (discr != ColumnVariant::NULL_DISCRIMINATOR)
++variant_limits[discr];
}
}
state.remaining_rows_in_granule -= limit_in_granule;
limit -= limit_in_granule;
}
return variant_limits;
}
void SerializationVariant::readDiscriminatorsGranuleStart(DeserializeBinaryBulkStateVariantDiscriminators & state, DB::ReadBuffer * stream)
{
UInt64 granule_size;
readVarUInt(granule_size, *stream);
state.remaining_rows_in_granule = granule_size;
UInt8 granule_format;
readBinaryLittleEndian(granule_format, *stream);
state.granule_format = static_cast<CompactDiscriminatorsGranuleFormat>(granule_format);
if (granule_format == CompactDiscriminatorsGranuleFormat::COMPACT)
readBinaryLittleEndian(state.compact_discr, *stream);
}
void SerializationVariant::addVariantElementToPath(DB::ISerialization::SubstreamPath & path, size_t i) const void SerializationVariant::addVariantElementToPath(DB::ISerialization::SubstreamPath & path, size_t i) const
{ {
path.push_back(Substream::VariantElement); path.push_back(Substream::VariantElement);


@ -2,10 +2,18 @@
#include <DataTypes/Serializations/ISerialization.h> #include <DataTypes/Serializations/ISerialization.h>
#include <DataTypes/Serializations/SerializationVariantElement.h> #include <DataTypes/Serializations/SerializationVariantElement.h>
#include <DataTypes/Serializations/SerializationVariantElementNullMap.h>
namespace DB namespace DB
{ {
namespace ErrorCodes
{
extern const int INCORRECT_DATA;
}
/// Class for serializing/deserializing column with Variant type. /// Class for serializing/deserializing column with Variant type.
/// It supports both text and binary bulk serializations/deserializations. /// It supports both text and binary bulk serializations/deserializations.
/// ///
@ -18,6 +26,17 @@ namespace DB
/// ///
/// During binary bulk serialization it transforms local discriminators /// During binary bulk serialization it transforms local discriminators
/// to global and serializes them into a separate stream VariantDiscriminators. /// to global and serializes them into a separate stream VariantDiscriminators.
/// There are 2 modes of serializing discriminators:
/// Basic mode, when all discriminators are serialized as is, row by row.
/// Compact mode, when we avoid writing the same discriminators for granules that contain
/// only one variant (or only NULLs).
/// In compact mode we serialize granules in the following format:
/// <number of rows in granule><granule format><granule data>
/// There are 2 different granule formats - plain and compact.
/// Plain format is used when a granule contains different discriminators;
/// in this format all discriminators are serialized as is, row by row.
/// Compact format is used when all discriminators in a granule are the same;
/// in this case only that single discriminator is serialized.
/// Each variant is serialized into a separate stream with path VariantElements/VariantElement /// Each variant is serialized into a separate stream with path VariantElements/VariantElement
/// (VariantElements stream is needed for correct sub-columns creation). We store and serialize /// (VariantElements stream is needed for correct sub-columns creation). We store and serialize
/// variants in a sparse form (the size of a variant column equals to the number of its discriminator /// variants in a sparse form (the size of a variant column equals to the number of its discriminator
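A hypothetical standalone sketch of reading a discriminators stream laid out as described in the comment above (<rows><format><data> per granule). The varint decoder and the 255 value used for the NULL discriminator are simplifying assumptions for illustration, not the exact on-disk encoding; the PLAIN/COMPACT byte values mirror CompactDiscriminatorsGranuleFormat below.

#include <cstdint>
#include <iostream>
#include <vector>

// Decode an unsigned LEB128-style varint (assumption: this matches the rows-count encoding).
static uint64_t readVarUInt(const std::vector<uint8_t> & buf, size_t & pos)
{
    uint64_t value = 0;
    for (int shift = 0; ; shift += 7)
    {
        uint8_t byte = buf[pos++];
        value |= uint64_t(byte & 0x7F) << shift;
        if (!(byte & 0x80))
            return value;
    }
}

int main()
{
    enum GranuleFormat : uint8_t { PLAIN = 0, COMPACT = 1 };

    // One COMPACT granule: 4 rows, all with discriminator 2,
    // followed by one PLAIN granule: 3 rows with explicit discriminators 0, 1, 255.
    std::vector<uint8_t> stream{4, COMPACT, 2, 3, PLAIN, 0, 1, 255};

    size_t pos = 0;
    std::vector<uint8_t> discriminators;
    while (pos < stream.size())
    {
        uint64_t rows = readVarUInt(stream, pos);
        uint8_t format = stream[pos++];
        if (format == COMPACT)
        {
            // Single discriminator repeated for the whole granule.
            uint8_t discr = stream[pos++];
            discriminators.insert(discriminators.end(), rows, discr);
        }
        else
        {
            // Plain granule: one discriminator per row.
            for (uint64_t i = 0; i < rows; ++i)
                discriminators.push_back(stream[pos++]);
        }
    }

    for (uint8_t d : discriminators)
        std::cout << int(d) << ' '; // prints: 2 2 2 2 0 1 255
    std::cout << '\n';
}

The compact granule costs a few bytes regardless of its row count, which is what makes the mode pay off for columns dominated by a single variant or by NULLs.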
@ -32,6 +51,25 @@ namespace DB
class SerializationVariant : public ISerialization class SerializationVariant : public ISerialization
{ {
public: public:
struct DiscriminatorsSerializationMode
{
enum Value
{
BASIC = 0, /// Store the whole discriminators column.
COMPACT = 1, /// Don't write discriminators in granule if all of them are the same.
};
static void checkMode(UInt64 mode)
{
if (mode > Value::COMPACT)
throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid version for SerializationVariant discriminators column.");
}
explicit DiscriminatorsSerializationMode(UInt64 mode) : value(static_cast<Value>(mode)) { checkMode(mode); }
Value value;
};
using VariantSerializations = std::vector<SerializationPtr>; using VariantSerializations = std::vector<SerializationPtr>;
explicit SerializationVariant( explicit SerializationVariant(
@ -123,8 +161,44 @@ public:
static std::vector<size_t> getVariantsDeserializeTextOrder(const DataTypes & variant_types); static std::vector<size_t> getVariantsDeserializeTextOrder(const DataTypes & variant_types);
private: private:
friend SerializationVariantElement;
friend SerializationVariantElementNullMap;
void addVariantElementToPath(SubstreamPath & path, size_t i) const; void addVariantElementToPath(SubstreamPath & path, size_t i) const;
enum CompactDiscriminatorsGranuleFormat
{
PLAIN = 0, /// Granule has different discriminators and they are serialized as is row by row.
COMPACT = 1, /// Granule has single discriminator for all rows and it is serialized as single value.
};
struct DeserializeBinaryBulkStateVariantDiscriminators : public ISerialization::DeserializeBinaryBulkState
{
explicit DeserializeBinaryBulkStateVariantDiscriminators(UInt64 mode_) : mode(mode_)
{
}
DiscriminatorsSerializationMode mode;
/// Deserialize state of currently read granule in compact mode.
CompactDiscriminatorsGranuleFormat granule_format = CompactDiscriminatorsGranuleFormat::PLAIN;
size_t remaining_rows_in_granule = 0;
ColumnVariant::Discriminator compact_discr = 0;
};
static DeserializeBinaryBulkStatePtr deserializeDiscriminatorsStatePrefix(
DeserializeBinaryBulkSettings & settings,
SubstreamsDeserializeStatesCache * cache);
std::vector<size_t> deserializeCompactDiscriminators(
ColumnPtr & discriminators_column,
size_t limit,
ReadBuffer * stream,
bool continuous_reading,
DeserializeBinaryBulkStateVariantDiscriminators & state) const;
static void readDiscriminatorsGranuleStart(DeserializeBinaryBulkStateVariantDiscriminators & state, ReadBuffer * stream);
bool tryDeserializeTextEscapedImpl(IColumn & column, const String & field, const FormatSettings & settings) const; bool tryDeserializeTextEscapedImpl(IColumn & column, const String & field, const FormatSettings & settings) const;
bool tryDeserializeTextQuotedImpl(IColumn & column, const String & field, const FormatSettings & settings) const; bool tryDeserializeTextQuotedImpl(IColumn & column, const String & field, const FormatSettings & settings) const;
bool tryDeserializeWholeTextImpl(IColumn & column, const String & field, const FormatSettings & settings) const; bool tryDeserializeWholeTextImpl(IColumn & column, const String & field, const FormatSettings & settings) const;


@ -1,5 +1,6 @@
#include <DataTypes/Serializations/SerializationVariantElement.h> #include <DataTypes/Serializations/SerializationVariantElement.h>
#include <DataTypes/Serializations/SerializationNumber.h> #include <DataTypes/Serializations/SerializationNumber.h>
#include <DataTypes/Serializations/SerializationVariant.h>
#include <Columns/ColumnLowCardinality.h> #include <Columns/ColumnLowCardinality.h>
#include <Columns/ColumnNullable.h> #include <Columns/ColumnNullable.h>
#include <IO/ReadHelpers.h> #include <IO/ReadHelpers.h>
@ -12,7 +13,7 @@ namespace ErrorCodes
extern const int NOT_IMPLEMENTED; extern const int NOT_IMPLEMENTED;
} }
struct DeserializeBinaryBulkStateVariantElement : public ISerialization::DeserializeBinaryBulkState struct SerializationVariantElement::DeserializeBinaryBulkStateVariantElement : public ISerialization::DeserializeBinaryBulkState
{ {
/// During deserialization discriminators and variant streams can be shared. /// During deserialization discriminators and variant streams can be shared.
/// For example we can read several variant elements together: "select v.UInt32, v.String from table", /// For example we can read several variant elements together: "select v.UInt32, v.String from table",
@ -24,7 +25,7 @@ struct DeserializeBinaryBulkStateVariantElement : public ISerialization::Deseria
/// substream cache correctly. /// substream cache correctly.
ColumnPtr discriminators; ColumnPtr discriminators;
ColumnPtr variant; ColumnPtr variant;
ISerialization::DeserializeBinaryBulkStatePtr discriminators_state;
ISerialization::DeserializeBinaryBulkStatePtr variant_element_state; ISerialization::DeserializeBinaryBulkStatePtr variant_element_state;
}; };
@ -65,7 +66,12 @@ void SerializationVariantElement::serializeBinaryBulkStateSuffix(SerializeBinary
void SerializationVariantElement::deserializeBinaryBulkStatePrefix( void SerializationVariantElement::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const
{ {
DeserializeBinaryBulkStatePtr discriminators_state = SerializationVariant::deserializeDiscriminatorsStatePrefix(settings, cache);
if (!discriminators_state)
return;
auto variant_element_state = std::make_shared<DeserializeBinaryBulkStateVariantElement>(); auto variant_element_state = std::make_shared<DeserializeBinaryBulkStateVariantElement>();
variant_element_state->discriminators_state = discriminators_state;
addVariantToPath(settings.path); addVariantToPath(settings.path);
nested_serialization->deserializeBinaryBulkStatePrefix(settings, variant_element_state->variant_element_state, cache); nested_serialization->deserializeBinaryBulkStatePrefix(settings, variant_element_state->variant_element_state, cache);
@ -86,35 +92,61 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams(
DeserializeBinaryBulkStatePtr & state, DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const SubstreamsCache * cache) const
{ {
auto * variant_element_state = checkAndGetState<DeserializeBinaryBulkStateVariantElement>(state);
/// First, deserialize discriminators from Variant column. /// First, deserialize discriminators from Variant column.
settings.path.push_back(Substream::VariantDiscriminators); settings.path.push_back(Substream::VariantDiscriminators);
DeserializeBinaryBulkStateVariantElement * variant_element_state = nullptr;
std::optional<size_t> variant_limit;
if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path)) if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path))
{ {
variant_element_state = checkAndGetState<DeserializeBinaryBulkStateVariantElement>(state);
variant_element_state->discriminators = cached_discriminators; variant_element_state->discriminators = cached_discriminators;
} }
else else if (auto * discriminators_stream = settings.getter(settings.path))
{ {
auto * discriminators_stream = settings.getter(settings.path); variant_element_state = checkAndGetState<DeserializeBinaryBulkStateVariantElement>(state);
if (!discriminators_stream) auto * discriminators_state = checkAndGetState<SerializationVariant::DeserializeBinaryBulkStateVariantDiscriminators>(variant_element_state->discriminators_state);
return;
/// If we started to read a new column, reinitialize discriminators column in deserialization state. /// If we started to read a new column, reinitialize discriminators column in deserialization state.
if (!variant_element_state->discriminators || result_column->empty()) if (!variant_element_state->discriminators || result_column->empty())
variant_element_state->discriminators = ColumnVariant::ColumnDiscriminators::create(); variant_element_state->discriminators = ColumnVariant::ColumnDiscriminators::create();
SerializationNumber<ColumnVariant::Discriminator>().deserializeBinaryBulk(*variant_element_state->discriminators->assumeMutable(), *discriminators_stream, limit, 0); /// Deserialize discriminators according to serialization mode.
if (discriminators_state->mode.value == SerializationVariant::DiscriminatorsSerializationMode::BASIC)
SerializationNumber<ColumnVariant::Discriminator>().deserializeBinaryBulk(*variant_element_state->discriminators->assumeMutable(), *discriminators_stream, limit, 0);
else
variant_limit = deserializeCompactDiscriminators(
variant_element_state->discriminators,
variant_discriminator,
limit,
discriminators_stream,
settings.continuous_reading,
variant_element_state->discriminators_state,
this);
addToSubstreamsCache(cache, settings.path, variant_element_state->discriminators); addToSubstreamsCache(cache, settings.path, variant_element_state->discriminators);
} }
else
{
settings.path.pop_back();
return;
}
settings.path.pop_back(); settings.path.pop_back();
/// Iterate through new discriminators to calculate the limit for our variant. /// We could read less than limit discriminators, but we will need actual number of read rows later.
size_t num_new_discriminators = variant_element_state->discriminators->size() - result_column->size();
/// Iterate through new discriminators to calculate the limit for our variant
/// if we didn't do it during discriminators deserialization.
const auto & discriminators_data = assert_cast<const ColumnVariant::ColumnDiscriminators &>(*variant_element_state->discriminators).getData(); const auto & discriminators_data = assert_cast<const ColumnVariant::ColumnDiscriminators &>(*variant_element_state->discriminators).getData();
size_t discriminators_offset = variant_element_state->discriminators->size() - limit; size_t discriminators_offset = variant_element_state->discriminators->size() - num_new_discriminators;
size_t variant_limit = 0; if (!variant_limit)
for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) {
variant_limit += (discriminators_data[i] == variant_discriminator); variant_limit = 0;
for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i)
*variant_limit += (discriminators_data[i] == variant_discriminator);
}
/// Now we know the limit for our variant and can deserialize it. /// Now we know the limit for our variant and can deserialize it.
@ -125,19 +157,19 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams(
auto & nullable_column = assert_cast<ColumnNullable &>(*mutable_column); auto & nullable_column = assert_cast<ColumnNullable &>(*mutable_column);
NullMap & null_map = nullable_column.getNullMapData(); NullMap & null_map = nullable_column.getNullMapData();
/// If we have only our discriminator in range, fill null map with 0. /// If we have only our discriminator in range, fill null map with 0.
if (variant_limit == limit) if (variant_limit == num_new_discriminators)
{ {
null_map.resize_fill(null_map.size() + limit, 0); null_map.resize_fill(null_map.size() + num_new_discriminators, 0);
} }
/// If no our discriminator in current range, fill null map with 1. /// If no our discriminator in current range, fill null map with 1.
else if (variant_limit == 0) else if (variant_limit == 0)
{ {
null_map.resize_fill(null_map.size() + limit, 1); null_map.resize_fill(null_map.size() + num_new_discriminators, 1);
} }
/// Otherwise we should iterate through discriminators to fill null map. /// Otherwise we should iterate through discriminators to fill null map.
else else
{ {
null_map.reserve(null_map.size() + limit); null_map.reserve(null_map.size() + num_new_discriminators);
for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i)
null_map.push_back(discriminators_data[i] != variant_discriminator); null_map.push_back(discriminators_data[i] != variant_discriminator);
} }
@ -159,12 +191,12 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams(
/// If nothing to deserialize, just insert defaults. /// If nothing to deserialize, just insert defaults.
if (variant_limit == 0) if (variant_limit == 0)
{ {
mutable_column->insertManyDefaults(limit); mutable_column->insertManyDefaults(num_new_discriminators);
return; return;
} }
addVariantToPath(settings.path); addVariantToPath(settings.path);
nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_element_state->variant, variant_limit, settings, variant_element_state->variant_element_state, cache); nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_element_state->variant, *variant_limit, settings, variant_element_state->variant_element_state, cache);
removeVariantFromPath(settings.path); removeVariantFromPath(settings.path);
/// If nothing was deserialized when variant_limit > 0 /// If nothing was deserialized when variant_limit > 0
@ -173,16 +205,16 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams(
/// In this case we should just insert default values. /// In this case we should just insert default values.
if (variant_element_state->variant->empty()) if (variant_element_state->variant->empty())
{ {
mutable_column->insertManyDefaults(limit); mutable_column->insertManyDefaults(num_new_discriminators);
return; return;
} }
size_t variant_offset = variant_element_state->variant->size() - variant_limit; size_t variant_offset = variant_element_state->variant->size() - *variant_limit;
/// If we have only our discriminator in range, insert the whole range to result column. /// If we have only our discriminator in range, insert the whole range to result column.
if (variant_limit == limit) if (variant_limit == num_new_discriminators)
{ {
mutable_column->insertRangeFrom(*variant_element_state->variant, variant_offset, variant_limit); mutable_column->insertRangeFrom(*variant_element_state->variant, variant_offset, *variant_limit);
} }
/// Otherwise iterate through discriminators and insert value from variant or default value depending on the discriminator. /// Otherwise iterate through discriminators and insert value from variant or default value depending on the discriminator.
else else
@ -197,6 +229,59 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams(
} }
} }
size_t SerializationVariantElement::deserializeCompactDiscriminators(
DB::ColumnPtr & discriminators_column,
ColumnVariant::Discriminator variant_discriminator,
size_t limit,
DB::ReadBuffer * stream,
bool continuous_reading,
DeserializeBinaryBulkStatePtr & discriminators_state_,
const ISerialization * serialization)
{
auto * discriminators_state = checkAndGetState<SerializationVariant::DeserializeBinaryBulkStateVariantDiscriminators>(discriminators_state_, serialization);
auto & discriminators = assert_cast<ColumnVariant::ColumnDiscriminators &>(*discriminators_column->assumeMutable());
auto & discriminators_data = discriminators.getData();
/// Reset state if we are reading from the start of the granule and not from the previous position in the file.
if (!continuous_reading)
discriminators_state->remaining_rows_in_granule = 0;
/// Calculate our variant limit during discriminators deserialization.
size_t variant_limit = 0;
while (limit)
{
/// If we read all rows from current granule, start reading the next one.
if (discriminators_state->remaining_rows_in_granule == 0)
{
if (stream->eof())
return variant_limit;
SerializationVariant::readDiscriminatorsGranuleStart(*discriminators_state, stream);
}
size_t limit_in_granule = std::min(limit, discriminators_state->remaining_rows_in_granule);
if (discriminators_state->granule_format == SerializationVariant::CompactDiscriminatorsGranuleFormat::COMPACT)
{
auto & data = discriminators.getData();
data.resize_fill(data.size() + limit_in_granule, discriminators_state->compact_discr);
if (discriminators_state->compact_discr == variant_discriminator)
variant_limit += limit_in_granule;
}
else
{
SerializationNumber<ColumnVariant::Discriminator>().deserializeBinaryBulk(discriminators, *stream, limit_in_granule, 0);
size_t start = discriminators_data.size() - limit_in_granule;
for (size_t i = start; i != discriminators_data.size(); ++i)
variant_limit += (discriminators_data[i] == variant_discriminator);
}
discriminators_state->remaining_rows_in_granule -= limit_in_granule;
limit -= limit_in_granule;
}
return variant_limit;
}
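As an aside for readers of the function above, here is a rough standalone model (hypothetical types, not the real stream format) of how variant_limit accumulates across granules: a compact granule contributes either all of its rows or none of them, while a plain granule is counted row by row.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

using Discriminator = uint8_t;

/// Toy granule: either one repeated discriminator (compact) or an explicit list (plain).
struct Granule
{
    size_t rows = 0;
    bool compact = false;
    Discriminator compact_discr = 0;   /// valid when compact
    std::vector<Discriminator> plain;  /// valid when !compact, plain.size() == rows
};

size_t countVariantRows(const std::vector<Granule> & granules, Discriminator variant_discriminator)
{
    size_t variant_limit = 0;
    for (const auto & granule : granules)
    {
        if (granule.compact)
        {
            if (granule.compact_discr == variant_discriminator)
                variant_limit += granule.rows;
        }
        else
        {
            for (auto d : granule.plain)
                variant_limit += (d == variant_discriminator);
        }
    }
    return variant_limit;
}

int main()
{
    std::vector<Granule> granules;
    granules.push_back({.rows = 5, .compact = true, .compact_discr = 1});     /// 5 rows, all variant 1
    granules.push_back({.rows = 4, .compact = false, .plain = {0, 1, 2, 1}}); /// mixed granule
    std::cout << countVariantRows(granules, /*variant_discriminator=*/1) << '\n'; /// prints 7
}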
void SerializationVariantElement::addVariantToPath(DB::ISerialization::SubstreamPath & path) const void SerializationVariantElement::addVariantToPath(DB::ISerialization::SubstreamPath & path) const
{ {
path.push_back(Substream::VariantElements); path.push_back(Substream::VariantElements);
@ -214,17 +299,19 @@ SerializationVariantElement::VariantSubcolumnCreator::VariantSubcolumnCreator(
const ColumnPtr & local_discriminators_, const ColumnPtr & local_discriminators_,
const String & variant_element_name_, const String & variant_element_name_,
ColumnVariant::Discriminator global_variant_discriminator_, ColumnVariant::Discriminator global_variant_discriminator_,
ColumnVariant::Discriminator local_variant_discriminator_) ColumnVariant::Discriminator local_variant_discriminator_,
bool make_nullable_)
: local_discriminators(local_discriminators_) : local_discriminators(local_discriminators_)
, variant_element_name(variant_element_name_) , variant_element_name(variant_element_name_)
, global_variant_discriminator(global_variant_discriminator_) , global_variant_discriminator(global_variant_discriminator_)
, local_variant_discriminator(local_variant_discriminator_) , local_variant_discriminator(local_variant_discriminator_)
, make_nullable(make_nullable_)
{ {
} }
DataTypePtr SerializationVariantElement::VariantSubcolumnCreator::create(const DB::DataTypePtr & prev) const DataTypePtr SerializationVariantElement::VariantSubcolumnCreator::create(const DB::DataTypePtr & prev) const
{ {
return makeNullableOrLowCardinalityNullableSafe(prev); return make_nullable ? makeNullableOrLowCardinalityNullableSafe(prev) : prev;
} }
SerializationPtr SerializationVariantElement::VariantSubcolumnCreator::create(const DB::SerializationPtr & prev) const SerializationPtr SerializationVariantElement::VariantSubcolumnCreator::create(const DB::SerializationPtr & prev) const
@ -237,12 +324,12 @@ ColumnPtr SerializationVariantElement::VariantSubcolumnCreator::create(const DB:
/// Case when original Variant column contained only one non-empty variant and no NULLs. /// Case when original Variant column contained only one non-empty variant and no NULLs.
/// In this case just use this variant. /// In this case just use this variant.
if (prev->size() == local_discriminators->size()) if (prev->size() == local_discriminators->size())
return makeNullableOrLowCardinalityNullableSafe(prev); return make_nullable ? makeNullableOrLowCardinalityNullableSafe(prev) : prev;
/// If this variant is empty, fill result column with default values. /// If this variant is empty, fill result column with default values.
if (prev->empty()) if (prev->empty())
{ {
auto res = makeNullableOrLowCardinalityNullableSafe(prev)->cloneEmpty(); auto res = make_nullable ? makeNullableOrLowCardinalityNullableSafe(prev)->cloneEmpty() : prev->cloneEmpty();
res->insertManyDefaults(local_discriminators->size()); res->insertManyDefaults(local_discriminators->size());
return res; return res;
} }
@ -257,16 +344,16 @@ ColumnPtr SerializationVariantElement::VariantSubcolumnCreator::create(const DB:
/// Now we can create new column from null-map and variant column using IColumn::expand. /// Now we can create new column from null-map and variant column using IColumn::expand.
auto res_column = IColumn::mutate(prev); auto res_column = IColumn::mutate(prev);
/// Special case for LowCardinality. We want the result to be LowCardinality(Nullable), /// Special case for LowCardinality when we want the result to be LowCardinality(Nullable),
/// but we don't have a good way to apply null-mask for LowCardinality(), so, we first /// but we don't have a good way to apply null-mask for LowCardinality(), so, we first
/// convert our column to LowCardinality(Nullable()) and then use expand which will /// convert our column to LowCardinality(Nullable()) and then use expand which will
/// fill rows with 0 in mask with default value (that is NULL). /// fill rows with 0 in mask with default value (that is NULL).
if (prev->lowCardinality()) if (make_nullable && prev->lowCardinality())
res_column = assert_cast<ColumnLowCardinality &>(*res_column).cloneNullable(); res_column = assert_cast<ColumnLowCardinality &>(*res_column).cloneNullable();
res_column->expand(null_map, /*inverted = */ true); res_column->expand(null_map, /*inverted = */ true);
if (res_column->canBeInsideNullable()) if (make_nullable && prev->canBeInsideNullable())
{ {
auto null_map_col = ColumnUInt8::create(); auto null_map_col = ColumnUInt8::create();
null_map_col->getData() = std::move(null_map); null_map_col->getData() = std::move(null_map);

View File

@ -9,6 +9,7 @@ namespace DB
{ {
class SerializationVariant; class SerializationVariant;
class SerializationVariantElementNullMap;
/// Serialization for Variant element when we read it as a subcolumn. /// Serialization for Variant element when we read it as a subcolumn.
class SerializationVariantElement final : public SerializationWrapper class SerializationVariantElement final : public SerializationWrapper
@ -66,12 +67,14 @@ public:
const String variant_element_name; const String variant_element_name;
const ColumnVariant::Discriminator global_variant_discriminator; const ColumnVariant::Discriminator global_variant_discriminator;
const ColumnVariant::Discriminator local_variant_discriminator; const ColumnVariant::Discriminator local_variant_discriminator;
bool make_nullable;
VariantSubcolumnCreator( VariantSubcolumnCreator(
const ColumnPtr & local_discriminators_, const ColumnPtr & local_discriminators_,
const String & variant_element_name_, const String & variant_element_name_,
ColumnVariant::Discriminator global_variant_discriminator_, ColumnVariant::Discriminator global_variant_discriminator_,
ColumnVariant::Discriminator local_variant_discriminator_); ColumnVariant::Discriminator local_variant_discriminator_,
bool make_nullable_);
DataTypePtr create(const DataTypePtr & prev) const override; DataTypePtr create(const DataTypePtr & prev) const override;
ColumnPtr create(const ColumnPtr & prev) const override; ColumnPtr create(const ColumnPtr & prev) const override;
@ -79,6 +82,18 @@ public:
}; };
private: private:
friend SerializationVariant; friend SerializationVariant;
friend SerializationVariantElementNullMap;
struct DeserializeBinaryBulkStateVariantElement;
static size_t deserializeCompactDiscriminators(
ColumnPtr & discriminators_column,
ColumnVariant::Discriminator variant_discriminator,
size_t limit,
ReadBuffer * stream,
bool continuous_reading,
DeserializeBinaryBulkStatePtr & discriminators_state_,
const ISerialization * serialization);
void addVariantToPath(SubstreamPath & path) const; void addVariantToPath(SubstreamPath & path) const;
void removeVariantFromPath(SubstreamPath & path) const; void removeVariantFromPath(SubstreamPath & path) const;

View File

@ -0,0 +1,190 @@
#include <Columns/ColumnLowCardinality.h>
#include <Columns/ColumnNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/Serializations/SerializationNumber.h>
#include <DataTypes/Serializations/SerializationVariant.h>
#include <DataTypes/Serializations/SerializationVariantElement.h>
#include <DataTypes/Serializations/SerializationVariantElementNullMap.h>
#include <IO/ReadHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
struct DeserializeBinaryBulkStateVariantElementNullMap : public ISerialization::DeserializeBinaryBulkState
{
/// During deserialization discriminator streams can be shared.
/// For example we can read several variant elements together: "select v.UInt32, v.String.null from table",
/// or we can read the whole variant and some of the variant elements or their subcolumns: "select v, v.UInt32.null from table".
/// To read the same column from the same stream more than once we use the substream cache,
/// but this cache stores the whole column, not only the current range.
/// During deserialization of variant elements or their subcolumns the discriminators column is not stored
/// in the result column, so we need to keep it inside the deserialization state so that we can use the
/// substream cache correctly.
ColumnPtr discriminators;
ISerialization::DeserializeBinaryBulkStatePtr discriminators_state;
};
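The comment above is easiest to see with a toy model of the substream cache (purely illustrative names and types, not the real ISerialization interfaces, and the cache is assumed to live only for the current read call): because the cache keyed by substream path holds the whole column read so far, every reader of the shared discriminators path keeps its accumulated column in its own deserialization state and republishes it through the cache.

#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <vector>

using Column = std::vector<uint8_t>;
using ColumnPtr = std::shared_ptr<Column>;
using SubstreamsCache = std::map<std::string, ColumnPtr>;

/// Toy version of "read discriminators through the cache": the first subcolumn that reaches
/// the path appends the next range to the column kept in its state and publishes it in the
/// cache; any other subcolumn reading the same path in this range just picks it up.
ColumnPtr readDiscriminators(SubstreamsCache & cache, ColumnPtr & state_discriminators,
                             const std::string & path, const Column & next_range)
{
    if (auto it = cache.find(path); it != cache.end())
        return state_discriminators = it->second;

    if (!state_discriminators)
        state_discriminators = std::make_shared<Column>();
    state_discriminators->insert(state_discriminators->end(), next_range.begin(), next_range.end());
    cache[path] = state_discriminators;
    return state_discriminators;
}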
void SerializationVariantElementNullMap::enumerateStreams(
DB::ISerialization::EnumerateStreamsSettings & settings,
const DB::ISerialization::StreamCallback & callback,
const DB::ISerialization::SubstreamData &) const
{
/// We will need stream for discriminators during deserialization.
settings.path.push_back(Substream::VariantDiscriminators);
callback(settings.path);
settings.path.pop_back();
}
void SerializationVariantElementNullMap::serializeBinaryBulkStatePrefix(
const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
throw Exception(
ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationVariantElementNullMap");
}
void SerializationVariantElementNullMap::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
throw Exception(
ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationVariantElementNullMap");
}
void SerializationVariantElementNullMap::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsDeserializeStatesCache * cache) const
{
DeserializeBinaryBulkStatePtr discriminators_state = SerializationVariant::deserializeDiscriminatorsStatePrefix(settings, cache);
if (!discriminators_state)
return;
auto variant_element_null_map_state = std::make_shared<DeserializeBinaryBulkStateVariantElementNullMap>();
variant_element_null_map_state->discriminators_state = std::move(discriminators_state);
state = std::move(variant_element_null_map_state);
}
void SerializationVariantElementNullMap::serializeBinaryBulkWithMultipleStreams(
const IColumn &, size_t, size_t, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
{
throw Exception(
ErrorCodes::NOT_IMPLEMENTED,
"Method serializeBinaryBulkWithMultipleStreams is not implemented for SerializationVariantElementNullMap");
}
void SerializationVariantElementNullMap::deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & result_column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const
{
/// Deserialize discriminators from Variant column.
settings.path.push_back(Substream::VariantDiscriminators);
DeserializeBinaryBulkStateVariantElementNullMap * variant_element_null_map_state = nullptr;
std::optional<size_t> variant_limit;
if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path))
{
variant_element_null_map_state = checkAndGetState<DeserializeBinaryBulkStateVariantElementNullMap>(state);
variant_element_null_map_state->discriminators = cached_discriminators;
}
else if (auto * discriminators_stream = settings.getter(settings.path))
{
variant_element_null_map_state = checkAndGetState<DeserializeBinaryBulkStateVariantElementNullMap>(state);
auto * discriminators_state = checkAndGetState<SerializationVariant::DeserializeBinaryBulkStateVariantDiscriminators>(
variant_element_null_map_state->discriminators_state);
/// If we started to read a new column, reinitialize discriminators column in deserialization state.
if (!variant_element_null_map_state->discriminators || result_column->empty())
variant_element_null_map_state->discriminators = ColumnVariant::ColumnDiscriminators::create();
/// Deserialize discriminators according to serialization mode.
if (discriminators_state->mode.value == SerializationVariant::DiscriminatorsSerializationMode::BASIC)
SerializationNumber<ColumnVariant::Discriminator>().deserializeBinaryBulk(
*variant_element_null_map_state->discriminators->assumeMutable(), *discriminators_stream, limit, 0);
else
variant_limit = SerializationVariantElement::deserializeCompactDiscriminators(
variant_element_null_map_state->discriminators,
variant_discriminator,
limit,
discriminators_stream,
settings.continuous_reading,
variant_element_null_map_state->discriminators_state,
this);
addToSubstreamsCache(cache, settings.path, variant_element_null_map_state->discriminators);
}
else
{
/// There is no such stream or cached data; it means that there is no Variant column in this part (this can happen after ALTER TABLE ADD COLUMN).
/// In such cases columns are filled with default values, but for a null-map column the default value should be 1, not 0. Fill the column with 1 here instead.
MutableColumnPtr mutable_column = result_column->assumeMutable();
auto & data = assert_cast<ColumnUInt8 &>(*mutable_column).getData();
data.resize_fill(data.size() + limit, 1);
settings.path.pop_back();
return;
}
settings.path.pop_back();
MutableColumnPtr mutable_column = result_column->assumeMutable();
auto & data = assert_cast<ColumnUInt8 &>(*mutable_column).getData();
/// Check if there is no such variant in the read range.
if (variant_limit && *variant_limit == 0)
{
data.resize_fill(data.size() + limit, 1);
}
/// Check if there is only our variant in the read range.
else if (variant_limit && *variant_limit == limit)
{
data.resize_fill(data.size() + limit, 0);
}
/// Iterate through new discriminators to calculate the null map of our variant.
else
{
const auto & discriminators_data
= assert_cast<const ColumnVariant::ColumnDiscriminators &>(*variant_element_null_map_state->discriminators).getData();
size_t discriminators_offset = variant_element_null_map_state->discriminators->size() - limit;
for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i)
data.push_back(discriminators_data[i] != variant_discriminator);
}
}
SerializationVariantElementNullMap::VariantNullMapSubcolumnCreator::VariantNullMapSubcolumnCreator(
const ColumnPtr & local_discriminators_,
const String & variant_element_name_,
ColumnVariant::Discriminator global_variant_discriminator_,
ColumnVariant::Discriminator local_variant_discriminator_)
: local_discriminators(local_discriminators_)
, variant_element_name(variant_element_name_)
, global_variant_discriminator(global_variant_discriminator_)
, local_variant_discriminator(local_variant_discriminator_)
{
}
DataTypePtr SerializationVariantElementNullMap::VariantNullMapSubcolumnCreator::create(const DB::DataTypePtr &) const
{
return std::make_shared<DataTypeUInt8>();
}
SerializationPtr SerializationVariantElementNullMap::VariantNullMapSubcolumnCreator::create(const DB::SerializationPtr &) const
{
return std::make_shared<SerializationVariantElementNullMap>(variant_element_name, global_variant_discriminator);
}
ColumnPtr SerializationVariantElementNullMap::VariantNullMapSubcolumnCreator::create(const DB::ColumnPtr &) const
{
/// Iterate through discriminators and create null-map for our variant.
auto null_map_col = ColumnUInt8::create();
auto & null_map_data = null_map_col->getData();
null_map_data.reserve(local_discriminators->size());
const auto & local_discriminators_data = assert_cast<const ColumnVariant::ColumnDiscriminators &>(*local_discriminators).getData();
for (auto local_discr : local_discriminators_data)
null_map_data.push_back(local_discr != local_variant_discriminator);
return null_map_col;
}
}
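A small usage-style illustration of the creator above (hypothetical data, standalone types): the null map of a variant element is 1 exactly in the rows where the local discriminator is not the requested variant, which is what reading e.g. v.UInt64.null boils down to.

#include <cstdint>
#include <iostream>
#include <vector>

using Discriminator = uint8_t;

/// Mirrors the loop above: null map of a variant element = (discriminator != this variant).
std::vector<uint8_t> buildNullMap(const std::vector<Discriminator> & local_discriminators,
                                  Discriminator local_variant_discriminator)
{
    std::vector<uint8_t> null_map;
    null_map.reserve(local_discriminators.size());
    for (auto d : local_discriminators)
        null_map.push_back(d != local_variant_discriminator);
    return null_map;
}

int main()
{
    /// Rows: UInt64, String, UInt64, NULL (the NULL discriminator is shown here as 255 purely for illustration).
    std::vector<Discriminator> discrs = {0, 1, 0, 255};
    for (auto v : buildNullMap(discrs, /*local_variant_discriminator=*/0))
        std::cout << int(v) << ' ';   /// prints: 0 1 0 1
    std::cout << '\n';
}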

View File

@ -0,0 +1,107 @@
#pragma once
#include <DataTypes/Serializations/SimpleTextSerialization.h>
#include <DataTypes/DataTypeNullable.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnVariant.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
class SerializationVariant;
class SerializationVariantElement;
/// Serialization for a Variant element null map when we read it as a subcolumn.
/// For example, variant.UInt64.null.
/// It requires separate serialization because there is no actual Nullable column
/// and we have to construct the null map from the variant discriminators.
/// The implementation of the deserializeBinaryBulk* methods is similar to SerializationVariantElement,
/// but differs in that there is no need to read the actual data of the variant, only the discriminators.
class SerializationVariantElementNullMap final : public SimpleTextSerialization
{
public:
SerializationVariantElementNullMap(const String & variant_element_name_, ColumnVariant::Discriminator variant_discriminator_)
: variant_element_name(variant_element_name_), variant_discriminator(variant_discriminator_)
{
}
void enumerateStreams(
EnumerateStreamsSettings & settings,
const StreamCallback & callback,
const SubstreamData & data) const override;
void serializeBinaryBulkStatePrefix(
const IColumn & column,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsDeserializeStatesCache * cache) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
ColumnPtr & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state,
SubstreamsCache * cache) const override;
void serializeBinary(const Field &, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
void deserializeBinary(Field &, ReadBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
void serializeBinary(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
void deserializeBinary(IColumn &, ReadBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
void serializeText(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
void deserializeText(IColumn &, ReadBuffer &, const FormatSettings &, bool) const override { throwNoSerialization(); }
bool tryDeserializeText(IColumn &, ReadBuffer &, const FormatSettings &, bool) const override { throwNoSerialization(); }
struct VariantNullMapSubcolumnCreator : public ISubcolumnCreator
{
const ColumnPtr local_discriminators;
const String variant_element_name;
const ColumnVariant::Discriminator global_variant_discriminator;
const ColumnVariant::Discriminator local_variant_discriminator;
VariantNullMapSubcolumnCreator(
const ColumnPtr & local_discriminators_,
const String & variant_element_name_,
ColumnVariant::Discriminator global_variant_discriminator_,
ColumnVariant::Discriminator local_variant_discriminator_);
DataTypePtr create(const DataTypePtr & prev) const override;
ColumnPtr create(const ColumnPtr & prev) const override;
SerializationPtr create(const SerializationPtr & prev) const override;
};
private:
[[noreturn]] static void throwNoSerialization()
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Text/binary serialization is not implemented for variant element null map subcolumn");
}
friend SerializationVariant;
friend SerializationVariantElement;
/// To be able to deserialize the Variant element null map as a subcolumn
/// we need the variant element type name and its global discriminator.
String variant_element_name;
ColumnVariant::Discriminator variant_discriminator;
};
}

View File

@ -3,14 +3,14 @@
#include <Columns/ColumnString.h> #include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h> #include <Columns/ColumnVector.h>
#include <Columns/ColumnsNumber.h> #include <Columns/ColumnsNumber.h>
#include <Common/BitHelpers.h>
#include <Common/BinStringDecodeHelper.h>
#include <DataTypes/DataTypeString.h> #include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h> #include <Functions/FunctionFactory.h>
#include <Functions/IFunction.h> #include <Functions/IFunction.h>
#include <IO/WriteHelpers.h> #include <IO/WriteHelpers.h>
#include <Interpreters/Context_fwd.h> #include <Interpreters/Context_fwd.h>
#include <Interpreters/castColumn.h> #include <Interpreters/castColumn.h>
#include <Common/BinStringDecodeHelper.h>
#include <Common/BitHelpers.h>
namespace DB namespace DB
{ {
@ -218,10 +218,7 @@ struct UnbinImpl
static constexpr auto name = "unbin"; static constexpr auto name = "unbin";
static constexpr size_t word_size = 8; static constexpr size_t word_size = 8;
static void decode(const char * pos, const char * end, char *& out) static void decode(const char * pos, const char * end, char *& out) { binStringDecode(pos, end, out, word_size); }
{
binStringDecode(pos, end, out);
}
}; };
/// Encode number or string to string with binary or hexadecimal representation /// Encode number or string to string with binary or hexadecimal representation
@ -651,7 +648,15 @@ public:
size_t size = in_offsets.size(); size_t size = in_offsets.size();
out_offsets.resize(size); out_offsets.resize(size);
out_vec.resize(in_vec.size() / word_size + size);
size_t max_out_len = 0;
for (size_t i = 0; i < in_offsets.size(); ++i)
{
const size_t len = in_offsets[i] - (i == 0 ? 0 : in_offsets[i - 1])
- /* trailing zero symbol that is always added in ColumnString and that is ignored while decoding */ 1;
max_out_len += (len + word_size - 1) / word_size + /* trailing zero symbol that is always added by Impl::decode */ 1;
}
out_vec.resize(max_out_len);
char * begin = reinterpret_cast<char *>(out_vec.data()); char * begin = reinterpret_cast<char *>(out_vec.data());
char * pos = begin; char * pos = begin;
@ -661,6 +666,7 @@ public:
{ {
size_t new_offset = in_offsets[i]; size_t new_offset = in_offsets[i];
/// `new_offset - 1` because in ColumnString each string is stored with a trailing zero byte
Impl::decode(reinterpret_cast<const char *>(&in_vec[prev_offset]), reinterpret_cast<const char *>(&in_vec[new_offset - 1]), pos); Impl::decode(reinterpret_cast<const char *>(&in_vec[prev_offset]), reinterpret_cast<const char *>(&in_vec[new_offset - 1]), pos);
out_offsets[i] = pos - begin; out_offsets[i] = pos - begin;
@ -668,6 +674,9 @@ public:
prev_offset = new_offset; prev_offset = new_offset;
} }
chassert(
static_cast<size_t>(pos - begin) <= out_vec.size(),
fmt::format("too small amount of memory was preallocated: needed {}, but have only {}", pos - begin, out_vec.size()));
out_vec.resize(pos - begin); out_vec.resize(pos - begin);
return col_res; return col_res;
@ -680,11 +689,11 @@ public:
ColumnString::Offsets & out_offsets = col_res->getOffsets(); ColumnString::Offsets & out_offsets = col_res->getOffsets();
const ColumnString::Chars & in_vec = col_fix_string->getChars(); const ColumnString::Chars & in_vec = col_fix_string->getChars();
size_t n = col_fix_string->getN(); const size_t n = col_fix_string->getN();
size_t size = col_fix_string->size(); size_t size = col_fix_string->size();
out_offsets.resize(size); out_offsets.resize(size);
out_vec.resize(in_vec.size() / word_size + size); out_vec.resize(((n + word_size - 1) / word_size + /* trailing zero symbol that is always added by Impl::decode */ 1) * size);
char * begin = reinterpret_cast<char *>(out_vec.data()); char * begin = reinterpret_cast<char *>(out_vec.data());
char * pos = begin; char * pos = begin;
@ -694,6 +703,7 @@ public:
{ {
size_t new_offset = prev_offset + n; size_t new_offset = prev_offset + n;
/// here we don't subtract 1 from `new_offset` because in ColumnFixedString strings are stored without a trailing zero byte
Impl::decode(reinterpret_cast<const char *>(&in_vec[prev_offset]), reinterpret_cast<const char *>(&in_vec[new_offset]), pos); Impl::decode(reinterpret_cast<const char *>(&in_vec[prev_offset]), reinterpret_cast<const char *>(&in_vec[new_offset]), pos);
out_offsets[i] = pos - begin; out_offsets[i] = pos - begin;
@ -701,6 +711,9 @@ public:
prev_offset = new_offset; prev_offset = new_offset;
} }
chassert(
static_cast<size_t>(pos - begin) <= out_vec.size(),
fmt::format("too small amount of memory was preallocated: needed {}, but have only {}", pos - begin, out_vec.size()));
out_vec.resize(pos - begin); out_vec.resize(pos - begin);
return col_res; return col_res;
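A worked check of the preallocation formula used in both branches above: for a source string of len characters (with the ColumnString trailing zero already excluded), the decoded output needs at most (len + word_size - 1) / word_size bytes plus the trailing zero byte appended by decode. word_size is 8 for unbin as defined above (and presumably 2 for unhex elsewhere in this file).

#include <cstddef>
#include <iostream>

size_t decodedSizeUpperBound(size_t len, size_t word_size)
{
    return (len + word_size - 1) / word_size + /* trailing zero byte added by decode */ 1;
}

int main()
{
    std::cout << decodedSizeUpperBound(16, 8) << '\n'; /// 16 binary digits -> 2 bytes + 1 = 3
    std::cout << decodedSizeUpperBound(20, 8) << '\n'; /// 20 binary digits -> 3 bytes + 1 = 4
    std::cout << decodedSizeUpperBound(6, 2) << '\n';  /// "666f6f" -> 3 bytes + 1 = 4
}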

View File

@ -44,7 +44,7 @@ struct Memory : boost::noncopyable, Allocator
char * m_data = nullptr; char * m_data = nullptr;
size_t alignment = 0; size_t alignment = 0;
[[maybe_unused]] bool allow_gwp_asan_force_sample; [[maybe_unused]] bool allow_gwp_asan_force_sample{false};
Memory() = default; Memory() = default;

View File

@ -301,13 +301,7 @@ void AsynchronousInsertQueue::preprocessInsertQuery(const ASTPtr & query, const
auto & insert_query = query->as<ASTInsertQuery &>(); auto & insert_query = query->as<ASTInsertQuery &>();
insert_query.async_insert_flush = true; insert_query.async_insert_flush = true;
InterpreterInsertQuery interpreter( InterpreterInsertQuery interpreter(query, query_context, query_context->getSettingsRef().insert_allow_materialized_columns);
query,
query_context,
query_context->getSettingsRef().insert_allow_materialized_columns,
/* no_squash */ false,
/* no_destination */ false,
/* async_insert */ false);
auto table = interpreter.getTable(insert_query); auto table = interpreter.getTable(insert_query);
auto sample_block = InterpreterInsertQuery::getSampleBlock(insert_query, table, table->getInMemoryMetadataPtr(), query_context); auto sample_block = InterpreterInsertQuery::getSampleBlock(insert_query, table, table->getInMemoryMetadataPtr(), query_context);
@ -787,12 +781,7 @@ try
try try
{ {
interpreter = std::make_unique<InterpreterInsertQuery>( interpreter = std::make_unique<InterpreterInsertQuery>(
key.query, key.query, insert_context, key.settings.insert_allow_materialized_columns, false, false, true);
insert_context,
key.settings.insert_allow_materialized_columns,
false,
false,
true);
pipeline = interpreter->execute().pipeline; pipeline = interpreter->execute().pipeline;
chassert(pipeline.pushing()); chassert(pipeline.pushing());
@ -1011,7 +1000,7 @@ Chunk AsynchronousInsertQueue::processEntriesWithParsing(
} }
Chunk chunk(executor.getResultColumns(), total_rows); Chunk chunk(executor.getResultColumns(), total_rows);
chunk.getChunkInfos().add(std::move(chunk_info)); chunk.setChunkInfo(std::move(chunk_info));
return chunk; return chunk;
} }
@ -1063,7 +1052,7 @@ Chunk AsynchronousInsertQueue::processPreprocessedEntries(
} }
Chunk chunk(std::move(result_columns), total_rows); Chunk chunk(std::move(result_columns), total_rows);
chunk.getChunkInfos().add(std::move(chunk_info)); chunk.setChunkInfo(std::move(chunk_info));
return chunk; return chunk;
} }

View File

@ -2,7 +2,6 @@
#include <Interpreters/InterpreterFactory.h> #include <Interpreters/InterpreterFactory.h>
#include <algorithm> #include <algorithm>
#include <memory>
#include <Access/Common/AccessFlags.h> #include <Access/Common/AccessFlags.h>
@ -23,7 +22,6 @@
#include <Parsers/ASTCheckQuery.h> #include <Parsers/ASTCheckQuery.h>
#include <Parsers/ASTSetQuery.h> #include <Parsers/ASTSetQuery.h>
#include <Processors/Chunk.h>
#include <Processors/IAccumulatingTransform.h> #include <Processors/IAccumulatingTransform.h>
#include <Processors/IInflatingTransform.h> #include <Processors/IInflatingTransform.h>
#include <Processors/ISimpleTransform.h> #include <Processors/ISimpleTransform.h>
@ -93,7 +91,7 @@ Chunk getChunkFromCheckResult(const String & database, const String & table, con
return Chunk(std::move(columns), 1); return Chunk(std::move(columns), 1);
} }
class TableCheckTask : public ChunkInfoCloneable<TableCheckTask> class TableCheckTask : public ChunkInfo
{ {
public: public:
TableCheckTask(StorageID table_id, const std::variant<std::monostate, ASTPtr, String> & partition_or_part, ContextPtr context) TableCheckTask(StorageID table_id, const std::variant<std::monostate, ASTPtr, String> & partition_or_part, ContextPtr context)
@ -112,12 +110,6 @@ public:
context->checkAccess(AccessType::SHOW_TABLES, table_->getStorageID()); context->checkAccess(AccessType::SHOW_TABLES, table_->getStorageID());
} }
TableCheckTask(const TableCheckTask & other)
: table(other.table)
, check_data_tasks(other.check_data_tasks)
, is_finished(other.is_finished.load())
{}
std::optional<CheckResult> checkNext() const std::optional<CheckResult> checkNext() const
{ {
if (isFinished()) if (isFinished())
@ -129,8 +121,8 @@ public:
std::this_thread::sleep_for(sleep_time); std::this_thread::sleep_for(sleep_time);
}); });
IStorage::DataValidationTasksPtr tmp = check_data_tasks; IStorage::DataValidationTasksPtr check_data_tasks_ = check_data_tasks;
auto result = table->checkDataNext(tmp); auto result = table->checkDataNext(check_data_tasks_);
is_finished = !result.has_value(); is_finished = !result.has_value();
return result; return result;
} }
@ -188,7 +180,7 @@ protected:
/// source should return at least one row to start pipeline /// source should return at least one row to start pipeline
result.addColumn(ColumnUInt8::create(1, 1)); result.addColumn(ColumnUInt8::create(1, 1));
/// actual data stored in chunk info /// actual data stored in chunk info
result.getChunkInfos().add(std::move(current_check_task)); result.setChunkInfo(std::move(current_check_task));
return result; return result;
} }
@ -288,7 +280,7 @@ public:
protected: protected:
void transform(Chunk & chunk) override void transform(Chunk & chunk) override
{ {
auto table_check_task = chunk.getChunkInfos().get<TableCheckTask>(); auto table_check_task = std::dynamic_pointer_cast<const TableCheckTask>(chunk.getChunkInfo());
auto check_result = table_check_task->checkNext(); auto check_result = table_check_task->checkNext();
if (!check_result) if (!check_result)
{ {

View File

@ -1776,13 +1776,8 @@ BlockIO InterpreterCreateQuery::fillTableIfNeeded(const ASTCreateQuery & create)
else else
insert->select = create.select->clone(); insert->select = create.select->clone();
return InterpreterInsertQuery( return InterpreterInsertQuery(insert, getContext(),
insert, getContext()->getSettingsRef().insert_allow_materialized_columns).execute();
getContext(),
getContext()->getSettingsRef().insert_allow_materialized_columns,
/* no_squash */ false,
/* no_destination */ false,
/* async_insert */ false).execute();
} }
return {}; return {};

View File

@ -534,13 +534,7 @@ QueryPipeline InterpreterExplainQuery::executeImpl()
} }
else if (dynamic_cast<const ASTInsertQuery *>(ast.getExplainedQuery().get())) else if (dynamic_cast<const ASTInsertQuery *>(ast.getExplainedQuery().get()))
{ {
InterpreterInsertQuery insert( InterpreterInsertQuery insert(ast.getExplainedQuery(), getContext());
ast.getExplainedQuery(),
getContext(),
/* allow_materialized */ false,
/* no_squash */ false,
/* no_destination */ false,
/* async_insert */ false);
auto io = insert.execute(); auto io = insert.execute();
printPipeline(io.pipeline.getProcessors(), buf); printPipeline(io.pipeline.getProcessors(), buf);
} }

View File

@ -16,7 +16,6 @@
#include <Interpreters/getTableExpressions.h> #include <Interpreters/getTableExpressions.h>
#include <Interpreters/processColumnTransformers.h> #include <Interpreters/processColumnTransformers.h>
#include <Interpreters/InterpreterSelectQueryAnalyzer.h> #include <Interpreters/InterpreterSelectQueryAnalyzer.h>
#include <Interpreters/Context_fwd.h>
#include <Parsers/ASTFunction.h> #include <Parsers/ASTFunction.h>
#include <Parsers/ASTInsertQuery.h> #include <Parsers/ASTInsertQuery.h>
#include <Parsers/ASTSelectQuery.h> #include <Parsers/ASTSelectQuery.h>
@ -27,7 +26,6 @@
#include <Processors/Transforms/CountingTransform.h> #include <Processors/Transforms/CountingTransform.h>
#include <Processors/Transforms/ExpressionTransform.h> #include <Processors/Transforms/ExpressionTransform.h>
#include <Processors/Transforms/MaterializingTransform.h> #include <Processors/Transforms/MaterializingTransform.h>
#include <Processors/Transforms/DeduplicationTokenTransforms.h>
#include <Processors/Transforms/SquashingTransform.h> #include <Processors/Transforms/SquashingTransform.h>
#include <Processors/Transforms/PlanSquashingTransform.h> #include <Processors/Transforms/PlanSquashingTransform.h>
#include <Processors/Transforms/getSourceFromASTInsertQuery.h> #include <Processors/Transforms/getSourceFromASTInsertQuery.h>
@ -40,7 +38,6 @@
#include <Common/ThreadStatus.h> #include <Common/ThreadStatus.h>
#include <Common/checkStackSize.h> #include <Common/checkStackSize.h>
#include <Common/ProfileEvents.h> #include <Common/ProfileEvents.h>
#include "base/defines.h"
namespace ProfileEvents namespace ProfileEvents
@ -397,358 +394,28 @@ Chain InterpreterInsertQuery::buildPreSinkChain(
return out; return out;
} }
std::pair<std::vector<Chain>, std::vector<Chain>> InterpreterInsertQuery::buildPreAndSinkChains(size_t presink_streams, size_t sink_streams, StoragePtr table, const StorageMetadataPtr & metadata_snapshot, const Block & query_sample_block)
{
chassert(presink_streams > 0);
chassert(sink_streams > 0);
ThreadGroupPtr running_group;
if (current_thread)
running_group = current_thread->getThreadGroup();
if (!running_group)
running_group = std::make_shared<ThreadGroup>(getContext());
std::vector<Chain> sink_chains;
std::vector<Chain> presink_chains;
for (size_t i = 0; i < sink_streams; ++i)
{
auto out = buildSink(table, metadata_snapshot, /* thread_status_holder= */ nullptr,
running_group, /* elapsed_counter_ms= */ nullptr);
sink_chains.emplace_back(std::move(out));
}
for (size_t i = 0; i < presink_streams; ++i)
{
auto out = buildPreSinkChain(sink_chains[0].getInputHeader(), table, metadata_snapshot, query_sample_block);
presink_chains.emplace_back(std::move(out));
}
return {std::move(presink_chains), std::move(sink_chains)};
}
QueryPipeline InterpreterInsertQuery::buildInsertSelectPipeline(ASTInsertQuery & query, StoragePtr table)
{
const Settings & settings = getContext()->getSettingsRef();
auto metadata_snapshot = table->getInMemoryMetadataPtr();
auto query_sample_block = getSampleBlock(query, table, metadata_snapshot, getContext(), no_destination, allow_materialized);
bool is_trivial_insert_select = false;
if (settings.optimize_trivial_insert_select)
{
const auto & select_query = query.select->as<ASTSelectWithUnionQuery &>();
const auto & selects = select_query.list_of_selects->children;
const auto & union_modes = select_query.list_of_modes;
/// ASTSelectWithUnionQuery is not normalized now, so it may pass some queries that are in fact trivial SELECT queries
const auto mode_is_all = [](const auto & mode) { return mode == SelectUnionMode::UNION_ALL; };
is_trivial_insert_select =
std::all_of(union_modes.begin(), union_modes.end(), std::move(mode_is_all))
&& std::all_of(selects.begin(), selects.end(), isTrivialSelect);
}
ContextPtr select_context = getContext();
if (is_trivial_insert_select)
{
/** When doing a trivial INSERT INTO ... SELECT ... FROM table,
* there is no need to process the SELECT with more than max_insert_threads threads,
* and it's reasonable to set the block size for the SELECT to the desired block size for the INSERT
* to avoid unnecessary squashing.
*/
Settings new_settings = select_context->getSettings();
new_settings.max_threads = std::max<UInt64>(1, settings.max_insert_threads);
if (table->prefersLargeBlocks())
{
if (settings.min_insert_block_size_rows)
new_settings.max_block_size = settings.min_insert_block_size_rows;
if (settings.min_insert_block_size_bytes)
new_settings.preferred_block_size_bytes = settings.min_insert_block_size_bytes;
}
auto context_for_trivial_select = Context::createCopy(context);
context_for_trivial_select->setSettings(new_settings);
context_for_trivial_select->setInsertionTable(getContext()->getInsertionTable(), getContext()->getInsertionTableColumnNames());
select_context = context_for_trivial_select;
}
QueryPipelineBuilder pipeline;
{
auto select_query_options = SelectQueryOptions(QueryProcessingStage::Complete, 1);
if (settings.allow_experimental_analyzer)
{
InterpreterSelectQueryAnalyzer interpreter_select_analyzer(query.select, select_context, select_query_options);
pipeline = interpreter_select_analyzer.buildQueryPipeline();
}
else
{
InterpreterSelectWithUnionQuery interpreter_select(query.select, select_context, select_query_options);
pipeline = interpreter_select.buildQueryPipeline();
}
}
pipeline.dropTotalsAndExtremes();
/// Allow inserting Nullable into non-Nullable columns; NULL values will be added as default values.
if (getContext()->getSettingsRef().insert_null_as_default)
{
const auto & input_columns = pipeline.getHeader().getColumnsWithTypeAndName();
const auto & query_columns = query_sample_block.getColumnsWithTypeAndName();
const auto & output_columns = metadata_snapshot->getColumns();
if (input_columns.size() == query_columns.size())
{
for (size_t col_idx = 0; col_idx < query_columns.size(); ++col_idx)
{
/// Change query sample block columns to Nullable to allow inserting nullable columns, where NULL values will be substituted with
/// default column values (in AddingDefaultsTransform), so all values will be cast correctly.
if (isNullableOrLowCardinalityNullable(input_columns[col_idx].type)
&& !isNullableOrLowCardinalityNullable(query_columns[col_idx].type)
&& !isVariant(query_columns[col_idx].type)
&& !isDynamic(query_columns[col_idx].type)
&& output_columns.has(query_columns[col_idx].name))
{
query_sample_block.setColumn(
col_idx,
ColumnWithTypeAndName(
makeNullableOrLowCardinalityNullable(query_columns[col_idx].column),
makeNullableOrLowCardinalityNullable(query_columns[col_idx].type),
query_columns[col_idx].name));
}
}
}
}
auto actions_dag = ActionsDAG::makeConvertingActions(
pipeline.getHeader().getColumnsWithTypeAndName(),
query_sample_block.getColumnsWithTypeAndName(),
ActionsDAG::MatchColumnsMode::Position);
auto actions = std::make_shared<ExpressionActions>(actions_dag, ExpressionActionsSettings::fromContext(getContext(), CompileExpressions::yes));
pipeline.addSimpleTransform([&](const Block & in_header) -> ProcessorPtr
{
return std::make_shared<ExpressionTransform>(in_header, actions);
});
/// We need to convert Sparse columns to full, because the destination storage
/// may not support them or may have different settings for applying Sparse serialization.
pipeline.addSimpleTransform([&](const Block & in_header) -> ProcessorPtr
{
return std::make_shared<MaterializingTransform>(in_header);
});
pipeline.addSimpleTransform([&](const Block & in_header) -> ProcessorPtr
{
auto context_ptr = getContext();
auto counting = std::make_shared<CountingTransform>(in_header, nullptr, context_ptr->getQuota());
counting->setProcessListElement(context_ptr->getProcessListElement());
counting->setProgressCallback(context_ptr->getProgressCallback());
return counting;
});
size_t num_select_threads = pipeline.getNumThreads();
pipeline.resize(1);
if (shouldAddSquashingFroStorage(table))
{
pipeline.addSimpleTransform([&](const Block & in_header) -> ProcessorPtr
{
return std::make_shared<PlanSquashingTransform>(
in_header,
table->prefersLargeBlocks() ? settings.min_insert_block_size_rows : settings.max_block_size,
table->prefersLargeBlocks() ? settings.min_insert_block_size_bytes : 0ULL);
});
}
pipeline.addSimpleTransform([&](const Block &in_header) -> ProcessorPtr
{
return std::make_shared<DeduplicationToken::AddTokenInfoTransform>(in_header);
});
if (!settings.insert_deduplication_token.value.empty())
{
pipeline.addSimpleTransform([&](const Block & in_header) -> ProcessorPtr
{
return std::make_shared<DeduplicationToken::SetUserTokenTransform>(settings.insert_deduplication_token.value, in_header);
});
pipeline.addSimpleTransform([&](const Block & in_header) -> ProcessorPtr
{
return std::make_shared<DeduplicationToken::SetSourceBlockNumberTransform>(in_header);
});
}
/// Number of streams works like this:
/// * For the SELECT, use `max_threads`, or `max_insert_threads`, or whatever
/// InterpreterSelectQuery ends up with.
/// * Use `max_insert_threads` streams for various insert-preparation steps, e.g.
/// materializing and squashing (too slow to do in one thread). That's `presink_chains`.
/// * If the table supports parallel inserts, use max_insert_threads for writing to IStorage.
/// Otherwise, resize them down to 1 stream with a ResizeProcessor (a numeric sketch of this sizing follows after this function).
size_t presink_streams_size = std::max<size_t>(settings.max_insert_threads, pipeline.getNumStreams());
size_t sink_streams_size = table->supportsParallelInsert() ? std::max<size_t>(1, settings.max_insert_threads) : 1;
if (!settings.parallel_view_processing)
{
auto table_id = table->getStorageID();
auto views = DatabaseCatalog::instance().getDependentViews(table_id);
if (table->isView() || !views.empty())
sink_streams_size = 1;
}
auto [presink_chains, sink_chains] = buildPreAndSinkChains(
presink_streams_size, sink_streams_size,
table, metadata_snapshot, query_sample_block);
pipeline.resize(presink_chains.size());
if (shouldAddSquashingFroStorage(table))
{
pipeline.addSimpleTransform([&](const Block & in_header) -> ProcessorPtr
{
return std::make_shared<ApplySquashingTransform>(
in_header,
table->prefersLargeBlocks() ? settings.min_insert_block_size_rows : settings.max_block_size,
table->prefersLargeBlocks() ? settings.min_insert_block_size_bytes : 0ULL);
});
}
for (auto & chain : presink_chains)
pipeline.addResources(chain.detachResources());
pipeline.addChains(std::move(presink_chains));
pipeline.resize(sink_streams_size);
for (auto & chain : sink_chains)
pipeline.addResources(chain.detachResources());
pipeline.addChains(std::move(sink_chains));
if (!settings.parallel_view_processing)
{
/// Don't use more threads for INSERT than for SELECT to reduce memory consumption.
if (pipeline.getNumThreads() > num_select_threads)
pipeline.setMaxThreads(num_select_threads);
}
else if (pipeline.getNumThreads() < settings.max_threads)
{
/// It is possible for the query to have max_threads=1 due to optimize_trivial_insert_select;
/// however, with parallel_view_processing and multiple views, the views can still be processed in parallel.
///
/// Note, number of threads will be limited by buildPushingToViewsChain() to max_threads.
pipeline.setMaxThreads(settings.max_threads);
}
pipeline.setSinks([&](const Block & cur_header, QueryPipelineBuilder::StreamType) -> ProcessorPtr
{
return std::make_shared<EmptySink>(cur_header);
});
return QueryPipelineBuilder::getPipeline(std::move(pipeline));
}
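A numeric sketch of the stream sizing described in the comment inside buildInsertSelectPipeline above (all numbers are made up for illustration): the pre-sink stages take the larger of max_insert_threads and whatever the SELECT pipeline produced, while the sink count is bounded by max_insert_threads only if the storage supports parallel inserts and no dependent views force it down to a single stream.

#include <algorithm>
#include <cstddef>
#include <iostream>

int main()
{
    size_t max_insert_threads = 4;
    size_t select_streams = 16;             /// what the SELECT pipeline ended up with
    bool supports_parallel_insert = true;
    bool parallel_view_processing = false;
    bool table_is_view_or_has_views = false;

    size_t presink_streams_size = std::max(max_insert_threads, select_streams);
    size_t sink_streams_size = supports_parallel_insert ? std::max<size_t>(1, max_insert_threads) : 1;
    if (!parallel_view_processing && table_is_view_or_has_views)
        sink_streams_size = 1;

    std::cout << presink_streams_size << ' ' << sink_streams_size << '\n'; /// prints: 16 4
}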
QueryPipeline InterpreterInsertQuery::buildInsertPipeline(ASTInsertQuery & query, StoragePtr table)
{
const Settings & settings = getContext()->getSettingsRef();
auto metadata_snapshot = table->getInMemoryMetadataPtr();
auto query_sample_block = getSampleBlock(query, table, metadata_snapshot, getContext(), no_destination, allow_materialized);
Chain chain;
{
auto [presink_chains, sink_chains] = buildPreAndSinkChains(
/* presink_streams */1, /* sink_streams */1,
table, metadata_snapshot, query_sample_block);
chain = std::move(presink_chains.front());
chain.appendChain(std::move(sink_chains.front()));
}
if (!settings.insert_deduplication_token.value.empty())
{
chain.addSource(std::make_shared<DeduplicationToken::SetSourceBlockNumberTransform>(chain.getInputHeader()));
chain.addSource(std::make_shared<DeduplicationToken::SetUserTokenTransform>(settings.insert_deduplication_token.value, chain.getInputHeader()));
}
chain.addSource(std::make_shared<DeduplicationToken::AddTokenInfoTransform>(chain.getInputHeader()));
if (shouldAddSquashingFroStorage(table))
{
bool table_prefers_large_blocks = table->prefersLargeBlocks();
auto squashing = std::make_shared<ApplySquashingTransform>(
chain.getInputHeader(),
table_prefers_large_blocks ? settings.min_insert_block_size_rows : settings.max_block_size,
table_prefers_large_blocks ? settings.min_insert_block_size_bytes : 0ULL);
chain.addSource(std::move(squashing));
auto balancing = std::make_shared<PlanSquashingTransform>(
chain.getInputHeader(),
table_prefers_large_blocks ? settings.min_insert_block_size_rows : settings.max_block_size,
table_prefers_large_blocks ? settings.min_insert_block_size_bytes : 0ULL);
chain.addSource(std::move(balancing));
}
auto context_ptr = getContext();
auto counting = std::make_shared<CountingTransform>(chain.getInputHeader(), nullptr, context_ptr->getQuota());
counting->setProcessListElement(context_ptr->getProcessListElement());
counting->setProgressCallback(context_ptr->getProgressCallback());
chain.addSource(std::move(counting));
QueryPipeline pipeline = QueryPipeline(std::move(chain));
pipeline.setNumThreads(std::min<size_t>(pipeline.getNumThreads(), settings.max_threads));
pipeline.setConcurrencyControl(settings.use_concurrency_control);
if (query.hasInlinedData() && !async_insert)
{
/// can execute without additional data
auto format = getInputFormatFromASTInsertQuery(query_ptr, true, query_sample_block, getContext(), nullptr);
for (auto && buffer : owned_buffers)
format->addBuffer(std::move(buffer));
auto pipe = getSourceFromInputFormat(query_ptr, std::move(format), getContext(), nullptr);
pipeline.complete(std::move(pipe));
}
return pipeline;
}
BlockIO InterpreterInsertQuery::execute() BlockIO InterpreterInsertQuery::execute()
{ {
const Settings & settings = getContext()->getSettingsRef(); const Settings & settings = getContext()->getSettingsRef();
auto & query = query_ptr->as<ASTInsertQuery &>(); auto & query = query_ptr->as<ASTInsertQuery &>();
QueryPipelineBuilder pipeline;
std::optional<QueryPipeline> distributed_pipeline;
QueryPlanResourceHolder resources;
StoragePtr table = getTable(query); StoragePtr table = getTable(query);
checkStorageSupportsTransactionsIfNeeded(table, getContext()); checkStorageSupportsTransactionsIfNeeded(table, getContext());
StoragePtr inner_table;
if (const auto * mv = dynamic_cast<const StorageMaterializedView *>(table.get()))
inner_table = mv->getTargetTable();
if (query.partition_by && !table->supportsPartitionBy()) if (query.partition_by && !table->supportsPartitionBy())
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "PARTITION BY clause is not supported by storage"); throw Exception(ErrorCodes::NOT_IMPLEMENTED, "PARTITION BY clause is not supported by storage");
auto table_lock = table->lockForShare(getContext()->getInitialQueryId(), settings.lock_acquire_timeout); auto table_lock = table->lockForShare(getContext()->getInitialQueryId(), settings.lock_acquire_timeout);
auto metadata_snapshot = table->getInMemoryMetadataPtr(); auto metadata_snapshot = table->getInMemoryMetadataPtr();
auto query_sample_block = getSampleBlock(query, table, metadata_snapshot, getContext(), no_destination, allow_materialized); auto query_sample_block = getSampleBlock(query, table, metadata_snapshot, getContext(), no_destination, allow_materialized);
/// For table functions we check access while executing /// For table functions we check access while executing
@ -756,43 +423,320 @@ BlockIO InterpreterInsertQuery::execute()
if (!query.table_function) if (!query.table_function)
getContext()->checkAccess(AccessType::INSERT, query.table_id, query_sample_block.getNames()); getContext()->checkAccess(AccessType::INSERT, query.table_id, query_sample_block.getNames());
if (!allow_materialized) if (query.select && settings.parallel_distributed_insert_select)
// Distributed INSERT SELECT
distributed_pipeline = table->distributedWrite(query, getContext());
std::vector<Chain> presink_chains;
std::vector<Chain> sink_chains;
if (!distributed_pipeline)
{ {
for (const auto & column : metadata_snapshot->getColumns()) /// Number of streams works like this:
if (column.default_desc.kind == ColumnDefaultKind::Materialized && query_sample_block.has(column.name)) /// * For the SELECT, use `max_threads`, or `max_insert_threads`, or whatever
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert column {}, because it is MATERIALIZED column.", column.name); /// InterpreterSelectQuery ends up with.
/// * Use `max_insert_threads` streams for various insert-preparation steps, e.g.
/// materializing and squashing (too slow to do in one thread). That's `presink_chains`.
/// * If the table supports parallel inserts, use the same streams for writing to IStorage.
/// Otherwise ResizeProcessor them down to 1 stream.
/// * If it's not an INSERT SELECT, forget all that and use one stream.
size_t pre_streams_size = 1;
size_t sink_streams_size = 1;
if (query.select)
{
bool is_trivial_insert_select = false;
if (settings.optimize_trivial_insert_select)
{
const auto & select_query = query.select->as<ASTSelectWithUnionQuery &>();
const auto & selects = select_query.list_of_selects->children;
const auto & union_modes = select_query.list_of_modes;
/// ASTSelectWithUnionQuery is not normalized now, so it may pass some queries which can be Trivial select queries
const auto mode_is_all = [](const auto & mode) { return mode == SelectUnionMode::UNION_ALL; };
is_trivial_insert_select =
std::all_of(union_modes.begin(), union_modes.end(), std::move(mode_is_all))
&& std::all_of(selects.begin(), selects.end(), isTrivialSelect);
}
if (is_trivial_insert_select)
{
/** When doing trivial INSERT INTO ... SELECT ... FROM table,
* don't need to process SELECT with more than max_insert_threads
* and it's reasonable to set block size for SELECT to the desired block size for INSERT
* to avoid unnecessary squashing.
*/
Settings new_settings = getContext()->getSettings();
new_settings.max_threads = std::max<UInt64>(1, settings.max_insert_threads);
if (table->prefersLargeBlocks())
{
if (settings.min_insert_block_size_rows)
new_settings.max_block_size = settings.min_insert_block_size_rows;
if (settings.min_insert_block_size_bytes)
new_settings.preferred_block_size_bytes = settings.min_insert_block_size_bytes;
}
auto new_context = Context::createCopy(context);
new_context->setSettings(new_settings);
new_context->setInsertionTable(getContext()->getInsertionTable(), getContext()->getInsertionTableColumnNames());
auto select_query_options = SelectQueryOptions(QueryProcessingStage::Complete, 1);
if (settings.allow_experimental_analyzer)
{
InterpreterSelectQueryAnalyzer interpreter_select_analyzer(query.select, new_context, select_query_options);
pipeline = interpreter_select_analyzer.buildQueryPipeline();
}
else
{
InterpreterSelectWithUnionQuery interpreter_select(query.select, new_context, select_query_options);
pipeline = interpreter_select.buildQueryPipeline();
}
}
else
{
/// Passing 1 as subquery_depth will disable limiting size of intermediate result.
auto select_query_options = SelectQueryOptions(QueryProcessingStage::Complete, 1);
if (settings.allow_experimental_analyzer)
{
InterpreterSelectQueryAnalyzer interpreter_select_analyzer(query.select, getContext(), select_query_options);
pipeline = interpreter_select_analyzer.buildQueryPipeline();
}
else
{
InterpreterSelectWithUnionQuery interpreter_select(query.select, getContext(), select_query_options);
pipeline = interpreter_select.buildQueryPipeline();
}
}
pipeline.dropTotalsAndExtremes();
if (settings.max_insert_threads > 1)
{
auto table_id = table->getStorageID();
auto views = DatabaseCatalog::instance().getDependentViews(table_id);
/// It breaks some views-related tests and we have dedicated `parallel_view_processing` for views, so let's just skip them.
/// Also it doesn't make sense to reshuffle data if storage doesn't support parallel inserts.
const bool resize_to_max_insert_threads = !table->isView() && views.empty() && table->supportsParallelInsert();
pre_streams_size = resize_to_max_insert_threads ? settings.max_insert_threads
: std::min<size_t>(settings.max_insert_threads, pipeline.getNumStreams());
/// Deduplication when passing insert_deduplication_token breaks if using more than one thread
if (!settings.insert_deduplication_token.toString().empty())
{
LOG_DEBUG(
getLogger("InsertQuery"),
"Insert-select query using insert_deduplication_token, setting streams to 1 to avoid deduplication issues");
pre_streams_size = 1;
}
if (table->supportsParallelInsert())
sink_streams_size = pre_streams_size;
}
pipeline.resize(pre_streams_size);
/// Allow to insert Nullable into non-Nullable columns, NULL values will be added as defaults values.
if (getContext()->getSettingsRef().insert_null_as_default)
{
const auto & input_columns = pipeline.getHeader().getColumnsWithTypeAndName();
const auto & query_columns = query_sample_block.getColumnsWithTypeAndName();
const auto & output_columns = metadata_snapshot->getColumns();
if (input_columns.size() == query_columns.size())
{
for (size_t col_idx = 0; col_idx < query_columns.size(); ++col_idx)
{
/// Change query sample block columns to Nullable to allow inserting nullable columns, where NULL values will be substituted with
/// default column values (in AddingDefaultsTransform), so all values will be cast correctly.
if (isNullableOrLowCardinalityNullable(input_columns[col_idx].type)
&& !isNullableOrLowCardinalityNullable(query_columns[col_idx].type)
&& !isVariant(query_columns[col_idx].type)
&& !isDynamic(query_columns[col_idx].type)
&& output_columns.has(query_columns[col_idx].name))
query_sample_block.setColumn(col_idx, ColumnWithTypeAndName(makeNullableOrLowCardinalityNullable(query_columns[col_idx].column), makeNullableOrLowCardinalityNullable(query_columns[col_idx].type), query_columns[col_idx].name));
}
}
}
}
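The "Number of streams works like this" comment and the max_insert_threads branch above boil down to a small decision table. The sketch below restates it as a standalone function; the struct, parameter names and the function itself are simplified stand-ins for illustration, not ClickHouse code.

#include <algorithm>
#include <cstddef>
#include <cassert>

struct InsertStreamPlan { size_t pre_streams = 1; size_t sink_streams = 1; };

// Mirrors the stream-sizing rules of execute(): one stream for a plain INSERT,
// up to max_insert_threads presink streams for INSERT SELECT, sinks widened only
// when the storage supports parallel inserts, and everything forced back to one
// stream when an insert_deduplication_token is used.
InsertStreamPlan planStreams(
    bool is_insert_select,
    size_t max_insert_threads,
    size_t select_streams,
    bool table_is_view_or_has_views,
    bool supports_parallel_insert,
    bool has_dedup_token)
{
    InsertStreamPlan plan;
    if (!is_insert_select || max_insert_threads <= 1)
        return plan;                                        // plain INSERT: single stream

    plan.pre_streams = (!table_is_view_or_has_views && supports_parallel_insert)
        ? max_insert_threads
        : std::min(max_insert_threads, select_streams);

    if (has_dedup_token)                                    // deduplication breaks with >1 stream
        plan.pre_streams = 1;

    if (supports_parallel_insert)
        plan.sink_streams = plan.pre_streams;               // otherwise keep a single sink
    return plan;
}

int main()
{
    auto p = planStreams(/*is_insert_select*/ true, /*max_insert_threads*/ 4,
                         /*select_streams*/ 8, /*views*/ false,
                         /*parallel_insert*/ true, /*dedup*/ false);
    assert(p.pre_streams == 4 && p.sink_streams == 4);
}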
ThreadGroupPtr running_group;
if (current_thread)
running_group = current_thread->getThreadGroup();
if (!running_group)
running_group = std::make_shared<ThreadGroup>(getContext());
for (size_t i = 0; i < sink_streams_size; ++i)
{
auto out = buildSink(table, metadata_snapshot, /* thread_status_holder= */ nullptr,
running_group, /* elapsed_counter_ms= */ nullptr);
sink_chains.emplace_back(std::move(out));
}
for (size_t i = 0; i < pre_streams_size; ++i)
{
auto out = buildPreSinkChain(sink_chains[0].getInputHeader(), table, metadata_snapshot, query_sample_block);
presink_chains.emplace_back(std::move(out));
}
} }
BlockIO res; BlockIO res;
if (query.select) /// What type of query: INSERT or INSERT SELECT or INSERT WATCH?
if (distributed_pipeline)
{ {
if (settings.parallel_distributed_insert_select) res.pipeline = std::move(*distributed_pipeline);
}
else if (query.select)
{
const auto & header = presink_chains.at(0).getInputHeader();
auto actions_dag = ActionsDAG::makeConvertingActions(
pipeline.getHeader().getColumnsWithTypeAndName(),
header.getColumnsWithTypeAndName(),
ActionsDAG::MatchColumnsMode::Position);
auto actions = std::make_shared<ExpressionActions>(actions_dag, ExpressionActionsSettings::fromContext(getContext(), CompileExpressions::yes));
pipeline.addSimpleTransform([&](const Block & in_header) -> ProcessorPtr
{ {
auto distributed = table->distributedWrite(query, getContext()); return std::make_shared<ExpressionTransform>(in_header, actions);
if (distributed) });
{
res.pipeline = std::move(*distributed); /// We need to convert Sparse columns to full, because it's destination storage
} /// may not support it or may have different settings for applying Sparse serialization.
else pipeline.addSimpleTransform([&](const Block & in_header) -> ProcessorPtr
{
res.pipeline = buildInsertSelectPipeline(query, table);
}
}
else
{ {
res.pipeline = buildInsertSelectPipeline(query, table); return std::make_shared<MaterializingTransform>(in_header);
});
pipeline.addSimpleTransform([&](const Block & in_header) -> ProcessorPtr
{
auto context_ptr = getContext();
auto counting = std::make_shared<CountingTransform>(in_header, nullptr, context_ptr->getQuota());
counting->setProcessListElement(context_ptr->getProcessListElement());
counting->setProgressCallback(context_ptr->getProgressCallback());
return counting;
});
if (shouldAddSquashingFroStorage(table))
{
bool table_prefers_large_blocks = table->prefersLargeBlocks();
size_t threads = presink_chains.size();
pipeline.resize(1);
pipeline.addTransform(std::make_shared<PlanSquashingTransform>(
header,
table_prefers_large_blocks ? settings.min_insert_block_size_rows : settings.max_block_size,
table_prefers_large_blocks ? settings.min_insert_block_size_bytes : 0ULL));
pipeline.resize(threads);
pipeline.addSimpleTransform([&](const Block & in_header) -> ProcessorPtr
{
return std::make_shared<ApplySquashingTransform>(
in_header,
table_prefers_large_blocks ? settings.min_insert_block_size_rows : settings.max_block_size,
table_prefers_large_blocks ? settings.min_insert_block_size_bytes : 0ULL);
});
} }
size_t num_select_threads = pipeline.getNumThreads();
for (auto & chain : presink_chains)
resources = chain.detachResources();
for (auto & chain : sink_chains)
resources = chain.detachResources();
pipeline.addChains(std::move(presink_chains));
pipeline.resize(sink_chains.size());
pipeline.addChains(std::move(sink_chains));
if (!settings.parallel_view_processing)
{
/// Don't use more threads for INSERT than for SELECT to reduce memory consumption.
if (pipeline.getNumThreads() > num_select_threads)
pipeline.setMaxThreads(num_select_threads);
}
else if (pipeline.getNumThreads() < settings.max_threads)
{
/// It is possible for query to have max_threads=1, due to optimize_trivial_insert_select,
/// however in case of parallel_view_processing and multiple views, views can still be processed in parallel.
///
/// Note, number of threads will be limited by buildPushingToViewsChain() to max_threads.
pipeline.setMaxThreads(settings.max_threads);
}
pipeline.setSinks([&](const Block & cur_header, QueryPipelineBuilder::StreamType) -> ProcessorPtr
{
return std::make_shared<EmptySink>(cur_header);
});
if (!allow_materialized)
{
for (const auto & column : metadata_snapshot->getColumns())
if (column.default_desc.kind == ColumnDefaultKind::Materialized && header.has(column.name))
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert column {}, because it is MATERIALIZED column.", column.name);
}
res.pipeline = QueryPipelineBuilder::getPipeline(std::move(pipeline));
} }
else else
{ {
res.pipeline = buildInsertPipeline(query, table); auto & chain = presink_chains.at(0);
chain.appendChain(std::move(sink_chains.at(0)));
if (shouldAddSquashingFroStorage(table))
{
bool table_prefers_large_blocks = table->prefersLargeBlocks();
auto squashing = std::make_shared<ApplySquashingTransform>(
chain.getInputHeader(),
table_prefers_large_blocks ? settings.min_insert_block_size_rows : settings.max_block_size,
table_prefers_large_blocks ? settings.min_insert_block_size_bytes : 0ULL);
chain.addSource(std::move(squashing));
auto balancing = std::make_shared<PlanSquashingTransform>(
chain.getInputHeader(),
table_prefers_large_blocks ? settings.min_insert_block_size_rows : settings.max_block_size,
table_prefers_large_blocks ? settings.min_insert_block_size_bytes : 0ULL);
chain.addSource(std::move(balancing));
}
auto context_ptr = getContext();
auto counting = std::make_shared<CountingTransform>(chain.getInputHeader(), nullptr, context_ptr->getQuota());
counting->setProcessListElement(context_ptr->getProcessListElement());
counting->setProgressCallback(context_ptr->getProgressCallback());
chain.addSource(std::move(counting));
res.pipeline = QueryPipeline(std::move(presink_chains[0]));
res.pipeline.setNumThreads(std::min<size_t>(res.pipeline.getNumThreads(), settings.max_threads));
res.pipeline.setConcurrencyControl(settings.use_concurrency_control);
if (query.hasInlinedData() && !async_insert)
{
/// can execute without additional data
auto format = getInputFormatFromASTInsertQuery(query_ptr, true, query_sample_block, getContext(), nullptr);
for (auto && buffer : owned_buffers)
format->addBuffer(std::move(buffer));
auto pipe = getSourceFromInputFormat(query_ptr, std::move(format), getContext(), nullptr);
res.pipeline.complete(std::move(pipe));
}
} }
res.pipeline.addStorageHolder(table); res.pipeline.addResources(std::move(resources));
if (const auto * mv = dynamic_cast<const StorageMaterializedView *>(table.get())) res.pipeline.addStorageHolder(table);
res.pipeline.addStorageHolder(mv->getTargetTable()); if (inner_table)
res.pipeline.addStorageHolder(inner_table);
return res; return res;
} }
@ -813,27 +757,17 @@ void InterpreterInsertQuery::extendQueryLogElemImpl(QueryLogElement & elem, Cont
} }
} }
void InterpreterInsertQuery::extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr &, ContextPtr context_) const void InterpreterInsertQuery::extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr &, ContextPtr context_) const
{ {
extendQueryLogElemImpl(elem, context_); extendQueryLogElemImpl(elem, context_);
} }
void registerInterpreterInsertQuery(InterpreterFactory & factory) void registerInterpreterInsertQuery(InterpreterFactory & factory)
{ {
auto create_fn = [] (const InterpreterFactory::Arguments & args) auto create_fn = [] (const InterpreterFactory::Arguments & args)
{ {
return std::make_unique<InterpreterInsertQuery>( return std::make_unique<InterpreterInsertQuery>(args.query, args.context, args.allow_materialized);
args.query,
args.context,
args.allow_materialized,
/* no_squash */false,
/* no_destination */false,
/* async_insert */false);
}; };
factory.registerInterpreter("InterpreterInsertQuery", create_fn); factory.registerInterpreter("InterpreterInsertQuery", create_fn);
} }
} }

View File

@ -23,10 +23,10 @@ public:
InterpreterInsertQuery( InterpreterInsertQuery(
const ASTPtr & query_ptr_, const ASTPtr & query_ptr_,
ContextPtr context_, ContextPtr context_,
bool allow_materialized_, bool allow_materialized_ = false,
bool no_squash_, bool no_squash_ = false,
bool no_destination, bool no_destination_ = false,
bool async_insert_); bool async_insert_ = false);
/** Prepare a request for execution. Return block streams /** Prepare a request for execution. Return block streams
* - the stream into which you can write data to execute the query, if INSERT; * - the stream into which you can write data to execute the query, if INSERT;
@ -73,17 +73,12 @@ private:
ASTPtr query_ptr; ASTPtr query_ptr;
const bool allow_materialized; const bool allow_materialized;
bool no_squash = false; const bool no_squash;
bool no_destination = false; const bool no_destination;
const bool async_insert; const bool async_insert;
std::vector<std::unique_ptr<ReadBuffer>> owned_buffers; std::vector<std::unique_ptr<ReadBuffer>> owned_buffers;
std::pair<std::vector<Chain>, std::vector<Chain>> buildPreAndSinkChains(size_t presink_streams, size_t sink_streams, StoragePtr table, const StorageMetadataPtr & metadata_snapshot, const Block & query_sample_block);
QueryPipeline buildInsertSelectPipeline(ASTInsertQuery & query, StoragePtr table);
QueryPipeline buildInsertPipeline(ASTInsertQuery & query, StoragePtr table);
Chain buildSink( Chain buildSink(
const StoragePtr & table, const StoragePtr & table,
const StorageMetadataPtr & metadata_snapshot, const StorageMetadataPtr & metadata_snapshot,

View File

@ -1,7 +1,6 @@
#include <vector> #include <vector>
#include <Interpreters/Squashing.h> #include <Interpreters/Squashing.h>
#include <Common/CurrentThread.h> #include <Common/CurrentThread.h>
#include <base/defines.h>
namespace DB namespace DB
@ -12,33 +11,24 @@ namespace ErrorCodes
} }
Squashing::Squashing(Block header_, size_t min_block_size_rows_, size_t min_block_size_bytes_) Squashing::Squashing(Block header_, size_t min_block_size_rows_, size_t min_block_size_bytes_)
: min_block_size_rows(min_block_size_rows_) : header(header_)
, min_block_size_rows(min_block_size_rows_)
, min_block_size_bytes(min_block_size_bytes_) , min_block_size_bytes(min_block_size_bytes_)
, header(header_)
{ {
} }
Chunk Squashing::flush() Chunk Squashing::flush()
{ {
if (!accumulated) return convertToChunk(std::move(chunks_to_merge_vec));
return {};
auto result = convertToChunk(accumulated.extract());
chassert(result);
return result;
} }
Chunk Squashing::squash(Chunk && input_chunk) Chunk Squashing::squash(Chunk && input_chunk)
{ {
if (!input_chunk) if (!input_chunk.hasChunkInfo())
return Chunk(); return Chunk();
auto squash_info = input_chunk.getChunkInfos().extract<ChunksToSquash>(); const auto *info = getInfoFromChunk(input_chunk);
return squash(info->chunks);
if (!squash_info)
throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no ChunksToSquash in ChunkInfoPtr");
return squash(std::move(squash_info->chunks), std::move(input_chunk.getChunkInfos()));
} }
Chunk Squashing::add(Chunk && input_chunk) Chunk Squashing::add(Chunk && input_chunk)
@ -47,37 +37,48 @@ Chunk Squashing::add(Chunk && input_chunk)
return {}; return {};
/// Just read block is already enough. /// Just read block is already enough.
if (isEnoughSize(input_chunk)) if (isEnoughSize(input_chunk.getNumRows(), input_chunk.bytes()))
{ {
/// If no accumulated data, return just read block. /// If no accumulated data, return just read block.
if (!accumulated) if (chunks_to_merge_vec.empty())
{ {
accumulated.add(std::move(input_chunk)); chunks_to_merge_vec.push_back(std::move(input_chunk));
return convertToChunk(accumulated.extract()); Chunk res_chunk = convertToChunk(std::move(chunks_to_merge_vec));
chunks_to_merge_vec.clear();
return res_chunk;
} }
/// Return accumulated data (maybe it has small size) and place new block to accumulated data. /// Return accumulated data (maybe it has small size) and place new block to accumulated data.
Chunk res_chunk = convertToChunk(accumulated.extract()); Chunk res_chunk = convertToChunk(std::move(chunks_to_merge_vec));
accumulated.add(std::move(input_chunk)); chunks_to_merge_vec.clear();
changeCurrentSize(input_chunk.getNumRows(), input_chunk.bytes());
chunks_to_merge_vec.push_back(std::move(input_chunk));
return res_chunk; return res_chunk;
} }
/// Accumulated block is already enough. /// Accumulated block is already enough.
if (isEnoughSize()) if (isEnoughSize(accumulated_size.rows, accumulated_size.bytes))
{ {
/// Return accumulated data and place new block to accumulated data. /// Return accumulated data and place new block to accumulated data.
Chunk res_chunk = convertToChunk(accumulated.extract()); Chunk res_chunk = convertToChunk(std::move(chunks_to_merge_vec));
accumulated.add(std::move(input_chunk)); chunks_to_merge_vec.clear();
changeCurrentSize(input_chunk.getNumRows(), input_chunk.bytes());
chunks_to_merge_vec.push_back(std::move(input_chunk));
return res_chunk; return res_chunk;
} }
/// Pushing data into accumulating vector /// Pushing data into accumulating vector
accumulated.add(std::move(input_chunk)); expandCurrentSize(input_chunk.getNumRows(), input_chunk.bytes());
chunks_to_merge_vec.push_back(std::move(input_chunk));
/// If accumulated data is big enough, we send it /// If accumulated data is big enough, we send it
if (isEnoughSize()) if (isEnoughSize(accumulated_size.rows, accumulated_size.bytes))
return convertToChunk(accumulated.extract()); {
Chunk res_chunk = convertToChunk(std::move(chunks_to_merge_vec));
changeCurrentSize(0, 0);
chunks_to_merge_vec.clear();
return res_chunk;
}
return {}; return {};
} }
@ -89,15 +90,14 @@ Chunk Squashing::convertToChunk(std::vector<Chunk> && chunks) const
auto info = std::make_shared<ChunksToSquash>(); auto info = std::make_shared<ChunksToSquash>();
info->chunks = std::move(chunks); info->chunks = std::move(chunks);
// It is important that chunk is not empty, it has to have columns even if they are empty chunks.clear();
auto aggr_chunk = Chunk(header.getColumns(), 0);
aggr_chunk.getChunkInfos().add(std::move(info)); return Chunk(header.cloneEmptyColumns(), 0, info);
chassert(aggr_chunk);
return aggr_chunk;
} }
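Both versions of convertToChunk above use the same trick: the accumulated chunks are not merged right away, they are parked inside a zero-row "carrier" chunk whose ChunksToSquash info owns them, and a later squash() step does the actual merge. The toy below sketches that hand-off under simplified stand-in types (ToyChunk carries only a row count and an info pointer).

#include <memory>
#include <utility>
#include <vector>
#include <cassert>

struct ToyChunk
{
    size_t rows = 0;
    std::shared_ptr<const struct ChunksToSquash> info;   // side-channel payload
};

struct ChunksToSquash
{
    std::vector<ToyChunk> chunks;
};

// Park accumulated chunks inside a zero-row carrier; only the info matters downstream.
ToyChunk convertToChunk(std::vector<ToyChunk> && chunks)
{
    auto info = std::make_shared<ChunksToSquash>();
    info->chunks = std::move(chunks);
    ToyChunk carrier;
    carrier.info = std::move(info);
    return carrier;
}

// Later stage: unwrap the carrier and merge (real code concatenates columns here).
ToyChunk squash(ToyChunk && carrier)
{
    ToyChunk merged;
    if (!carrier.info)
        return merged;
    for (const auto & chunk : carrier.info->chunks)
        merged.rows += chunk.rows;
    return merged;
}

int main()
{
    std::vector<ToyChunk> accumulated{{10, nullptr}, {20, nullptr}, {5, nullptr}};
    ToyChunk carrier = convertToChunk(std::move(accumulated));
    assert(carrier.rows == 0 && carrier.info);
    assert(squash(std::move(carrier)).rows == 35);
}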
Chunk Squashing::squash(std::vector<Chunk> && input_chunks, Chunk::ChunkInfoCollection && infos) Chunk Squashing::squash(std::vector<Chunk> & input_chunks)
{ {
Chunk accumulated_chunk;
std::vector<IColumn::MutablePtr> mutable_columns = {}; std::vector<IColumn::MutablePtr> mutable_columns = {};
size_t rows = 0; size_t rows = 0;
for (const Chunk & chunk : input_chunks) for (const Chunk & chunk : input_chunks)
@ -119,17 +119,35 @@ Chunk Squashing::squash(std::vector<Chunk> && input_chunks, Chunk::ChunkInfoColl
for (size_t j = 0, size = mutable_columns.size(); j < size; ++j) for (size_t j = 0, size = mutable_columns.size(); j < size; ++j)
{ {
const auto source_column = columns[j]; const auto source_column = columns[j];
mutable_columns[j]->insertRangeFrom(*source_column, 0, source_column->size()); mutable_columns[j]->insertRangeFrom(*source_column, 0, source_column->size());
} }
} }
accumulated_chunk.setColumns(std::move(mutable_columns), rows);
return accumulated_chunk;
}
Chunk result; const ChunksToSquash* Squashing::getInfoFromChunk(const Chunk & chunk)
result.setColumns(std::move(mutable_columns), rows); {
result.setChunkInfos(infos); const auto& info = chunk.getChunkInfo();
result.getChunkInfos().append(std::move(input_chunks.back().getChunkInfos())); const auto * agg_info = typeid_cast<const ChunksToSquash *>(info.get());
chassert(result); if (!agg_info)
return result; throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no ChunksToSquash in ChunkInfoPtr");
return agg_info;
}
void Squashing::expandCurrentSize(size_t rows, size_t bytes)
{
accumulated_size.rows += rows;
accumulated_size.bytes += bytes;
}
void Squashing::changeCurrentSize(size_t rows, size_t bytes)
{
accumulated_size.rows = rows;
accumulated_size.bytes = bytes;
} }
bool Squashing::isEnoughSize(size_t rows, size_t bytes) const bool Squashing::isEnoughSize(size_t rows, size_t bytes) const
@ -138,28 +156,4 @@ bool Squashing::isEnoughSize(size_t rows, size_t bytes) const
|| (min_block_size_rows && rows >= min_block_size_rows) || (min_block_size_rows && rows >= min_block_size_rows)
|| (min_block_size_bytes && bytes >= min_block_size_bytes); || (min_block_size_bytes && bytes >= min_block_size_bytes);
} }
bool Squashing::isEnoughSize() const
{
return isEnoughSize(accumulated.getRows(), accumulated.getBytes());
};
bool Squashing::isEnoughSize(const Chunk & chunk) const
{
return isEnoughSize(chunk.getNumRows(), chunk.bytes());
}
void Squashing::CurrentSize::add(Chunk && chunk)
{
rows += chunk.getNumRows();
bytes += chunk.bytes();
chunks.push_back(std::move(chunk));
}
std::vector<Chunk> Squashing::CurrentSize::extract()
{
auto result = std::move(chunks);
*this = {};
return result;
}
} }

View File

@ -8,18 +8,9 @@
namespace DB namespace DB
{ {
class ChunksToSquash : public ChunkInfoCloneable<ChunksToSquash> struct ChunksToSquash : public ChunkInfo
{ {
public: mutable std::vector<Chunk> chunks = {};
ChunksToSquash() = default;
ChunksToSquash(const ChunksToSquash & other)
{
chunks.reserve(other.chunks.size());
for (const auto & chunk: other.chunks)
chunks.push_back(chunk.clone());
}
std::vector<Chunk> chunks = {};
}; };
/** Merging consecutive passed blocks to specified minimum size. /** Merging consecutive passed blocks to specified minimum size.
@ -45,35 +36,32 @@ public:
static Chunk squash(Chunk && input_chunk); static Chunk squash(Chunk && input_chunk);
Chunk flush(); Chunk flush();
void setHeader(Block header_) { header = std::move(header_); } bool isDataLeft()
const Block & getHeader() const { return header; } {
return !chunks_to_merge_vec.empty();
private: }
class CurrentSize
Block header;
private:
struct CurrentSize
{ {
std::vector<Chunk> chunks = {};
size_t rows = 0; size_t rows = 0;
size_t bytes = 0; size_t bytes = 0;
public:
explicit operator bool () const { return !chunks.empty(); }
size_t getRows() const { return rows; }
size_t getBytes() const { return bytes; }
void add(Chunk && chunk);
std::vector<Chunk> extract();
}; };
const size_t min_block_size_rows; std::vector<Chunk> chunks_to_merge_vec = {};
const size_t min_block_size_bytes; size_t min_block_size_rows;
Block header; size_t min_block_size_bytes;
CurrentSize accumulated; CurrentSize accumulated_size;
static Chunk squash(std::vector<Chunk> && input_chunks, Chunk::ChunkInfoCollection && infos); static const ChunksToSquash * getInfoFromChunk(const Chunk & chunk);
bool isEnoughSize() const; static Chunk squash(std::vector<Chunk> & input_chunks);
void expandCurrentSize(size_t rows, size_t bytes);
void changeCurrentSize(size_t rows, size_t bytes);
bool isEnoughSize(size_t rows, size_t bytes) const; bool isEnoughSize(size_t rows, size_t bytes) const;
bool isEnoughSize(const Chunk & chunk) const;
Chunk convertToChunk(std::vector<Chunk> && chunks) const; Chunk convertToChunk(std::vector<Chunk> && chunks) const;
}; };
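Whichever side of the diff above, Squashing is an accumulate-until-threshold buffer driven by min_block_size_rows / min_block_size_bytes. The self-contained toy below shows that behaviour, including the isEnoughSize condition ("no limits at all, or any limit reached"); ToyChunk and ToySquashing are simplified stand-ins, not the real classes.

#include <cstddef>
#include <optional>
#include <cassert>

struct ToyChunk { size_t rows = 0; size_t bytes = 0; };

class ToySquashing
{
public:
    ToySquashing(size_t min_rows_, size_t min_bytes_) : min_rows(min_rows_), min_bytes(min_bytes_) {}

    /// Returns a merged chunk once enough rows/bytes have accumulated, otherwise nothing.
    std::optional<ToyChunk> add(ToyChunk chunk)
    {
        acc.rows += chunk.rows;
        acc.bytes += chunk.bytes;
        if (isEnoughSize(acc.rows, acc.bytes))
        {
            ToyChunk ready = acc;
            acc = {};
            return ready;
        }
        return std::nullopt;
    }

    /// Flush whatever is left (used at end of stream).
    ToyChunk flush() { ToyChunk ready = acc; acc = {}; return ready; }

private:
    bool isEnoughSize(size_t rows, size_t bytes) const
    {
        /// Mirrors Squashing::isEnoughSize: no limits configured, or any limit reached.
        return (!min_rows && !min_bytes)
            || (min_rows && rows >= min_rows)
            || (min_bytes && bytes >= min_bytes);
    }

    size_t min_rows;
    size_t min_bytes;
    ToyChunk acc;
};

int main()
{
    ToySquashing sq(/*min_rows*/ 100, /*min_bytes*/ 0);
    assert(!sq.add({40, 1000}));            // not enough yet
    auto merged = sq.add({70, 2000});       // 110 rows >= 100 -> emitted
    assert(merged && merged->rows == 110);
}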

View File

@ -538,13 +538,7 @@ void SystemLog<LogElement>::flushImpl(const std::vector<LogElement> & to_flush,
insert_context->makeQueryContext(); insert_context->makeQueryContext();
addSettingsForQuery(insert_context, IAST::QueryKind::Insert); addSettingsForQuery(insert_context, IAST::QueryKind::Insert);
InterpreterInsertQuery interpreter( InterpreterInsertQuery interpreter(query_ptr, insert_context);
query_ptr,
insert_context,
/* allow_materialized */ false,
/* no_squash */ false,
/* no_destination */ false,
/* async_insert */ false);
BlockIO io = interpreter.execute(); BlockIO io = interpreter.execute();
PushingPipelineExecutor executor(io.pipeline); PushingPipelineExecutor executor(io.pipeline);

View File

@ -1188,7 +1188,7 @@ bool TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select
} }
} }
/// Check for dynamic subcolumns in unknown required columns. /// Check for dynamic subcolums in unknown required columns.
if (!unknown_required_source_columns.empty()) if (!unknown_required_source_columns.empty())
{ {
for (const NameAndTypePair & pair : source_columns_ordinary) for (const NameAndTypePair & pair : source_columns_ordinary)

View File

@ -1129,11 +1129,11 @@ inline static bool makeHexOrBinStringLiteral(IParser::Pos & pos, ASTPtr & node,
if (hex) if (hex)
{ {
hexStringDecode(str_begin, str_end, res_pos); hexStringDecode(str_begin, str_end, res_pos, word_size);
} }
else else
{ {
binStringDecode(str_begin, str_end, res_pos); binStringDecode(str_begin, str_end, res_pos, word_size);
} }
return makeStringLiteral(pos, node, String(reinterpret_cast<char *>(res.data()), (res_pos - res_begin - 1))); return makeStringLiteral(pos, node, String(reinterpret_cast<char *>(res.data()), (res_pos - res_begin - 1)));

View File

@ -19,6 +19,14 @@ Chunk::Chunk(DB::Columns columns_, UInt64 num_rows_) : columns(std::move(columns
checkNumRowsIsConsistent(); checkNumRowsIsConsistent();
} }
Chunk::Chunk(Columns columns_, UInt64 num_rows_, ChunkInfoPtr chunk_info_)
: columns(std::move(columns_))
, num_rows(num_rows_)
, chunk_info(std::move(chunk_info_))
{
checkNumRowsIsConsistent();
}
static Columns unmuteColumns(MutableColumns && mutable_columns) static Columns unmuteColumns(MutableColumns && mutable_columns)
{ {
Columns columns; Columns columns;
@ -35,11 +43,17 @@ Chunk::Chunk(MutableColumns columns_, UInt64 num_rows_)
checkNumRowsIsConsistent(); checkNumRowsIsConsistent();
} }
Chunk::Chunk(MutableColumns columns_, UInt64 num_rows_, ChunkInfoPtr chunk_info_)
: columns(unmuteColumns(std::move(columns_)))
, num_rows(num_rows_)
, chunk_info(std::move(chunk_info_))
{
checkNumRowsIsConsistent();
}
Chunk Chunk::clone() const Chunk Chunk::clone() const
{ {
auto tmp = Chunk(getColumns(), getNumRows()); return Chunk(getColumns(), getNumRows(), chunk_info);
tmp.setChunkInfos(chunk_infos.clone());
return tmp;
} }
void Chunk::setColumns(Columns columns_, UInt64 num_rows_) void Chunk::setColumns(Columns columns_, UInt64 num_rows_)

View File

@ -1,9 +1,7 @@
#pragma once #pragma once
#include <Common/CollectionOfDerived.h>
#include <Columns/IColumn.h> #include <Columns/IColumn.h>
#include <unordered_map>
#include <memory>
namespace DB namespace DB
{ {
@ -11,29 +9,11 @@ namespace DB
class ChunkInfo class ChunkInfo
{ {
public: public:
using Ptr = std::shared_ptr<ChunkInfo>;
ChunkInfo() = default;
ChunkInfo(const ChunkInfo&) = default;
ChunkInfo(ChunkInfo&&) = default;
virtual Ptr clone() const = 0;
virtual ~ChunkInfo() = default; virtual ~ChunkInfo() = default;
ChunkInfo() = default;
}; };
using ChunkInfoPtr = std::shared_ptr<const ChunkInfo>;
template<class Derived>
class ChunkInfoCloneable : public ChunkInfo
{
public:
ChunkInfoCloneable() = default;
ChunkInfoCloneable(const ChunkInfoCloneable & other) = default;
Ptr clone() const override
{
return std::static_pointer_cast<ChunkInfo>(std::make_shared<Derived>(*static_cast<const Derived*>(this)));
}
};
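The removed ChunkInfoCloneable above is the classic CRTP clone idiom: each derived info type gets a clone() that copies the most-derived object without hand-written boilerplate. Below is a standalone sketch of the same idiom with illustrative names (Info, CloneableInfo, PartLevelInfo), not the ClickHouse classes themselves.

#include <cstddef>
#include <memory>
#include <cassert>

struct Info
{
    virtual ~Info() = default;
    virtual std::shared_ptr<Info> clone() const = 0;
};

template <class Derived>
struct CloneableInfo : Info
{
    std::shared_ptr<Info> clone() const override
    {
        // static_cast is safe: Derived is exactly the class that inherited from CloneableInfo<Derived>.
        return std::make_shared<Derived>(*static_cast<const Derived *>(this));
    }
};

struct PartLevelInfo : CloneableInfo<PartLevelInfo>
{
    size_t level = 0;
};

int main()
{
    PartLevelInfo original;
    original.level = 3;
    std::shared_ptr<Info> copy = original.clone();
    assert(dynamic_cast<PartLevelInfo &>(*copy).level == 3);
}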
/** /**
* Chunk is a list of columns with the same length. * Chunk is a list of columns with the same length.
@ -52,26 +32,26 @@ public:
class Chunk class Chunk
{ {
public: public:
using ChunkInfoCollection = CollectionOfDerivedItems<ChunkInfo>;
Chunk() = default; Chunk() = default;
Chunk(const Chunk & other) = delete; Chunk(const Chunk & other) = delete;
Chunk(Chunk && other) noexcept Chunk(Chunk && other) noexcept
: columns(std::move(other.columns)) : columns(std::move(other.columns))
, num_rows(other.num_rows) , num_rows(other.num_rows)
, chunk_infos(std::move(other.chunk_infos)) , chunk_info(std::move(other.chunk_info))
{ {
other.num_rows = 0; other.num_rows = 0;
} }
Chunk(Columns columns_, UInt64 num_rows_); Chunk(Columns columns_, UInt64 num_rows_);
Chunk(Columns columns_, UInt64 num_rows_, ChunkInfoPtr chunk_info_);
Chunk(MutableColumns columns_, UInt64 num_rows_); Chunk(MutableColumns columns_, UInt64 num_rows_);
Chunk(MutableColumns columns_, UInt64 num_rows_, ChunkInfoPtr chunk_info_);
Chunk & operator=(const Chunk & other) = delete; Chunk & operator=(const Chunk & other) = delete;
Chunk & operator=(Chunk && other) noexcept Chunk & operator=(Chunk && other) noexcept
{ {
columns = std::move(other.columns); columns = std::move(other.columns);
chunk_infos = std::move(other.chunk_infos); chunk_info = std::move(other.chunk_info);
num_rows = other.num_rows; num_rows = other.num_rows;
other.num_rows = 0; other.num_rows = 0;
return *this; return *this;
@ -82,15 +62,15 @@ public:
void swap(Chunk & other) noexcept void swap(Chunk & other) noexcept
{ {
columns.swap(other.columns); columns.swap(other.columns);
chunk_info.swap(other.chunk_info);
std::swap(num_rows, other.num_rows); std::swap(num_rows, other.num_rows);
chunk_infos.swap(other.chunk_infos);
} }
void clear() void clear()
{ {
num_rows = 0; num_rows = 0;
columns.clear(); columns.clear();
chunk_infos.clear(); chunk_info.reset();
} }
const Columns & getColumns() const { return columns; } const Columns & getColumns() const { return columns; }
@ -101,9 +81,9 @@ public:
/** Get empty columns with the same types as in block. */ /** Get empty columns with the same types as in block. */
MutableColumns cloneEmptyColumns() const; MutableColumns cloneEmptyColumns() const;
ChunkInfoCollection & getChunkInfos() { return chunk_infos; } const ChunkInfoPtr & getChunkInfo() const { return chunk_info; }
const ChunkInfoCollection & getChunkInfos() const { return chunk_infos; } bool hasChunkInfo() const { return chunk_info != nullptr; }
void setChunkInfos(ChunkInfoCollection chunk_infos_) { chunk_infos = std::move(chunk_infos_); } void setChunkInfo(ChunkInfoPtr chunk_info_) { chunk_info = std::move(chunk_info_); }
UInt64 getNumRows() const { return num_rows; } UInt64 getNumRows() const { return num_rows; }
UInt64 getNumColumns() const { return columns.size(); } UInt64 getNumColumns() const { return columns.size(); }
@ -127,7 +107,7 @@ public:
private: private:
Columns columns; Columns columns;
UInt64 num_rows = 0; UInt64 num_rows = 0;
ChunkInfoCollection chunk_infos; ChunkInfoPtr chunk_info;
void checkNumRowsIsConsistent(); void checkNumRowsIsConsistent();
}; };
@ -137,15 +117,11 @@ using Chunks = std::vector<Chunk>;
/// AsyncInsert needs two kinds of information: /// AsyncInsert needs two kinds of information:
/// - offsets of different sub-chunks /// - offsets of different sub-chunks
/// - tokens of different sub-chunks, which are assigned by setting `insert_deduplication_token`. /// - tokens of different sub-chunks, which are assigned by setting `insert_deduplication_token`.
class AsyncInsertInfo : public ChunkInfoCloneable<AsyncInsertInfo> class AsyncInsertInfo : public ChunkInfo
{ {
public: public:
AsyncInsertInfo() = default; AsyncInsertInfo() = default;
AsyncInsertInfo(const AsyncInsertInfo & other) = default; explicit AsyncInsertInfo(const std::vector<size_t> & offsets_, const std::vector<String> & tokens_) : offsets(offsets_), tokens(tokens_) {}
AsyncInsertInfo(const std::vector<size_t> & offsets_, const std::vector<String> & tokens_)
: offsets(offsets_)
, tokens(tokens_)
{}
std::vector<size_t> offsets; std::vector<size_t> offsets;
std::vector<String> tokens; std::vector<String> tokens;
@ -154,11 +130,9 @@ public:
using AsyncInsertInfoPtr = std::shared_ptr<AsyncInsertInfo>; using AsyncInsertInfoPtr = std::shared_ptr<AsyncInsertInfo>;
/// Extension to support delayed defaults. AddingDefaultsProcessor uses it to replace missing values with column defaults. /// Extension to support delayed defaults. AddingDefaultsProcessor uses it to replace missing values with column defaults.
class ChunkMissingValues : public ChunkInfoCloneable<ChunkMissingValues> class ChunkMissingValues : public ChunkInfo
{ {
public: public:
ChunkMissingValues(const ChunkMissingValues & other) = default;
using RowsBitMask = std::vector<bool>; /// a bit per row for a column using RowsBitMask = std::vector<bool>; /// a bit per row for a column
const RowsBitMask & getDefaultsBitmask(size_t column_idx) const; const RowsBitMask & getDefaultsBitmask(size_t column_idx) const;

View File

@ -147,10 +147,13 @@ bool PullingAsyncPipelineExecutor::pull(Block & block, uint64_t milliseconds)
block = lazy_format->getPort(IOutputFormat::PortKind::Main).getHeader().cloneWithColumns(chunk.detachColumns()); block = lazy_format->getPort(IOutputFormat::PortKind::Main).getHeader().cloneWithColumns(chunk.detachColumns());
if (auto agg_info = chunk.getChunkInfos().get<AggregatedChunkInfo>()) if (auto chunk_info = chunk.getChunkInfo())
{ {
block.info.bucket_num = agg_info->bucket_num; if (const auto * agg_info = typeid_cast<const AggregatedChunkInfo *>(chunk_info.get()))
block.info.is_overflows = agg_info->is_overflows; {
block.info.bucket_num = agg_info->bucket_num;
block.info.is_overflows = agg_info->is_overflows;
}
} }
return true; return true;

View File

@ -73,10 +73,13 @@ bool PullingPipelineExecutor::pull(Block & block)
} }
block = pulling_format->getPort(IOutputFormat::PortKind::Main).getHeader().cloneWithColumns(chunk.detachColumns()); block = pulling_format->getPort(IOutputFormat::PortKind::Main).getHeader().cloneWithColumns(chunk.detachColumns());
if (auto agg_info = chunk.getChunkInfos().get<AggregatedChunkInfo>()) if (auto chunk_info = chunk.getChunkInfo())
{ {
block.info.bucket_num = agg_info->bucket_num; if (const auto * agg_info = typeid_cast<const AggregatedChunkInfo *>(chunk_info.get()))
block.info.is_overflows = agg_info->is_overflows; {
block.info.bucket_num = agg_info->bucket_num;
block.info.is_overflows = agg_info->is_overflows;
}
} }
return true; return true;

View File

@ -179,9 +179,7 @@ void ParquetBlockOutputFormat::consume(Chunk chunk)
columns[i]->insertRangeFrom(*concatenated.getColumns()[i], offset, count); columns[i]->insertRangeFrom(*concatenated.getColumns()[i], offset, count);
Chunks piece; Chunks piece;
piece.emplace_back(std::move(columns), count); piece.emplace_back(std::move(columns), count, concatenated.getChunkInfo());
piece.back().setChunkInfos(concatenated.getChunkInfos());
writeRowGroup(std::move(piece)); writeRowGroup(std::move(piece));
} }
} }

View File

@ -8,9 +8,8 @@ namespace ErrorCodes
} }
IAccumulatingTransform::IAccumulatingTransform(Block input_header, Block output_header) IAccumulatingTransform::IAccumulatingTransform(Block input_header, Block output_header)
: IProcessor({std::move(input_header)}, {std::move(output_header)}) : IProcessor({std::move(input_header)}, {std::move(output_header)}),
, input(inputs.front()) input(inputs.front()), output(outputs.front())
, output(outputs.front())
{ {
} }

View File

@ -53,11 +53,13 @@ void FinishAggregatingInOrderAlgorithm::consume(Input & input, size_t source_num
if (!input.chunk.hasRows()) if (!input.chunk.hasRows())
return; return;
if (input.chunk.getChunkInfos().empty()) const auto & info = input.chunk.getChunkInfo();
if (!info)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Chunk info was not set for chunk in FinishAggregatingInOrderAlgorithm"); throw Exception(ErrorCodes::LOGICAL_ERROR, "Chunk info was not set for chunk in FinishAggregatingInOrderAlgorithm");
Int64 allocated_bytes = 0; Int64 allocated_bytes = 0;
if (auto arenas_info = input.chunk.getChunkInfos().get<ChunkInfoWithAllocatedBytes>()) /// Will be set by AggregatingInOrderTransform during local aggregation; will be nullptr during merging on initiator.
if (const auto * arenas_info = typeid_cast<const ChunkInfoWithAllocatedBytes *>(info.get()))
allocated_bytes = arenas_info->allocated_bytes; allocated_bytes = arenas_info->allocated_bytes;
states[source_num] = State{input.chunk, description, allocated_bytes}; states[source_num] = State{input.chunk, description, allocated_bytes};
@ -134,7 +136,7 @@ Chunk FinishAggregatingInOrderAlgorithm::prepareToMerge()
info->chunk_num = chunk_num++; info->chunk_num = chunk_num++;
Chunk chunk; Chunk chunk;
chunk.getChunkInfos().add(std::move(info)); chunk.setChunkInfo(std::move(info));
return chunk; return chunk;
} }
@ -161,7 +163,7 @@ void FinishAggregatingInOrderAlgorithm::addToAggregation()
chunks.emplace_back(std::move(new_columns), current_rows); chunks.emplace_back(std::move(new_columns), current_rows);
} }
chunks.back().getChunkInfos().add(std::make_shared<AggregatedChunkInfo>()); chunks.back().setChunkInfo(std::make_shared<AggregatedChunkInfo>());
states[i].current_row = states[i].to_row; states[i].current_row = states[i].to_row;
/// We assume that sizes in bytes of rows are almost the same. /// We assume that sizes in bytes of rows are almost the same.

View File

@ -6,22 +6,18 @@ namespace DB
{ {
/// To carry part level if chunk is produced by a merge tree source /// To carry part level if chunk is produced by a merge tree source
class MergeTreePartLevelInfo : public ChunkInfoCloneable<MergeTreePartLevelInfo> class MergeTreePartLevelInfo : public ChunkInfo
{ {
public: public:
MergeTreePartLevelInfo() = delete; MergeTreePartLevelInfo() = delete;
explicit MergeTreePartLevelInfo(ssize_t part_level) explicit MergeTreePartLevelInfo(ssize_t part_level) : origin_merge_tree_part_level(part_level) { }
: origin_merge_tree_part_level(part_level)
{ }
MergeTreePartLevelInfo(const MergeTreePartLevelInfo & other) = default;
size_t origin_merge_tree_part_level = 0; size_t origin_merge_tree_part_level = 0;
}; };
inline size_t getPartLevelFromChunk(const Chunk & chunk) inline size_t getPartLevelFromChunk(const Chunk & chunk)
{ {
const auto part_level_info = chunk.getChunkInfos().get<MergeTreePartLevelInfo>(); const auto & info = chunk.getChunkInfo();
if (part_level_info) if (const auto * part_level_info = typeid_cast<const MergeTreePartLevelInfo *>(info.get()))
return part_level_info->origin_merge_tree_part_level; return part_level_info->origin_merge_tree_part_level;
return 0; return 0;
} }
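Both variants of getPartLevelFromChunk above reduce to "ask a chunk for one typed piece of side info and fall back to a default". The sketch below shows the pointer-cast flavour in isolation, with dynamic_cast standing in for ClickHouse's typeid_cast and simplified stand-in types throughout.

#include <cstddef>
#include <memory>
#include <iostream>

struct ChunkInfo { virtual ~ChunkInfo() = default; };

struct PartLevelInfo : ChunkInfo { size_t origin_merge_tree_part_level = 0; };

struct ToyChunk { std::shared_ptr<const ChunkInfo> info; };

// Return the part level carried by the chunk info, or 0 when the info is absent
// or of a different type.
size_t getPartLevelFromChunk(const ToyChunk & chunk)
{
    if (const auto * part_level = dynamic_cast<const PartLevelInfo *>(chunk.info.get()))
        return part_level->origin_merge_tree_part_level;
    return 0;
}

int main()
{
    auto info = std::make_shared<PartLevelInfo>();
    info->origin_merge_tree_part_level = 2;
    std::cout << getPartLevelFromChunk({info}) << '\n';    // 2
    std::cout << getPartLevelFromChunk({nullptr}) << '\n'; // 0
}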

View File

@ -17,7 +17,7 @@ namespace ErrorCodes
static IMergingAlgorithm::Status emitChunk(detail::SharedChunkPtr & chunk, bool finished = false) static IMergingAlgorithm::Status emitChunk(detail::SharedChunkPtr & chunk, bool finished = false)
{ {
chunk->getChunkInfos().add(std::make_shared<ChunkSelectFinalIndices>(std::move(chunk->replace_final_selection))); chunk->setChunkInfo(std::make_shared<ChunkSelectFinalIndices>(std::move(chunk->replace_final_selection)));
return IMergingAlgorithm::Status(std::move(*chunk), finished); return IMergingAlgorithm::Status(std::move(*chunk), finished);
} }

View File

@ -3,7 +3,6 @@
#include <Processors/Merges/Algorithms/MergedData.h> #include <Processors/Merges/Algorithms/MergedData.h>
#include <Processors/Transforms/ColumnGathererTransform.h> #include <Processors/Transforms/ColumnGathererTransform.h>
#include <Processors/Merges/Algorithms/RowRef.h> #include <Processors/Merges/Algorithms/RowRef.h>
#include <Processors/Chunk.h>
namespace Poco namespace Poco
{ {
@ -15,13 +14,11 @@ namespace DB
/** Use in skipping final to keep list of indices of selected row after merging final /** Use in skipping final to keep list of indices of selected row after merging final
*/ */
struct ChunkSelectFinalIndices : public ChunkInfoCloneable<ChunkSelectFinalIndices> struct ChunkSelectFinalIndices : public ChunkInfo
{ {
explicit ChunkSelectFinalIndices(MutableColumnPtr select_final_indices_);
ChunkSelectFinalIndices(const ChunkSelectFinalIndices & other) = default;
const ColumnPtr column_holder; const ColumnPtr column_holder;
const ColumnUInt64 * select_final_indices = nullptr; const ColumnUInt64 * select_final_indices = nullptr;
explicit ChunkSelectFinalIndices(MutableColumnPtr select_final_indices_);
}; };
/** Merges several sorted inputs into one. /** Merges several sorted inputs into one.

View File

@ -157,7 +157,7 @@ IProcessor::Status IMergingTransformBase::prepare()
bool is_port_full = !output.canPush(); bool is_port_full = !output.canPush();
/// Push if has data. /// Push if has data.
if ((state.output_chunk || !state.output_chunk.getChunkInfos().empty()) && !is_port_full) if ((state.output_chunk || state.output_chunk.hasChunkInfo()) && !is_port_full)
output.push(std::move(state.output_chunk)); output.push(std::move(state.output_chunk));
if (!is_initialized) if (!is_initialized)

View File

@ -129,7 +129,7 @@ public:
IMergingAlgorithm::Status status = algorithm.merge(); IMergingAlgorithm::Status status = algorithm.merge();
if ((status.chunk && status.chunk.hasRows()) || !status.chunk.getChunkInfos().empty()) if ((status.chunk && status.chunk.hasRows()) || status.chunk.hasChunkInfo())
{ {
// std::cerr << "Got chunk with " << status.chunk.getNumRows() << " rows" << std::endl; // std::cerr << "Got chunk with " << status.chunk.getNumRows() << " rows" << std::endl;
state.output_chunk = std::move(status.chunk); state.output_chunk = std::move(status.chunk);

View File

@ -20,7 +20,7 @@ public:
} }
String getName() const override { return "RemoteSink"; } String getName() const override { return "RemoteSink"; }
void consume (Chunk & chunk) override { write(RemoteInserter::getHeader().cloneWithColumns(chunk.getColumns())); } void consume (Chunk chunk) override { write(RemoteInserter::getHeader().cloneWithColumns(chunk.detachColumns())); }
void onFinish() override { RemoteInserter::onFinish(); } void onFinish() override { RemoteInserter::onFinish(); }
}; };

View File

@ -15,8 +15,9 @@ void SinkToStorage::onConsume(Chunk chunk)
*/ */
Nested::validateArraySizes(getHeader().cloneWithColumns(chunk.getColumns())); Nested::validateArraySizes(getHeader().cloneWithColumns(chunk.getColumns()));
consume(chunk); consume(chunk.clone());
cur_chunk = std::move(chunk); if (!lastBlockIsDuplicate())
cur_chunk = std::move(chunk);
} }
SinkToStorage::GenerateResult SinkToStorage::onGenerate() SinkToStorage::GenerateResult SinkToStorage::onGenerate()

View File

@ -18,7 +18,8 @@ public:
void addTableLock(const TableLockHolder & lock) { table_locks.push_back(lock); } void addTableLock(const TableLockHolder & lock) { table_locks.push_back(lock); }
protected: protected:
virtual void consume(Chunk & chunk) = 0; virtual void consume(Chunk chunk) = 0;
virtual bool lastBlockIsDuplicate() const { return false; }
private: private:
std::vector<TableLockHolder> table_locks; std::vector<TableLockHolder> table_locks;
@ -37,7 +38,7 @@ class NullSinkToStorage : public SinkToStorage
public: public:
using SinkToStorage::SinkToStorage; using SinkToStorage::SinkToStorage;
std::string getName() const override { return "NullSinkToStorage"; } std::string getName() const override { return "NullSinkToStorage"; }
void consume(Chunk &) override {} void consume(Chunk) override {}
}; };
using SinkPtr = std::shared_ptr<SinkToStorage>; using SinkPtr = std::shared_ptr<SinkToStorage>;

View File

@ -43,10 +43,7 @@ protected:
info->bucket_num = res.info.bucket_num; info->bucket_num = res.info.bucket_num;
info->is_overflows = res.info.is_overflows; info->is_overflows = res.info.is_overflows;
auto chunk = Chunk(res.getColumns(), res.rows()); return Chunk(res.getColumns(), res.rows(), std::move(info));
chunk.getChunkInfos().add(std::move(info));
return chunk;
} }
private: private:

View File

@ -176,7 +176,7 @@ std::optional<Chunk> RemoteSource::tryGenerate()
auto info = std::make_shared<AggregatedChunkInfo>(); auto info = std::make_shared<AggregatedChunkInfo>();
info->bucket_num = block.info.bucket_num; info->bucket_num = block.info.bucket_num;
info->is_overflows = block.info.is_overflows; info->is_overflows = block.info.is_overflows;
chunk.getChunkInfos().add(std::move(info)); chunk.setChunkInfo(std::move(info));
} }
return chunk; return chunk;

View File

@ -5,9 +5,7 @@
namespace DB namespace DB
{ {
SourceFromSingleChunk::SourceFromSingleChunk(Block header, Chunk chunk_) : ISource(std::move(header)), chunk(std::move(chunk_)) SourceFromSingleChunk::SourceFromSingleChunk(Block header, Chunk chunk_) : ISource(std::move(header)), chunk(std::move(chunk_)) {}
{
}
SourceFromSingleChunk::SourceFromSingleChunk(Block data) : ISource(data.cloneEmpty()), chunk(data.getColumns(), data.rows()) SourceFromSingleChunk::SourceFromSingleChunk(Block data) : ISource(data.cloneEmpty()), chunk(data.getColumns(), data.rows())
{ {
@ -22,7 +20,7 @@ SourceFromSingleChunk::SourceFromSingleChunk(Block data) : ISource(data.cloneEmp
auto info = std::make_shared<AggregatedChunkInfo>(); auto info = std::make_shared<AggregatedChunkInfo>();
info->bucket_num = data.info.bucket_num; info->bucket_num = data.info.bucket_num;
info->is_overflows = data.info.is_overflows; info->is_overflows = data.info.is_overflows;
chunk.getChunkInfos().add(std::move(info)); chunk.setChunkInfo(std::move(info));
} }
} }

View File

@ -332,7 +332,7 @@ void AggregatingInOrderTransform::generate()
variants.aggregates_pool = variants.aggregates_pools.at(0).get(); variants.aggregates_pool = variants.aggregates_pools.at(0).get();
/// Pass info about used memory by aggregate functions further. /// Pass info about used memory by aggregate functions further.
to_push_chunk.getChunkInfos().add(std::make_shared<ChunkInfoWithAllocatedBytes>(cur_block_bytes)); to_push_chunk.setChunkInfo(std::make_shared<ChunkInfoWithAllocatedBytes>(cur_block_bytes));
cur_block_bytes = 0; cur_block_bytes = 0;
cur_block_size = 0; cur_block_size = 0;
@ -351,12 +351,11 @@ FinalizeAggregatedTransform::FinalizeAggregatedTransform(Block header, Aggregati
void FinalizeAggregatedTransform::transform(Chunk & chunk) void FinalizeAggregatedTransform::transform(Chunk & chunk)
{ {
if (params->final) if (params->final)
{
finalizeChunk(chunk, aggregates_mask); finalizeChunk(chunk, aggregates_mask);
} else if (!chunk.getChunkInfo())
else if (!chunk.getChunkInfos().get<AggregatedChunkInfo>())
{ {
chunk.getChunkInfos().add(std::make_shared<AggregatedChunkInfo>()); auto info = std::make_shared<AggregatedChunkInfo>();
chunk.setChunkInfo(std::move(info));
} }
} }

View File

@ -5,7 +5,6 @@
#include <Processors/ISimpleTransform.h> #include <Processors/ISimpleTransform.h>
#include <Processors/Transforms/AggregatingTransform.h> #include <Processors/Transforms/AggregatingTransform.h>
#include <Processors/Transforms/finalizeChunk.h> #include <Processors/Transforms/finalizeChunk.h>
#include <Processors/Chunk.h>
namespace DB namespace DB
{ {
@ -13,12 +12,10 @@ namespace DB
struct InputOrderInfo; struct InputOrderInfo;
using InputOrderInfoPtr = std::shared_ptr<const InputOrderInfo>; using InputOrderInfoPtr = std::shared_ptr<const InputOrderInfo>;
struct ChunkInfoWithAllocatedBytes : public ChunkInfoCloneable<ChunkInfoWithAllocatedBytes> struct ChunkInfoWithAllocatedBytes : public ChunkInfo
{ {
ChunkInfoWithAllocatedBytes(const ChunkInfoWithAllocatedBytes & other) = default;
explicit ChunkInfoWithAllocatedBytes(Int64 allocated_bytes_) explicit ChunkInfoWithAllocatedBytes(Int64 allocated_bytes_)
: allocated_bytes(allocated_bytes_) {} : allocated_bytes(allocated_bytes_) {}
Int64 allocated_bytes; Int64 allocated_bytes;
}; };

View File

@ -35,7 +35,7 @@ Chunk convertToChunk(const Block & block)
UInt64 num_rows = block.rows(); UInt64 num_rows = block.rows();
Chunk chunk(block.getColumns(), num_rows); Chunk chunk(block.getColumns(), num_rows);
chunk.getChunkInfos().add(std::move(info)); chunk.setChunkInfo(std::move(info));
return chunk; return chunk;
} }
@ -44,11 +44,15 @@ namespace
{ {
const AggregatedChunkInfo * getInfoFromChunk(const Chunk & chunk) const AggregatedChunkInfo * getInfoFromChunk(const Chunk & chunk)
{ {
auto agg_info = chunk.getChunkInfos().get<AggregatedChunkInfo>(); const auto & info = chunk.getChunkInfo();
if (!info)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Chunk info was not set for chunk.");
const auto * agg_info = typeid_cast<const AggregatedChunkInfo *>(info.get());
if (!agg_info) if (!agg_info)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Chunk should have AggregatedChunkInfo."); throw Exception(ErrorCodes::LOGICAL_ERROR, "Chunk should have AggregatedChunkInfo.");
return agg_info.get(); return agg_info;
} }
/// Reads chunks from file in native format. Provide chunks with aggregation info. /// Reads chunks from file in native format. Provide chunks with aggregation info.
@ -206,7 +210,11 @@ private:
void process(Chunk && chunk) void process(Chunk && chunk)
{ {
auto chunks_to_merge = chunk.getChunkInfos().get<ChunksToMerge>(); if (!chunk.hasChunkInfo())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected chunk with chunk info in {}", getName());
const auto & info = chunk.getChunkInfo();
const auto * chunks_to_merge = typeid_cast<const ChunksToMerge *>(info.get());
if (!chunks_to_merge) if (!chunks_to_merge)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected chunk with ChunksToMerge info in {}", getName()); throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected chunk with ChunksToMerge info in {}", getName());
@ -775,7 +783,7 @@ void AggregatingTransform::initGenerate()
{ {
/// Just a reasonable constant, matches default value for the setting `preferred_block_size_bytes` /// Just a reasonable constant, matches default value for the setting `preferred_block_size_bytes`
static constexpr size_t oneMB = 1024 * 1024; static constexpr size_t oneMB = 1024 * 1024;
return std::make_shared<SimpleSquashingTransform>(header, params->params.max_block_size, oneMB); return std::make_shared<SimpleSquashingChunksTransform>(header, params->params.max_block_size, oneMB);
}); });
} }
/// AggregatingTransform::expandPipeline expects single output port. /// AggregatingTransform::expandPipeline expects single output port.

View File

@ -2,7 +2,6 @@
#include <Compression/CompressedReadBuffer.h> #include <Compression/CompressedReadBuffer.h>
#include <IO/ReadBufferFromFile.h> #include <IO/ReadBufferFromFile.h>
#include <Interpreters/Aggregator.h> #include <Interpreters/Aggregator.h>
#include <Processors/Chunk.h>
#include <Processors/IAccumulatingTransform.h> #include <Processors/IAccumulatingTransform.h>
#include <Common/Stopwatch.h> #include <Common/Stopwatch.h>
#include <Common/setThreadName.h> #include <Common/setThreadName.h>
@ -20,7 +19,7 @@ namespace CurrentMetrics
namespace DB namespace DB
{ {
class AggregatedChunkInfo : public ChunkInfoCloneable<AggregatedChunkInfo> class AggregatedChunkInfo : public ChunkInfo
{ {
public: public:
bool is_overflows = false; bool is_overflows = false;

View File

@ -27,12 +27,18 @@ public:
} }
ExceptionKeepingTransform::work(); ExceptionKeepingTransform::work();
if (finish_chunk)
{
data.chunk = std::move(finish_chunk);
ready_output = true;
}
} }
protected: protected:
void onConsume(Chunk chunk) override void onConsume(Chunk chunk) override
{ {
cur_chunk = Squashing::squash(std::move(chunk)); if (auto res_chunk = DB::Squashing::squash(std::move(chunk)))
cur_chunk.setColumns(res_chunk.getColumns(), res_chunk.getNumRows());
} }
GenerateResult onGenerate() override GenerateResult onGenerate() override
@ -42,10 +48,16 @@ protected:
res.is_done = true; res.is_done = true;
return res; return res;
} }
void onFinish() override
{
auto chunk = DB::Squashing::squash({});
finish_chunk.setColumns(chunk.getColumns(), chunk.getNumRows());
}
private: private:
Squashing squashing; Squashing squashing;
Chunk cur_chunk; Chunk cur_chunk;
Chunk finish_chunk;
}; };
} }

View File

@ -1,7 +1,6 @@
#include <Processors/Transforms/CountingTransform.h>
#include <IO/Progress.h>
#include <Interpreters/ProcessList.h> #include <Interpreters/ProcessList.h>
#include <Processors/Transforms/CountingTransform.h>
#include <Common/ProfileEvents.h> #include <Common/ProfileEvents.h>
#include <Common/ThreadStatus.h> #include <Common/ThreadStatus.h>

Some files were not shown because too many files have changed in this diff.