mirror of https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-22 15:42:02 +00:00

commit dd902b6875: Merge remote-tracking branch 'upstream/master' into table-override-pg
.github/workflows/anchore-analysis.yml (vendored, 4 changes)

@@ -8,6 +8,10 @@
 name: Docker Container Scan (clickhouse-server)

+env:
+  # Force the stdout and stderr streams to be unbuffered
+  PYTHONUNBUFFERED: 1
+
 "on":
   pull_request:
     paths:
.github/workflows/backport.yml (vendored, 6 changes)

@@ -1,4 +1,9 @@
 name: CherryPick
+
+env:
+  # Force the stdout and stderr streams to be unbuffered
+  PYTHONUNBUFFERED: 1
+
 concurrency:
   group: cherry-pick
 on: # yamllint disable-line rule:truthy
@@ -9,6 +14,7 @@ jobs:
     runs-on: [self-hosted, style-checker]
     steps:
       - name: Set envs
+        # https://docs.github.com/en/actions/learn-github-actions/workflow-commands-for-github-actions#multiline-strings
        run: |
          cat >> "$GITHUB_ENV" << 'EOF'
          TEMP_PATH=${{runner.temp}}/cherry_pick
.github/workflows/backport_branches.yml (vendored, 5 changes)

@@ -1,4 +1,9 @@
 name: BackportPR
+
+env:
+  # Force the stdout and stderr streams to be unbuffered
+  PYTHONUNBUFFERED: 1
+
 on: # yamllint disable-line rule:truthy
   push:
     branches:
.github/workflows/cancel.yml (vendored, 5 changes)

@@ -1,4 +1,9 @@
 name: Cancel
+
+env:
+  # Force the stdout and stderr streams to be unbuffered
+  PYTHONUNBUFFERED: 1
+
 on: # yamllint disable-line rule:truthy
   workflow_run:
     workflows: ["CIGithubActions", "ReleaseCI", "DocsCheck", "BackportPR"]
.github/workflows/docs_check.yml (vendored, 5 changes)

@@ -1,4 +1,9 @@
 name: DocsCheck
+
+env:
+  # Force the stdout and stderr streams to be unbuffered
+  PYTHONUNBUFFERED: 1
+
 on: # yamllint disable-line rule:truthy
   pull_request:
     types:
.github/workflows/main.yml (vendored, 47 changes)

@@ -1,4 +1,9 @@
 name: CIGithubActions
+
+env:
+  # Force the stdout and stderr streams to be unbuffered
+  PYTHONUNBUFFERED: 1
+
 on: # yamllint disable-line rule:truthy
   pull_request:
     types:
@@ -329,6 +334,47 @@ jobs:
           docker kill $(docker ps -q) ||:
           docker rm -f $(docker ps -a -q) ||:
           sudo rm -fr $TEMP_PATH $CACHES_PATH
+  BuilderDebAarch64:
+    needs: [DockerHubPush, FastTest]
+    runs-on: [self-hosted, builder]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/build_check
+          IMAGES_PATH=${{runner.temp}}/images_path
+          REPO_COPY=${{runner.temp}}/build_check/ClickHouse
+          CACHES_PATH=${{runner.temp}}/../ccaches
+          CHECK_NAME=ClickHouse build check (actions)
+          BUILD_NAME=package_aarch64
+          EOF
+      - name: Download changed images
+        uses: actions/download-artifact@v2
+        with:
+          name: changed_images
+          path: ${{ runner.temp }}/images_path
+      - name: Check out repository code
+        uses: actions/checkout@v2
+        with:
+          submodules: 'true'
+          fetch-depth: 0 # otherwise we will have no info about contributors
+      - name: Build
+        run: |
+          sudo rm -fr $TEMP_PATH
+          mkdir -p $TEMP_PATH
+          cp -r $GITHUB_WORKSPACE $TEMP_PATH
+          cd $REPO_COPY/tests/ci && python3 build_check.py "$CHECK_NAME" $BUILD_NAME
+      - name: Upload build URLs to artifacts
+        uses: actions/upload-artifact@v2
+        with:
+          name: ${{ env.BUILD_NAME }}
+          path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json
+      - name: Cleanup
+        if: always()
+        run: |
+          docker kill $(docker ps -q) ||:
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr $TEMP_PATH $CACHES_PATH
   BuilderDebAsan:
     needs: [DockerHubPush, FastTest]
     runs-on: [self-hosted, builder]
@@ -867,6 +913,7 @@ jobs:
     needs:
       - BuilderDebRelease
      - BuilderBinRelease
+      - BuilderDebAarch64
      - BuilderDebAsan
      - BuilderDebTsan
      - BuilderDebUBsan
.github/workflows/master.yml (vendored, 5 changes)

@@ -1,4 +1,9 @@
 name: MasterCI
+
+env:
+  # Force the stdout and stderr streams to be unbuffered
+  PYTHONUNBUFFERED: 1
+
 on: # yamllint disable-line rule:truthy
   push:
     branches:
.github/workflows/release.yml (vendored, 21 changes)

@@ -1,4 +1,9 @@
 name: DocsReleaseChecks
+
+env:
+  # Force the stdout and stderr streams to be unbuffered
+  PYTHONUNBUFFERED: 1
+
 concurrency:
   group: master-release
   cancel-in-progress: true
@@ -35,6 +40,17 @@ jobs:
     needs: DockerHubPush
     runs-on: [self-hosted, func-tester]
     steps:
+      - name: Set envs
+        # https://docs.github.com/en/actions/learn-github-actions/workflow-commands-for-github-actions#multiline-strings
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/docs_release
+          REPO_COPY=${{runner.temp}}/docs_release/ClickHouse
+          CLOUDFLARE_TOKEN=${{secrets.CLOUDFLARE}}
+          ROBOT_CLICKHOUSE_SSH_KEY<<RCSK
+          ${{secrets.ROBOT_CLICKHOUSE_SSH_KEY}}
+          RCSK
+          EOF
       - name: Clear repository
         run: |
           sudo rm -fr $GITHUB_WORKSPACE && mkdir $GITHUB_WORKSPACE
@@ -46,11 +62,6 @@ jobs:
           name: changed_images
           path: ${{ env.TEMP_PATH }}
       - name: Docs Release
-        env:
-          TEMP_PATH: ${{runner.temp}}/docs_release
-          REPO_COPY: ${{runner.temp}}/docs_release/ClickHouse
-          CLOUDFLARE_TOKEN: ${{secrets.CLOUDFLARE}}
-          ROBOT_CLICKHOUSE_SSH_KEY: ${{secrets.ROBOT_CLICKHOUSE_SSH_KEY}}
        run: |
          sudo rm -fr $TEMP_PATH
          mkdir -p $TEMP_PATH
.github/workflows/release_branches.yml (vendored, 5 changes)

@@ -1,4 +1,9 @@
 name: ReleaseCI
+
+env:
+  # Force the stdout and stderr streams to be unbuffered
+  PYTHONUNBUFFERED: 1
+
 on: # yamllint disable-line rule:truthy
   push:
     branches:
@@ -82,7 +82,9 @@ PoolWithFailover::PoolWithFailover(
     unsigned default_connections_,
     unsigned max_connections_,
     size_t max_tries_,
-    uint64_t wait_timeout_)
+    uint64_t wait_timeout_,
+    size_t connect_timeout_,
+    size_t rw_timeout_)
     : max_tries(max_tries_)
     , shareable(false)
     , wait_timeout(wait_timeout_)
@@ -93,8 +95,8 @@ PoolWithFailover::PoolWithFailover(
         replicas_by_priority[0].emplace_back(std::make_shared<Pool>(database,
             host, user, password, port,
             /* socket_ = */ "",
-            MYSQLXX_DEFAULT_TIMEOUT,
-            MYSQLXX_DEFAULT_RW_TIMEOUT,
+            connect_timeout_,
+            rw_timeout_,
             default_connections_,
             max_connections_));
 }
@@ -6,6 +6,7 @@
 #define MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_START_CONNECTIONS 1
 #define MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_CONNECTIONS 16
 #define MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES 3
+#define MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_CONNECTION_WAIT_TIMEOUT 5 /// in seconds


 namespace mysqlxx
@@ -121,7 +122,9 @@ namespace mysqlxx
             unsigned default_connections_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_START_CONNECTIONS,
             unsigned max_connections_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_CONNECTIONS,
             size_t max_tries_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES,
-            uint64_t wait_timeout_ = UINT64_MAX);
+            uint64_t wait_timeout_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_CONNECTION_WAIT_TIMEOUT,
+            size_t connect_timeout = MYSQLXX_DEFAULT_TIMEOUT,
+            size_t rw_timeout = MYSQLXX_DEFAULT_RW_TIMEOUT);

         PoolWithFailover(const PoolWithFailover & other);
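In effect, callers that previously waited indefinitely for a free connection now wait 5 seconds by default, and per-connection timeouts are forwarded to each underlying pool. A small sketch of just the default change (illustrative constants only, not part of the diff):

#include <cstdint>

// Illustrative only: old and new defaults for wait_timeout_ (in seconds).
constexpr uint64_t old_default_wait_timeout = UINT64_MAX; // effectively: block forever
constexpr uint64_t new_default_wait_timeout = 5;          // MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_CONNECTION_WAIT_TIMEOUT
static_assert(new_default_wait_timeout < old_default_wait_timeout, "wait is now bounded");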
debian/rules (vendored, 4 changes)

@@ -45,6 +45,10 @@ ifdef DEB_CXX
 ifeq ($(DEB_BUILD_GNU_TYPE),$(DEB_HOST_GNU_TYPE))
     CC := $(DEB_CC)
     CXX := $(DEB_CXX)
+else ifeq (clang,$(findstring clang,$(DEB_CXX)))
+    # If we crosscompile with clang, it knows what to do
+    CC := $(DEB_CC)
+    CXX := $(DEB_CXX)
 else
     CC := $(DEB_HOST_GNU_TYPE)-$(DEB_CC)
     CXX := $(DEB_HOST_GNU_TYPE)-$(DEB_CXX)
@@ -24,40 +24,34 @@ RUN apt-get update \
     && apt-key add /tmp/llvm-snapshot.gpg.key \
     && export CODENAME="$(lsb_release --codename --short | tr 'A-Z' 'a-z')" \
     && echo "deb [trusted=yes] https://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \
-        /etc/apt/sources.list
+        /etc/apt/sources.list \
+    && apt-get clean

 # initial packages
 RUN apt-get update \
     && apt-get install \
         bash \
         fakeroot \
         ccache \
         curl \
         software-properties-common \
         --yes --no-install-recommends

 RUN apt-get update \
     && apt-get install \
         bash \
         build-essential \
         ccache \
+        clang-${LLVM_VERSION} \
+        clang-tidy-${LLVM_VERSION} \
         cmake \
         curl \
         fakeroot \
         gdb \
         git \
         gperf \
-        clang-${LLVM_VERSION} \
-        clang-tidy-${LLVM_VERSION} \
         lld-${LLVM_VERSION} \
         llvm-${LLVM_VERSION} \
         llvm-${LLVM_VERSION}-dev \
         libicu-dev \
         moreutils \
         ninja-build \
         pigz \
         rename \
         software-properties-common \
         tzdata \
-        --yes --no-install-recommends
+        --yes --no-install-recommends \
+    && apt-get clean

 # This symlink required by gcc to find lld compiler
 RUN ln -s /usr/bin/lld-${LLVM_VERSION} /usr/bin/ld.lld
@@ -66,7 +60,7 @@ ENV CC=clang-${LLVM_VERSION}
 ENV CXX=clang++-${LLVM_VERSION}

 # libtapi is required to support .tbh format from recent MacOS SDKs
-RUN git clone https://github.com/tpoechtrager/apple-libtapi.git \
+RUN git clone --depth 1 https://github.com/tpoechtrager/apple-libtapi.git \
     && cd apple-libtapi \
     && INSTALLPREFIX=/cctools ./build.sh \
     && ./install.sh \
@@ -74,7 +68,7 @@ RUN git clone https://github.com/tpoechtrager/apple-libtapi.git \
     && rm -rf apple-libtapi

 # Build and install tools for cross-linking to Darwin (x86-64)
-RUN git clone https://github.com/tpoechtrager/cctools-port.git \
+RUN git clone --depth 1 https://github.com/tpoechtrager/cctools-port.git \
     && cd cctools-port/cctools \
     && ./configure --prefix=/cctools --with-libtapi=/cctools \
         --target=x86_64-apple-darwin \
@@ -83,7 +77,7 @@ RUN git clone https://github.com/tpoechtrager/cctools-port.git \
     && rm -rf cctools-port

 # Build and install tools for cross-linking to Darwin (aarch64)
-RUN git clone https://github.com/tpoechtrager/cctools-port.git \
+RUN git clone --depth 1 https://github.com/tpoechtrager/cctools-port.git \
     && cd cctools-port/cctools \
     && ./configure --prefix=/cctools --with-libtapi=/cctools \
         --target=aarch64-apple-darwin \
@@ -97,7 +91,8 @@ RUN wget -nv https://github.com/phracker/MacOSX-SDKs/releases/download/11.3/MacO
 # NOTE: Seems like gcc-11 is too new for ubuntu20 repository
 RUN add-apt-repository ppa:ubuntu-toolchain-r/test --yes \
     && apt-get update \
-    && apt-get install gcc-11 g++-11 --yes
+    && apt-get install gcc-11 g++-11 --yes \
+    && apt-get clean


 COPY build.sh /
@@ -64,8 +64,14 @@ RUN add-apt-repository ppa:ubuntu-toolchain-r/test --yes \
     && apt-get install gcc-11 g++-11 --yes


-# This symlink required by gcc to find lld compiler
-RUN ln -s /usr/bin/lld-${LLVM_VERSION} /usr/bin/ld.lld
+# These symlinks are required:
+# /usr/bin/ld.lld: by gcc to find lld compiler
+# /usr/bin/aarch64-linux-gnu-obj*: for debug symbols stripping
+RUN ln -s /usr/bin/lld-${LLVM_VERSION} /usr/bin/ld.lld \
+    && ln -sf /usr/lib/llvm-${LLVM_VERSION}/bin/llvm-objcopy /usr/bin/aarch64-linux-gnu-strip \
+    && ln -sf /usr/lib/llvm-${LLVM_VERSION}/bin/llvm-objcopy /usr/bin/aarch64-linux-gnu-objcopy \
+    && ln -sf /usr/lib/llvm-${LLVM_VERSION}/bin/llvm-objdump /usr/bin/aarch64-linux-gnu-objdump


 COPY build.sh /
@@ -29,7 +29,13 @@ def pull_image(image_name):
         return False

 def build_image(image_name, filepath):
-    subprocess.check_call("docker build --network=host -t {} -f {} .".format(image_name, filepath), shell=True)
+    context = os.path.dirname(filepath)
+    subprocess.check_call(
+        "docker build --network=host -t {} -f {} {}".format(
+            image_name, filepath, context
+        ),
+        shell=True,
+    )

 def run_docker_image_with_env(image_name, output, env_variables, ch_root, ccache_dir, docker_image_version):
     env_part = " -e ".join(env_variables)
@@ -90,6 +96,7 @@ def parse_env_variables(build_type, compiler, sanitizer, package_type, image_type
     elif is_cross_arm:
         cc = compiler[:-len(ARM_SUFFIX)]
         cmake_flags.append("-DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-aarch64.cmake")
+        result.append("DEB_ARCH_FLAG=-aarm64")
     elif is_cross_freebsd:
         cc = compiler[:-len(FREEBSD_SUFFIX)]
         cmake_flags.append("-DCMAKE_TOOLCHAIN_FILE=/build/cmake/freebsd/toolchain-x86_64.cmake")
@@ -98,6 +105,7 @@ def parse_env_variables(build_type, compiler, sanitizer, package_type, image_type
         cmake_flags.append("-DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-ppc64le.cmake")
     else:
         cc = compiler
+        result.append("DEB_ARCH_FLAG=-aamd64")

     cxx = cc.replace('gcc', 'g++').replace('clang', 'clang++')
@@ -77,6 +77,7 @@ RUN python3 -m pip install \
     psycopg2-binary==2.8.6 \
     pymongo==3.11.0 \
     pytest \
+    pytest-order==1.0.0 \
     pytest-timeout \
     pytest-xdist \
     pytest-repeat \
@@ -8,8 +8,8 @@ echo '{
     "ip-forward": true,
     "log-level": "debug",
     "storage-driver": "overlay2",
-    "insecure-registries" : ["dockerhub-proxy.sas.yp-c.yandex.net:5000"],
-    "registry-mirrors" : ["http://dockerhub-proxy.sas.yp-c.yandex.net:5000"]
+    "insecure-registries" : ["dockerhub-proxy.dockerhub-proxy-zone:5000"],
+    "registry-mirrors" : ["http://dockerhub-proxy.dockerhub-proxy-zone:5000"]
 }' | dd of=/etc/docker/daemon.json 2>/dev/null

 dockerd --host=unix:///var/run/docker.sock --host=tcp://0.0.0.0:2375 --default-address-pool base=172.17.0.0/12,size=24 &>/ClickHouse/tests/integration/dockerd.log &
@@ -11,6 +11,20 @@ if [[ $S3_URL == *"s3.amazonaws.com"* ]]; then
     COMMON_BUILD_PREFIX=""
 fi

+# Sometimes AWS responds with a DNS error, and it's impossible to retry it
+# with the current curl version's options.
+function curl_with_retry
+{
+    for _ in 1 2 3 4; do
+        if curl --fail --head "$1"; then
+            return 0
+        else
+            sleep 0.5
+        fi
+    done
+    return 1
+}
+
 # Use the packaged repository to find the revision we will compare to.
 function find_reference_sha
 {
@@ -55,7 +69,7 @@ function find_reference_sha
     )
     for path in "${urls_to_try[@]}"
     do
-        if curl --fail --retry 5 --retry-delay 1 --retry-max-time 15 --head "$path"
+        if curl_with_retry "$path"
         then
             found="$path"
             break
@@ -76,7 +90,7 @@ chmod 777 workspace output
 cd workspace

 # Download the package for the version we are going to test.
-if curl --fail --retry 5 --retry-delay 1 --retry-max-time 15 --head "$S3_URL/$PR_TO_TEST/$SHA_TO_TEST$COMMON_BUILD_PREFIX/performance/performance.tgz"
+if curl_with_retry "$S3_URL/$PR_TO_TEST/$SHA_TO_TEST$COMMON_BUILD_PREFIX/performance/performance.tgz"
 then
     right_path="$S3_URL/$PR_TO_TEST/$SHA_TO_TEST$COMMON_BUILD_PREFIX/performance/performance.tgz"
 fi
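The same retry-with-pause idea, sketched in C++ for clarity (a hypothetical helper, not part of these scripts):

#include <chrono>
#include <functional>
#include <thread>

// Retry a flaky operation a fixed number of times with a short pause between
// attempts, mirroring curl_with_retry above (4 attempts, 0.5 s apart).
bool retry_with_pause(const std::function<bool()> & op, int attempts = 4)
{
    for (int i = 0; i < attempts; ++i)
    {
        if (op())
            return true;
        std::this_thread::sleep_for(std::chrono::milliseconds(500));
    }
    return false;
}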
@@ -5,8 +5,8 @@ echo "Configure to use Yandex dockerhub-proxy"
 mkdir -p /etc/docker/
 cat > /etc/docker/daemon.json << EOF
 {
-    "insecure-registries" : ["dockerhub-proxy.sas.yp-c.yandex.net:5000"],
-    "registry-mirrors" : ["http://dockerhub-proxy.sas.yp-c.yandex.net:5000"]
+    "insecure-registries" : ["dockerhub-proxy.dockerhub-proxy-zone:5000"],
+    "registry-mirrors" : ["http://dockerhub-proxy.dockerhub-proxy-zone:5000"]
 }
 EOF
@@ -16,6 +16,11 @@ ZooKeeper is one of the first well-known open-source coordination systems. It's

 By default, ClickHouse Keeper provides the same guarantees as ZooKeeper (linearizable writes, non-linearizable reads). It has a compatible client-server protocol, so any standard ZooKeeper client can be used to interact with ClickHouse Keeper. Snapshots and logs have a format that is incompatible with ZooKeeper, but the `clickhouse-keeper-converter` tool allows converting ZooKeeper data to a ClickHouse Keeper snapshot. The interserver protocol in ClickHouse Keeper is also incompatible with ZooKeeper, so a mixed ZooKeeper / ClickHouse Keeper cluster is impossible.

+ClickHouse Keeper supports Access Control Lists (ACL) the same way as [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) does. It supports the same set of permissions and has the identical built-in schemes: `world`, `auth`, `digest`, `host` and `ip`. The digest authentication scheme uses the pair `username:password`; the password is encoded in Base64.
+
+!!! info "Note"
+    External integrations are not supported.
+
 ## Configuration

 ClickHouse Keeper can be used as a standalone replacement for ZooKeeper or as an internal part of the ClickHouse server, but in both cases the configuration is almost the same `.xml` file. The main ClickHouse Keeper configuration tag is `<keeper_server>`. Keeper configuration has the following parameters:
@@ -118,13 +123,13 @@ echo mntr | nc localhost 9181

 Below is a detailed description of the 4lw commands:

-- ruok : Tests if server is running in a non-error state. The server will respond with imok if it is running. Otherwise it will not respond at all. A response of "imok" does not necessarily indicate that the server has joined the quorum, just that the server process is active and bound to the specified client port. Use "stat" for details on state wrt quorum and client connection information.
+- `ruok`: Tests if the server is running in a non-error state. The server will respond with "imok" if it is running. Otherwise it will not respond at all. A response of "imok" does not necessarily indicate that the server has joined the quorum, just that the server process is active and bound to the specified client port. Use "stat" for details on state wrt quorum and client connection information.

```
imok
```

-- mntr : Outputs a list of variables that could be used for monitoring the health of the cluster.
+- `mntr`: Outputs a list of variables that could be used for monitoring the health of the cluster.

```
zk_version v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7
@@ -146,12 +151,11 @@ zk_followers 0
zk_synced_followers 0
```

-- srvr : Lists full details for the server.
+- `srvr`: Lists full details for the server.

```
ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7
Latency min/avg/max: 0/0/0
Received: 2
Sent : 2
Connections: 1
@@ -161,16 +165,14 @@ Mode: leader
Node count: 4
```

-- stat : Lists brief details for the server and connected clients.
+- `stat`: Lists brief details for the server and connected clients.

```
ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7
Clients:
 192.168.1.1:52852(recved=0,sent=0)
 192.168.1.1:52042(recved=24,sent=48)
Latency min/avg/max: 0/0/0
Received: 4
Sent : 4
Connections: 1
@@ -178,16 +180,15 @@ Outstanding: 0
Zxid: 36
Mode: leader
Node count: 4
```

-- srst : Reset server statistics. The command will affect the result of `srvr`, `mntr` and `stat`.
+- `srst`: Reset server statistics. The command will affect the result of `srvr`, `mntr` and `stat`.

```
Server stats reset.
```

-- conf : Print details about serving configuration.
+- `conf`: Print details about serving configuration.

```
server_id=1
@@ -220,20 +221,20 @@ compress_snapshots_with_zstd_format=true
configuration_change_tries_count=20
```

-- cons : List full connection/session details for all clients connected to this server. Includes information on numbers of packets received/sent, session id, operation latencies, last operation performed, etc...
+- `cons`: List full connection/session details for all clients connected to this server. Includes information on numbers of packets received/sent, session id, operation latencies, last operation performed, etc.

```
 192.168.1.1:52163(recved=0,sent=0,sid=0xffffffffffffffff,lop=NA,est=1636454787393,to=30000,lzxid=0xffffffffffffffff,lresp=0,llat=0,minlat=0,avglat=0,maxlat=0)
 192.168.1.1:52042(recved=9,sent=18,sid=0x0000000000000001,lop=List,est=1636454739887,to=30000,lcxid=0x0000000000000005,lzxid=0x0000000000000005,lresp=1636454739892,llat=0,minlat=0,avglat=0,maxlat=0)
```

-- crst : Reset connection/session statistics for all connections.
+- `crst`: Reset connection/session statistics for all connections.

```
Connection stats reset.
```

-- envi : Print details about serving environment
+- `envi`: Print details about the serving environment.

```
Environment:
@@ -250,41 +251,41 @@ user.tmp=/var/folders/b4/smbq5mfj7578f2jzwn602tt40000gn/T/
```

-- dirs : Shows the total size of snapshot and log files in bytes
+- `dirs`: Shows the total size of snapshot and log files in bytes.

```
snapshot_dir_size: 0
log_dir_size: 3875
```

-- isro: Tests if server is running in read-only mode. The server will respond with "ro" if in read-only mode or "rw" if not in read-only mode.
+- `isro`: Tests if the server is running in read-only mode. The server will respond with "ro" if in read-only mode or "rw" if not in read-only mode.

```
rw
```

-- wchs : Lists brief information on watches for the server.
+- `wchs`: Lists brief information on watches for the server.

```
1 connections watching 1 paths
Total watches:1
```

-- wchc : Lists detailed information on watches for the server, by session. This outputs a list of sessions(connections) with associated watches (paths). Note, depending on the number of watches this operation may be expensive (ie impact server performance), use it carefully.
+- `wchc`: Lists detailed information on watches for the server, by session. This outputs a list of sessions (connections) with associated watches (paths). Note: depending on the number of watches this operation may be expensive (i.e. impact server performance), so use it carefully.

```
0x0000000000000001
    /clickhouse/task_queue/ddl
```

-- wchp : Lists detailed information on watches for the server, by path. This outputs a list of paths (znodes) with associated sessions. Note, depending on the number of watches this operation may be expensive (ie impact server performance), use it carefully.
+- `wchp`: Lists detailed information on watches for the server, by path. This outputs a list of paths (znodes) with associated sessions. Note: depending on the number of watches this operation may be expensive (i.e. impact server performance), so use it carefully.

```
/clickhouse/task_queue/ddl
    0x0000000000000001
```

-- dump : Lists the outstanding sessions and ephemeral nodes. This only works on the leader.
+- `dump`: Lists the outstanding sessions and ephemeral nodes. This only works on the leader.

```
Sessions dump (2):
```
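Because the 4lw interface is plain text over the ordinary client port, any TCP client can issue these commands. A minimal C++ sketch (host, port, and the chosen command are examples; it assumes only the plain-TCP behavior described above):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

#include <cstdio>

int main()
{
    // Connect to a Keeper server (example address; 9181 is the default tcp_port).
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    sockaddr_in addr{};
    addr.sin_family = AF_INET;
    addr.sin_port = htons(9181);
    inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);
    if (fd < 0 || connect(fd, reinterpret_cast<sockaddr *>(&addr), sizeof(addr)) != 0)
        return 1;

    // Send the four-letter command and print the reply until the server closes.
    write(fd, "mntr", 4);
    char buf[4096];
    ssize_t n;
    while ((n = read(fd, buf, sizeof(buf))) > 0)
        fwrite(buf, 1, static_cast<size_t>(n), stdout);

    close(fd);
    return 0;
}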
The same changes in the Russian documentation (translated here):

@@ -16,12 +16,17 @@ (ZooKeeper is one of the first widely known coordination services...)

 By default, ClickHouse Keeper provides the same guarantees as ZooKeeper (linearizable writes, sequential consistency of reads). It has a compatible client-server protocol, so any standard ZooKeeper client can be used to interact with ClickHouse Keeper. Snapshots and logs have a format incompatible with ZooKeeper, but ZooKeeper data can be converted to a ClickHouse Keeper snapshot with `clickhouse-keeper-converter`. The interserver protocol of ClickHouse Keeper is also incompatible with ZooKeeper, so creating a mixed ZooKeeper / ClickHouse Keeper cluster is impossible.

+The access control system (ACL) of ClickHouse Keeper is implemented the same way as in [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl). ClickHouse Keeper supports the same set of permissions and the identical schemes: `world`, `auth`, `digest`, `host` and `ip`. Digest authentication uses the pair `username:password`; the password is encoded in Base64.
+
+!!! info "Note"
+    External integrations are not supported.
+
 ## Configuration

 ClickHouse Keeper can be used as a full replacement for ZooKeeper or as an internal part of the ClickHouse server, but in both cases the configuration is an `.xml` file. The main ClickHouse Keeper configuration tag is `<keeper_server>`. Configuration parameters:

 - `tcp_port` — port for client connections (ZooKeeper default: `2181`).
-- `tcp_port_secure` — encrypted port for client connections.
+- `tcp_port_secure` — encrypted port for an SSL connection between a client and the service.
 - `server_id` — unique server identifier; each member of the cluster must have a unique number (1, 2, 3, and so on).
 - `log_storage_path` — path to the coordination logs; as with ZooKeeper, the logs are best stored on a device that is otherwise idle.
 - `snapshot_storage_path` — path to the coordination snapshots.
@@ -50,7 +55,11 @@
 - `shutdown_timeout` — time to wait for internal connections to finish and for the server to shut down, in milliseconds (default: 5000).
 - `startup_timeout` — time after which the server shuts down if it has not connected to the other quorum members, in milliseconds (default: 30000).

-The quorum configuration is located in `<keeper_server>.<raft_configuration>` and contains a description of the servers. The only parameter for the whole quorum is `secure`, which enables an encrypted connection between the quorum members. Parameters for each `<server>`:
+The quorum configuration is located in `<keeper_server>.<raft_configuration>` and contains a description of the servers.
+
+The only parameter for the whole quorum is `secure`, which enables an encrypted connection for communication between the quorum members. It can be set to `true` if an SSL connection is required for internal communication between the nodes; otherwise leave it unspecified.
+
+Parameters for each `<server>`:

 - `id` — server identifier in the quorum.
 - `hostname` — name of the host where the server is located.
@@ -152,6 +152,7 @@
          This setting could be used to switch replication to another network interface
          (the server may be connected to multiple networks via multiple addresses)
     -->
+
     <!--
     <interserver_http_host>example.yandex.ru</interserver_http_host>
     -->
@@ -177,6 +178,7 @@
     -->
     <!-- <listen_host>::</listen_host> -->

+
     <!-- Same for hosts without support for IPv6: -->
     <!-- <listen_host>0.0.0.0</listen_host> -->
release (2 changes)

@@ -87,7 +87,7 @@ if [ -z "$NO_BUILD" ] ; then
     # Build (only binary packages).
     debuild --preserve-env -e PATH \
         -e DEB_CC=$DEB_CC -e DEB_CXX=$DEB_CXX -e CMAKE_FLAGS="$CMAKE_FLAGS" \
-        -b ${DEBUILD_NOSIGN_OPTIONS} ${DEBUILD_NODEPS_OPTIONS}
+        -b ${DEBUILD_NOSIGN_OPTIONS} ${DEBUILD_NODEPS_OPTIONS} ${DEB_ARCH_FLAG}
 fi

 if [ -n "$MAKE_RPM" ]; then
@@ -2,6 +2,7 @@

 #include <Columns/ColumnTuple.h>
 #include <Columns/ColumnsNumber.h>
+#include <Columns/ColumnSparse.h>
 #include <Core/Block.h>
 #include <Core/ColumnNumbers.h>
 #include <Core/Field.h>
@@ -181,6 +182,13 @@ public:
         Arena * arena,
         ssize_t if_argument_pos = -1) const = 0;

+    /// The version of "addBatch" that handles sparse columns as arguments.
+    virtual void addBatchSparse(
+        AggregateDataPtr * places,
+        size_t place_offset,
+        const IColumn ** columns,
+        Arena * arena) const = 0;
+
     virtual void mergeBatch(
         size_t batch_size,
         AggregateDataPtr * places,
@@ -193,6 +201,10 @@ public:
     virtual void addBatchSinglePlace(
         size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena * arena, ssize_t if_argument_pos = -1) const = 0;

+    /// The version of "addBatchSinglePlace" that handles sparse columns as arguments.
+    virtual void addBatchSparseSinglePlace(
+        AggregateDataPtr place, const IColumn ** columns, Arena * arena) const = 0;
+
     /** The same for single place when need to aggregate only filtered data.
       * Instead of using an if-column, the condition is combined inside the null_map
       */
@@ -367,6 +379,22 @@ public:
         }
     }

+    void addBatchSparse(
+        AggregateDataPtr * places,
+        size_t place_offset,
+        const IColumn ** columns,
+        Arena * arena) const override
+    {
+        const auto & column_sparse = assert_cast<const ColumnSparse &>(*columns[0]);
+        const auto * values = &column_sparse.getValuesColumn();
+        size_t batch_size = column_sparse.size();
+        auto offset_it = column_sparse.begin();
+
+        for (size_t i = 0; i < batch_size; ++i, ++offset_it)
+            static_cast<const Derived *>(this)->add(places[offset_it.getCurrentRow()] + place_offset,
+                &values, offset_it.getValueIndex(), arena);
+    }
+
     void mergeBatch(
         size_t batch_size,
         AggregateDataPtr * places,
@@ -398,6 +426,19 @@ public:
         }
     }

+    void addBatchSparseSinglePlace(
+        AggregateDataPtr place, const IColumn ** columns, Arena * arena) const override
+    {
+        /// TODO: add values and defaults separately if order of adding isn't important.
+        const auto & column_sparse = assert_cast<const ColumnSparse &>(*columns[0]);
+        const auto * values = &column_sparse.getValuesColumn();
+        size_t batch_size = column_sparse.size();
+        auto offset_it = column_sparse.begin();
+
+        for (size_t i = 0; i < batch_size; ++i, ++offset_it)
+            static_cast<const Derived *>(this)->add(place, &values, offset_it.getValueIndex(), arena);
+    }
+
     void addBatchSinglePlaceNotNull(
         size_t batch_size,
         AggregateDataPtr place,
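For context, a sketch of how a caller might select the sparse path (the dispatch site is not part of this diff; the snippet only illustrates the intended use of the new virtuals):

// Hypothetical dispatch: take the sparse fast path when the single argument
// column is stored sparsely, so the column never has to be expanded to its
// full (dense) representation before aggregation.
void addSingleColumnBatch(
    const DB::IAggregateFunction & func, size_t rows,
    DB::AggregateDataPtr place, const DB::IColumn ** columns, DB::Arena * arena)
{
    if (typeid_cast<const DB::ColumnSparse *>(columns[0]))
        func.addBatchSparseSinglePlace(place, columns, arena);
    else
        func.addBatchSinglePlace(rows, place, columns, arena);
}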
@@ -107,7 +107,7 @@ if (USE_AWS_S3)
 endif()

 if (USE_AZURE_BLOB_STORAGE)
-    add_headers_and_sources(dbms Disks/BlobStorage)
+    add_headers_and_sources(dbms Disks/AzureBlobStorage)
 endif()

 if (USE_HDFS)
@@ -133,6 +133,11 @@ public:

     void get(size_t n, Field & res) const override;

+    bool isDefaultAt(size_t) const override
+    {
+        throw Exception("Method isDefaultAt is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
+    }
+
     StringRef getDataAt(size_t n) const override;

     void insertData(const char * pos, size_t length) override;
@@ -208,6 +213,16 @@ public:
         throw Exception("Method hasEqualValues is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
     }

+    double getRatioOfDefaultRows(double) const override
+    {
+        throw Exception("Method getRatioOfDefaultRows is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
+    }
+
+    void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override
+    {
+        throw Exception("Method getIndicesOfNonDefaultRows is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
+    }
+
     void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
     void updatePermutation(bool reverse, size_t limit, int, Permutation & res, EqualRanges & equal_range) const override;
@@ -182,6 +182,13 @@ StringRef ColumnArray::getDataAt(size_t n) const
 }


+bool ColumnArray::isDefaultAt(size_t n) const
+{
+    const auto & offsets_data = getOffsets();
+    return offsets_data[n] == offsets_data[static_cast<ssize_t>(n) - 1];
+}
+
+
 void ColumnArray::insertData(const char * pos, size_t length)
 {
     /** Similarly - only for arrays of fixed length values.
@@ -576,7 +583,8 @@ void ColumnArray::expand(const IColumn::Filter & mask, bool inverted)
     }

     if (from != -1)
-        throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);}
+        throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);
+}

 template <typename T>
 ColumnPtr ColumnArray::filterNumber(const Filter & filt, ssize_t result_size_hint) const
@@ -868,6 +876,16 @@ ColumnPtr ColumnArray::compress() const
     });
 }

+double ColumnArray::getRatioOfDefaultRows(double sample_ratio) const
+{
+    return getRatioOfDefaultRowsImpl<ColumnArray>(sample_ratio);
+}
+
+void ColumnArray::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const
+{
+    return getIndicesOfNonDefaultRowsImpl<ColumnArray>(indices, from, limit);
+}
+

 ColumnPtr ColumnArray::replicate(const Offsets & replicate_offsets) const
 {
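For intuition, a row of ColumnArray is "default" exactly when its array is empty, which the cumulative offsets encode as two equal consecutive values (an illustration, not part of the file):

// Arrays:  [[1, 2], [], [3]]
// Offsets: [2, 2, 3]            (cumulative end positions)
// isDefaultAt(1): offsets[1] == offsets[0] -> true, row 1 is the empty array.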
@@ -60,6 +60,7 @@ public:
     Field operator[](size_t n) const override;
     void get(size_t n, Field & res) const override;
     StringRef getDataAt(size_t n) const override;
+    bool isDefaultAt(size_t n) const override;
     void insertData(const char * pos, size_t length) override;
     StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
     const char * deserializeAndInsertFromArena(const char * pos) override;
@@ -143,6 +144,10 @@ public:
         return false;
     }

+    double getRatioOfDefaultRows(double sample_ratio) const override;
+
+    void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;
+
     bool isCollationSupported() const override { return getData().isCollationSupported(); }

 private:
@@ -82,6 +82,7 @@ public:
     Field operator[](size_t) const override { throwMustBeDecompressed(); }
     void get(size_t, Field &) const override { throwMustBeDecompressed(); }
     StringRef getDataAt(size_t) const override { throwMustBeDecompressed(); }
+    bool isDefaultAt(size_t) const override { throwMustBeDecompressed(); }
     void insert(const Field &) override { throwMustBeDecompressed(); }
     void insertRangeFrom(const IColumn &, size_t, size_t) override { throwMustBeDecompressed(); }
     void insertData(const char *, size_t) override { throwMustBeDecompressed(); }
@@ -113,6 +114,8 @@ public:
     void gather(ColumnGathererStream &) override { throwMustBeDecompressed(); }
     void getExtremes(Field &, Field &) const override { throwMustBeDecompressed(); }
     size_t byteSizeAt(size_t) const override { throwMustBeDecompressed(); }
+    double getRatioOfDefaultRows(double) const override { throwMustBeDecompressed(); }
+    void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeDecompressed(); }

 protected:
     size_t rows;
@@ -5,6 +5,7 @@
 #include <Columns/IColumn.h>
 #include <Common/typeid_cast.h>
 #include <Common/assert_cast.h>
+#include <Common/PODArray.h>


 namespace DB
@@ -115,6 +116,11 @@ public:
         return data->getFloat32(0);
     }

+    bool isDefaultAt(size_t) const override
+    {
+        return data->isDefaultAt(0);
+    }
+
     bool isNullAt(size_t) const override
     {
         return data->isNullAt(0);
@@ -239,6 +245,27 @@ public:
         return false;
     }

+    double getRatioOfDefaultRows(double) const override
+    {
+        return data->isDefaultAt(0) ? 1.0 : 0.0;
+    }
+
+    void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
+    {
+        if (!data->isDefaultAt(0))
+        {
+            size_t to = limit && from + limit < size() ? from + limit : size();
+            indices.reserve(indices.size() + to - from);
+            for (size_t i = from; i < to; ++i)
+                indices.push_back(i);
+        }
+    }
+
+    SerializationInfoPtr getSerializationInfo() const override
+    {
+        return data->getSerializationInfo();
+    }
+
     bool isNullable() const override { return isColumnNullable(*data); }
     bool onlyNull() const override { return data->isNullAt(0); }
     bool isNumeric() const override { return data->isNumeric(); }
@@ -177,8 +177,17 @@ public:
         return false;
     }

-    ColumnPtr compress() const override;
+    double getRatioOfDefaultRows(double sample_ratio) const override
+    {
+        return this->template getRatioOfDefaultRowsImpl<Self>(sample_ratio);
+    }
+
+    void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override
+    {
+        return this->template getIndicesOfNonDefaultRowsImpl<Self>(indices, from, limit);
+    }
+
+    ColumnPtr compress() const override;

     void insertValue(const T value) { data.push_back(value); }
     Container & getData() { return data; }
@@ -51,6 +51,12 @@ MutableColumnPtr ColumnFixedString::cloneResized(size_t size) const
     return new_col_holder;
 }

+bool ColumnFixedString::isDefaultAt(size_t index) const
+{
+    assert(index < size());
+    return memoryIsZero(chars.data() + index * n, n);
+}
+
 void ColumnFixedString::insert(const Field & x)
 {
     const String & s = DB::get<const String &>(x);
@@ -88,6 +88,8 @@ public:
         return StringRef(&chars[n * index], n);
     }

+    bool isDefaultAt(size_t index) const override;
+
     void insert(const Field & x) override;

     void insertFrom(const IColumn & src_, size_t index) override;
@@ -182,6 +184,16 @@ public:
         return false;
     }

+    double getRatioOfDefaultRows(double sample_ratio) const override
+    {
+        return getRatioOfDefaultRowsImpl<ColumnFixedString>(sample_ratio);
+    }
+
+    void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
+    {
+        return getIndicesOfNonDefaultRowsImpl<ColumnFixedString>(indices, from, limit);
+    }
+
     bool canBeInsideNullable() const override { return true; }

     bool isFixedAndContiguous() const override { return true; }
@@ -68,6 +68,11 @@ public:
         throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED);
     }

+    bool isDefaultAt(size_t) const override
+    {
+        throw Exception("isDefaultAt is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
+    }
+
     void insert(const Field &) override
     {
         throw Exception("Cannot insert into " + getName(), ErrorCodes::NOT_IMPLEMENTED);
@@ -153,6 +158,16 @@ public:
         throw Exception("Method gather is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
     }

+    double getRatioOfDefaultRows(double) const override
+    {
+        throw Exception("Method getRatioOfDefaultRows is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
+    }
+
+    void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override
+    {
+        throw Exception("Method getIndicesOfNonDefaultRows is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
+    }
+
     bool isShortCircuitArgument() const { return is_short_circuit_argument; }

     DataTypePtr getResultType() const;
@@ -64,6 +64,7 @@ public:
         return getDictionary().getDataAtWithTerminatingZero(getIndexes().getUInt(n));
     }

+    bool isDefaultAt(size_t n) const override { return getDictionary().isDefaultAt(getIndexes().getUInt(n)); }
     UInt64 get64(size_t n) const override { return getDictionary().get64(getIndexes().getUInt(n)); }
     UInt64 getUInt(size_t n) const override { return getDictionary().getUInt(getIndexes().getUInt(n)); }
     Int64 getInt(size_t n) const override { return getDictionary().getInt(getIndexes().getUInt(n)); }
@@ -180,6 +181,16 @@ public:
         return false;
     }

+    double getRatioOfDefaultRows(double sample_ratio) const override
+    {
+        return getIndexes().getRatioOfDefaultRows(sample_ratio);
+    }
+
+    void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
+    {
+        return getIndexes().getIndicesOfNonDefaultRows(indices, from, limit);
+    }
+
     bool valuesHaveFixedSize() const override { return getDictionary().valuesHaveFixedSize(); }
     bool isFixedAndContiguous() const override { return false; }
     size_t sizeOfValueIfFixed() const override { return getDictionary().sizeOfValueIfFixed(); }
@@ -81,6 +81,11 @@ void ColumnMap::get(size_t n, Field & res) const
         getNestedData().get(offset + i, map[i]);
 }

+bool ColumnMap::isDefaultAt(size_t n) const
+{
+    return nested->isDefaultAt(n);
+}
+
 StringRef ColumnMap::getDataAt(size_t) const
 {
     throw Exception("Method getDataAt is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
@@ -273,6 +278,16 @@ bool ColumnMap::structureEquals(const IColumn & rhs) const
     return false;
 }

+double ColumnMap::getRatioOfDefaultRows(double sample_ratio) const
+{
+    return getRatioOfDefaultRowsImpl<ColumnMap>(sample_ratio);
+}
+
+void ColumnMap::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const
+{
+    return getIndicesOfNonDefaultRowsImpl<ColumnMap>(indices, from, limit);
+}
+
 ColumnPtr ColumnMap::compress() const
 {
     auto compressed = nested->compress();
@@ -51,6 +51,7 @@ public:
     Field operator[](size_t n) const override;
     void get(size_t n, Field & res) const override;

+    bool isDefaultAt(size_t n) const override;
     StringRef getDataAt(size_t n) const override;
     void insertData(const char * pos, size_t length) override;
     void insert(const Field & x) override;
@@ -85,6 +86,8 @@ public:
     void protect() override;
     void forEachSubcolumn(ColumnCallback callback) override;
     bool structureEquals(const IColumn & rhs) const override;
+    double getRatioOfDefaultRows(double sample_ratio) const override;
+    void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;

     const ColumnArray & getNestedColumn() const { return assert_cast<const ColumnArray &>(*nested); }
     ColumnArray & getNestedColumn() { return assert_cast<ColumnArray &>(*nested); }
@@ -648,6 +648,29 @@ void ColumnNullable::checkConsistency() const
             ErrorCodes::SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT);
 }

+ColumnPtr ColumnNullable::createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const
+{
+    ColumnPtr new_values;
+    ColumnPtr new_null_map;
+
+    if (default_field.getType() == Field::Types::Null)
+    {
+        auto default_column = nested_column->cloneEmpty();
+        default_column->insertDefault();
+
+        /// The value in the main column when the null map is 1 is implementation defined. So, take any value.
+        new_values = nested_column->createWithOffsets(offsets, (*default_column)[0], total_rows, shift);
+        new_null_map = null_map->createWithOffsets(offsets, Field(1u), total_rows, shift);
+    }
+    else
+    {
+        new_values = nested_column->createWithOffsets(offsets, default_field, total_rows, shift);
+        new_null_map = null_map->createWithOffsets(offsets, Field(0u), total_rows, shift);
+    }
+
+    return ColumnNullable::create(new_values, new_null_map);
+}
+
 ColumnPtr makeNullable(const ColumnPtr & column)
 {
     if (isColumnNullable(*column))
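A sketch of the semantics assumed here for createWithOffsets: the column's rows are scattered to the positions listed in `offsets`, and every other position is filled with `default_field` (illustration only, not part of the diff):

// Nested values (after skipping `shift` leading rows): [10, 20, 30]
// offsets = [1, 4, 6], total_rows = 8, default_field = 0
// Result:  [0, 10, 0, 0, 20, 0, 30, 0]
//
// ColumnNullable applies the same offsets twice: once to the nested column and
// once to the null map, whose gaps get 1 when default_field is Null, else 0.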
@@ -54,6 +54,7 @@ public:
     void get(size_t n, Field & res) const override;
     bool getBool(size_t n) const override { return isNullAt(n) ? false : nested_column->getBool(n); }
     UInt64 get64(size_t n) const override { return nested_column->get64(n); }
+    bool isDefaultAt(size_t n) const override { return isNullAt(n); }

     /**
      * If isNullAt(n) returns false, returns the nested column's getDataAt(n), otherwise returns a special value
@@ -137,6 +138,18 @@ public:
         return false;
     }

+    double getRatioOfDefaultRows(double sample_ratio) const override
+    {
+        return null_map->getRatioOfDefaultRows(sample_ratio);
+    }
+
+    void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
+    {
+        null_map->getIndicesOfNonDefaultRows(indices, from, limit);
+    }
+
+    ColumnPtr createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override;
+
     bool isNullable() const override { return true; }
     bool isFixedAndContiguous() const override { return false; }
     bool valuesHaveFixedSize() const override { return nested_column->valuesHaveFixedSize(); }
src/Columns/ColumnSparse.cpp (new file, 779 lines)

@@ -0,0 +1,779 @@
#include <Columns/ColumnSparse.h>
#include <Columns/ColumnsCommon.h>
#include <Columns/ColumnCompressed.h>
#include <Columns/ColumnTuple.h>
#include <Common/WeakHash.h>
#include <Common/SipHash.h>
#include <Common/HashTable/Hash.h>
#include <Processors/Transforms/ColumnGathererTransform.h>

#include <algorithm>

namespace DB
{

namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
    extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
}

ColumnSparse::ColumnSparse(MutableColumnPtr && values_)
    : values(std::move(values_)), _size(0)
{
    if (!values->empty())
        throw Exception("Not empty values passed to ColumnSparse, but no offsets passed", ErrorCodes::LOGICAL_ERROR);

    values->insertDefault();
    offsets = ColumnUInt64::create();
}

ColumnSparse::ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offsets_, size_t size_)
    : values(std::move(values_)), offsets(std::move(offsets_)), _size(size_)
{
    const ColumnUInt64 * offsets_concrete = typeid_cast<const ColumnUInt64 *>(offsets.get());

    if (!offsets_concrete)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "'offsets' column must be a ColumnUInt64, got: {}", offsets->getName());

    /// 'values' should contain one extra element: the default value at position 0.
    if (offsets->size() + 1 != values->size())
        throw Exception(ErrorCodes::LOGICAL_ERROR,
            "Values size ({}) is inconsistent with offsets size ({})", values->size(), offsets->size());

    if (_size < offsets->size())
        throw Exception(ErrorCodes::LOGICAL_ERROR,
            "Size of sparse column ({}) cannot be lower than number of non-default values ({})", _size, offsets->size());

    if (!offsets_concrete->empty() && _size <= offsets_concrete->getData().back())
        throw Exception(ErrorCodes::LOGICAL_ERROR,
            "Size of sparse column ({}) should be greater than last position of non-default value ({})",
            _size, offsets_concrete->getData().back());

#ifndef NDEBUG
    const auto & offsets_data = getOffsetsData();
    const auto * it = std::adjacent_find(offsets_data.begin(), offsets_data.end(), std::greater_equal<UInt64>());
    if (it != offsets_data.end())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Offsets of ColumnSparse must be strictly sorted");
#endif
}

MutableColumnPtr ColumnSparse::cloneResized(size_t new_size) const
{
    if (new_size == 0)
        return ColumnSparse::create(values->cloneEmpty());

    if (new_size >= _size)
        return ColumnSparse::create(IColumn::mutate(values), IColumn::mutate(offsets), new_size);

    auto res = ColumnSparse::create(values->cloneEmpty());
    res->insertRangeFrom(*this, 0, new_size);
    return res;
}

bool ColumnSparse::isDefaultAt(size_t n) const
{
    return getValueIndex(n) == 0;
}
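The invariants checked above pin down the representation. For intuition (an illustration, not part of the file):

/// values  = [<default>, 10, 20]   -- one extra shared default at index 0
/// offsets = [1, 4]                -- rows that hold the non-default values
/// _size   = 6
/// Full column: [0, 10, 0, 0, 20, 0]; isDefaultAt(n) is true wherever
/// n is absent from offsets (getValueIndex(n) == 0).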
bool ColumnSparse::isNullAt(size_t n) const
{
    return values->isNullAt(getValueIndex(n));
}

Field ColumnSparse::operator[](size_t n) const
{
    return (*values)[getValueIndex(n)];
}

void ColumnSparse::get(size_t n, Field & res) const
{
    values->get(getValueIndex(n), res);
}

bool ColumnSparse::getBool(size_t n) const
{
    return values->getBool(getValueIndex(n));
}

Float64 ColumnSparse::getFloat64(size_t n) const
{
    return values->getFloat64(getValueIndex(n));
}

Float32 ColumnSparse::getFloat32(size_t n) const
{
    return values->getFloat32(getValueIndex(n));
}

UInt64 ColumnSparse::getUInt(size_t n) const
{
    return values->getUInt(getValueIndex(n));
}

Int64 ColumnSparse::getInt(size_t n) const
{
    return values->getInt(getValueIndex(n));
}

UInt64 ColumnSparse::get64(size_t n) const
{
    return values->get64(getValueIndex(n));
}

StringRef ColumnSparse::getDataAt(size_t n) const
{
    return values->getDataAt(getValueIndex(n));
}

ColumnPtr ColumnSparse::convertToFullColumnIfSparse() const
{
    return values->createWithOffsets(getOffsetsData(), (*values)[0], _size, /*shift=*/ 1);
}
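getValueIndex(n), on which the accessors above rely, is not shown in this mirror. A sketch of the assumed implementation, consistent with the invariants above (binary search over the offsets, returning 0, the shared default slot, for default rows):

size_t ColumnSparse::getValueIndex(size_t n) const
{
    assert(n < _size);
    const auto & offsets_data = getOffsetsData();
    const auto * it = std::lower_bound(offsets_data.begin(), offsets_data.end(), n);
    if (it == offsets_data.end() || *it != n)
        return 0;  /// row n is a default row
    return it - offsets_data.begin() + 1;  /// +1 because values[0] is the shared default
}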
void ColumnSparse::insertSingleValue(const Inserter & inserter)
|
||||
{
|
||||
inserter(*values);
|
||||
|
||||
size_t last_idx = values->size() - 1;
|
||||
if (values->isDefaultAt(last_idx))
|
||||
values->popBack(1);
|
||||
else
|
||||
getOffsetsData().push_back(_size);
|
||||
|
||||
++_size;
|
||||
}
|
||||
|
||||
void ColumnSparse::insertData(const char * pos, size_t length)
|
||||
{
|
||||
insertSingleValue([&](IColumn & column) { column.insertData(pos, length); });
|
||||
}
|
||||
|
||||
StringRef ColumnSparse::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const
|
||||
{
|
||||
return values->serializeValueIntoArena(getValueIndex(n), arena, begin);
|
||||
}
|
||||
|
||||
const char * ColumnSparse::deserializeAndInsertFromArena(const char * pos)
|
||||
{
|
||||
const char * res = nullptr;
|
||||
insertSingleValue([&](IColumn & column) { res = column.deserializeAndInsertFromArena(pos); });
|
||||
return res;
|
||||
}
|
||||
|
||||
const char * ColumnSparse::skipSerializedInArena(const char * pos) const
|
||||
{
|
||||
return values->skipSerializedInArena(pos);
|
||||
}
|
||||
|
||||
void ColumnSparse::insertRangeFrom(const IColumn & src, size_t start, size_t length)
|
||||
{
|
||||
if (length == 0)
|
||||
return;
|
||||
|
||||
if (start + length > src.size())
|
||||
throw Exception("Parameter out of bound in IColumnString::insertRangeFrom method.",
|
||||
ErrorCodes::LOGICAL_ERROR);
|
||||
|
||||
auto & offsets_data = getOffsetsData();
|
||||
|
||||
size_t end = start + length;
|
||||
if (const auto * src_sparse = typeid_cast<const ColumnSparse *>(&src))
|
||||
{
|
||||
const auto & src_offsets = src_sparse->getOffsetsData();
|
||||
const auto & src_values = src_sparse->getValuesColumn();
|
||||
|
||||
size_t offset_start = std::lower_bound(src_offsets.begin(), src_offsets.end(), start) - src_offsets.begin();
|
||||
size_t offset_end = std::lower_bound(src_offsets.begin(), src_offsets.end(), end) - src_offsets.begin();
|
||||
assert(offset_start <= offset_end);
|
||||
|
||||
if (offset_start != offset_end)
|
||||
{
|
||||
offsets_data.reserve(offsets_data.size() + offset_end - offset_start);
|
||||
insertManyDefaults(src_offsets[offset_start] - start);
|
||||
offsets_data.push_back(_size);
|
||||
++_size;
|
||||
|
||||
for (size_t i = offset_start + 1; i < offset_end; ++i)
|
||||
{
|
||||
size_t current_diff = src_offsets[i] - src_offsets[i - 1];
|
||||
insertManyDefaults(current_diff - 1);
|
||||
offsets_data.push_back(_size);
|
||||
++_size;
|
||||
}
|
||||
|
||||
/// 'end' <= 'src_offsets[offsets_end]', but end is excluded, so index is 'offsets_end' - 1.
|
||||
/// Since 'end' is excluded, need to subtract one more row from result.
|
||||
insertManyDefaults(end - src_offsets[offset_end - 1] - 1);
|
||||
values->insertRangeFrom(src_values, offset_start + 1, offset_end - offset_start);
|
||||
}
|
||||
else
|
||||
{
|
||||
insertManyDefaults(length);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (size_t i = start; i < end; ++i)
|
||||
{
|
||||
if (!src.isDefaultAt(i))
|
||||
{
|
||||
values->insertFrom(src, i);
|
||||
offsets_data.push_back(_size);
|
||||
}
|
||||
|
||||
++_size;
|
||||
}
|
||||
}
|
||||
}

void ColumnSparse::insert(const Field & x)
{
    insertSingleValue([&](IColumn & column) { column.insert(x); });
}

void ColumnSparse::insertFrom(const IColumn & src, size_t n)
{
    if (const auto * src_sparse = typeid_cast<const ColumnSparse *>(&src))
    {
        if (size_t value_index = src_sparse->getValueIndex(n))
        {
            getOffsetsData().push_back(_size);
            values->insertFrom(src_sparse->getValuesColumn(), value_index);
        }
    }
    else
    {
        if (!src.isDefaultAt(n))
        {
            values->insertFrom(src, n);
            getOffsetsData().push_back(_size);
        }
    }

    ++_size;
}

void ColumnSparse::insertDefault()
{
    ++_size;
}

void ColumnSparse::insertManyDefaults(size_t length)
{
    _size += length;
}

void ColumnSparse::popBack(size_t n)
{
    assert(n < _size);

    auto & offsets_data = getOffsetsData();
    size_t new_size = _size - n;

    size_t removed_values = 0;
    while (!offsets_data.empty() && offsets_data.back() >= new_size)
    {
        offsets_data.pop_back();
        ++removed_values;
    }

    if (removed_values)
        values->popBack(removed_values);

    _size = new_size;
}

ColumnPtr ColumnSparse::filter(const Filter & filt, ssize_t) const
{
    if (_size != filt.size())
        throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);

    if (offsets->empty())
    {
        auto res = cloneEmpty();
        res->insertManyDefaults(countBytesInFilter(filt));
        return res;
    }

    auto res_offsets = offsets->cloneEmpty();
    auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();

    Filter values_filter;
    values_filter.reserve(values->size());
    values_filter.push_back(1);
    size_t values_result_size_hint = 1;

    size_t res_offset = 0;
    auto offset_it = begin();
    for (size_t i = 0; i < _size; ++i, ++offset_it)
    {
        if (!offset_it.isDefault())
        {
            if (filt[i])
            {
                res_offsets_data.push_back(res_offset);
                values_filter.push_back(1);
                ++res_offset;
                ++values_result_size_hint;
            }
            else
            {
                values_filter.push_back(0);
            }
        }
        else
        {
            res_offset += filt[i] != 0;
        }
    }

    auto res_values = values->filter(values_filter, values_result_size_hint);
    return this->create(std::move(res_values), std::move(res_offsets), res_offset);
}
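
One subtlety in the filter above: the nested values column always carries the shared default at position 0, so the nested mask is seeded with values_filter.push_back(1) before any row is examined, and the result size hint starts at 1. A minimal sketch of the same bookkeeping on plain vectors (the function and names below are illustrative, not part of the patch):

#include <cstdint>
#include <vector>

// Sketch: filter a sparse pair (sorted offsets of non-default rows, values
// with a sentinel default at index 0) by a 0/1 mask, mirroring the row walk
// in ColumnSparse::filter.
static void filterSparse(
    std::vector<uint64_t> & offsets, std::vector<int> & values,
    const std::vector<uint8_t> & mask, size_t size)
{
    std::vector<uint64_t> res_offsets;
    std::vector<int> res_values{values[0]}; /// Keep the shared default at slot 0.

    size_t res_size = 0; /// Size of the filtered column.
    size_t next = 0;     /// Index of the next non-default offset.
    for (size_t i = 0; i < size; ++i)
    {
        bool is_default = next == offsets.size() || offsets[next] != i;
        if (!is_default)
        {
            if (mask[i])
            {
                res_offsets.push_back(res_size++);
                res_values.push_back(values[next + 1]);
            }
            ++next;
        }
        else
            res_size += mask[i] != 0; /// Kept default rows only shift positions.
    }

    offsets = std::move(res_offsets);
    values = std::move(res_values);
}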

void ColumnSparse::expand(const Filter & mask, bool inverted)
{
    if (mask.size() < _size)
        throw Exception("Mask size should be no less than data size.", ErrorCodes::LOGICAL_ERROR);

    auto res_offsets = offsets->cloneEmpty();
    auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();

    auto it = begin();
    for (size_t i = 0; i < mask.size(); ++i)
    {
        if (!!mask[i] ^ inverted)
        {
            if (it.getCurrentRow() == _size)
                throw Exception("Too many bytes in mask", ErrorCodes::LOGICAL_ERROR);

            if (!it.isDefault())
                res_offsets_data[it.getCurrentOffset()] = i;

            ++it;
        }
    }

    _size = mask.size();
}

ColumnPtr ColumnSparse::permute(const Permutation & perm, size_t limit) const
{
    return permuteImpl(*this, perm, limit);
}

ColumnPtr ColumnSparse::index(const IColumn & indexes, size_t limit) const
{
    return selectIndexImpl(*this, indexes, limit);
}

template <typename Type>
ColumnPtr ColumnSparse::indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const
{
    assert(limit <= indexes.size());
    if (limit == 0)
        return ColumnSparse::create(values->cloneEmpty());

    if (offsets->empty())
    {
        auto res = cloneEmpty();
        res->insertManyDefaults(limit);
        return res;
    }

    auto res_offsets = offsets->cloneEmpty();
    auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();
    auto res_values = values->cloneEmpty();
    res_values->insertDefault();

    /// If we need to permute the full column, or if the limit is large enough,
    /// it's better to save the indexes of values in O(size)
    /// and avoid a binary search for obtaining every index.
    /// 3 is just a guess for the overhead of copying indexes.
    bool execute_linear =
        limit == _size || limit * std::bit_width(offsets->size()) > _size * 3;

    if (execute_linear)
    {
        PaddedPODArray<UInt64> values_index(_size);
        auto offset_it = begin();
        for (size_t i = 0; i < _size; ++i, ++offset_it)
            values_index[i] = offset_it.getValueIndex();

        for (size_t i = 0; i < limit; ++i)
        {
            size_t index = values_index[indexes[i]];
            if (index != 0)
            {
                res_values->insertFrom(*values, index);
                res_offsets_data.push_back(i);
            }
        }
    }
    else
    {
        for (size_t i = 0; i < limit; ++i)
        {
            size_t index = getValueIndex(indexes[i]);
            if (index != 0)
            {
                res_values->insertFrom(*values, index);
                res_offsets_data.push_back(i);
            }
        }
    }

    return ColumnSparse::create(std::move(res_values), std::move(res_offsets), limit);
}
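
For a concrete feel of the crossover in indexImpl: with _size = 10000 rows and 100 non-default values, each binary search costs about std::bit_width(100) = 7 steps, so limit lookups cost 7 * limit, while the linear precomputation is charged 3 * 10000 = 30000; the linear path is chosen once limit exceeds roughly 4300. The same test restated in isolation (an illustrative helper, not from the patch):

#include <bit>
#include <cstddef>

/// Illustrative restatement of the crossover test used in indexImpl:
/// take the O(size) linear precomputation when the total cost of
/// per-row binary searches would exceed it (3x is the copy-overhead guess).
bool preferLinearScan(size_t limit, size_t num_offsets, size_t size)
{
    return limit == size || limit * std::bit_width(num_offsets) > size * 3;
}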

int ColumnSparse::compareAt(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint) const
{
    if (const auto * rhs_sparse = typeid_cast<const ColumnSparse *>(&rhs_))
        return values->compareAt(getValueIndex(n), rhs_sparse->getValueIndex(m), rhs_sparse->getValuesColumn(), null_direction_hint);

    return values->compareAt(getValueIndex(n), m, rhs_, null_direction_hint);
}

void ColumnSparse::compareColumn(const IColumn & rhs, size_t rhs_row_num,
    PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
    int direction, int nan_direction_hint) const
{
    if (row_indexes)
    {
        /// TODO: implement without conversion to full column.
        auto this_full = convertToFullColumnIfSparse();
        auto rhs_full = rhs.convertToFullColumnIfSparse();
        this_full->compareColumn(*rhs_full, rhs_row_num, row_indexes, compare_results, direction, nan_direction_hint);
    }
    else
    {
        const auto & rhs_sparse = assert_cast<const ColumnSparse &>(rhs);
        PaddedPODArray<Int8> nested_result;
        values->compareColumn(rhs_sparse.getValuesColumn(), rhs_sparse.getValueIndex(rhs_row_num),
            nullptr, nested_result, direction, nan_direction_hint);

        const auto & offsets_data = getOffsetsData();
        compare_results.resize_fill(_size, nested_result[0]);
        for (size_t i = 0; i < offsets_data.size(); ++i)
            compare_results[offsets_data[i]] = nested_result[i + 1];
    }
}

int ColumnSparse::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int null_direction_hint, const Collator & collator) const
{
    if (const auto * rhs_sparse = typeid_cast<const ColumnSparse *>(&rhs))
        return values->compareAtWithCollation(getValueIndex(n), rhs_sparse->getValueIndex(m), rhs_sparse->getValuesColumn(), null_direction_hint, collator);

    return values->compareAtWithCollation(getValueIndex(n), m, rhs, null_direction_hint, collator);
}

bool ColumnSparse::hasEqualValues() const
{
    size_t num_defaults = getNumberOfDefaults();
    if (num_defaults == _size)
        return true;

    /// Has at least one default and one non-default value.
    if (num_defaults != 0)
        return false;

    /// Check whether all non-default values are equal.
    /// It's suboptimal, but it's a rare case.
    for (size_t i = 2; i < values->size(); ++i)
        if (values->compareAt(1, i, *values, 1) != 0)
            return false;

    return true;
}

void ColumnSparse::getPermutationImpl(bool reverse, size_t limit, int null_direction_hint, Permutation & res, const Collator * collator) const
{
    if (_size == 0)
        return;

    res.resize(_size);
    if (offsets->empty())
    {
        for (size_t i = 0; i < _size; ++i)
            res[i] = i;
        return;
    }

    if (limit == 0 || limit > _size)
        limit = _size;

    Permutation perm;
    /// Firstly we sort all values.
    /// limit + 1 for the case when there are no default values.
    if (collator)
        values->getPermutationWithCollation(*collator, reverse, limit + 1, null_direction_hint, perm);
    else
        values->getPermutation(reverse, limit + 1, null_direction_hint, perm);

    size_t num_of_defaults = getNumberOfDefaults();
    size_t row = 0;

    const auto & offsets_data = getOffsetsData();

    /// Fill the permutation.
    for (size_t i = 0; i < perm.size() && row < limit; ++i)
    {
        if (perm[i] == 0)
        {
            if (!num_of_defaults)
                continue;

            /// Fill the positions of default values in the required quantity.
            auto offset_it = begin();
            while (row < limit)
            {
                while (offset_it.getCurrentRow() < _size && !offset_it.isDefault())
                    ++offset_it;

                if (offset_it.getCurrentRow() == _size)
                    break;

                res[row++] = offset_it.getCurrentRow();
                ++offset_it;
            }
        }
        else
        {
            res[row++] = offsets_data[perm[i] - 1];
        }
    }

    assert(row == limit);
}

void ColumnSparse::getPermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res) const
{
    return getPermutationImpl(reverse, limit, null_direction_hint, res, nullptr);
}

void ColumnSparse::updatePermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_range) const
{
    auto this_full = convertToFullColumnIfSparse();
    this_full->updatePermutation(reverse, limit, null_direction_hint, res, equal_range);
}

void ColumnSparse::getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res) const
{
    return getPermutationImpl(reverse, limit, null_direction_hint, res, &collator);
}

void ColumnSparse::updatePermutationWithCollation(
    const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_range) const
{
    auto this_full = convertToFullColumnIfSparse();
    this_full->updatePermutationWithCollation(collator, reverse, limit, null_direction_hint, res, equal_range);
}

size_t ColumnSparse::byteSize() const
{
    return values->byteSize() + offsets->byteSize() + sizeof(_size);
}

size_t ColumnSparse::byteSizeAt(size_t n) const
{
    size_t index = getValueIndex(n);
    size_t res = values->byteSizeAt(index);
    if (index)
        res += sizeof(UInt64);

    return res;
}

size_t ColumnSparse::allocatedBytes() const
{
    return values->allocatedBytes() + offsets->allocatedBytes() + sizeof(_size);
}

void ColumnSparse::protect()
{
    values->protect();
    offsets->protect();
}

ColumnPtr ColumnSparse::replicate(const Offsets & replicate_offsets) const
{
    /// TODO: implement specializations.
    if (_size != replicate_offsets.size())
        throw Exception("Size of offsets doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);

    if (_size == 0)
        return ColumnSparse::create(values->cloneEmpty());

    auto res_offsets = offsets->cloneEmpty();
    auto & res_offsets_data = assert_cast<ColumnUInt64 &>(*res_offsets).getData();
    auto res_values = values->cloneEmpty();
    res_values->insertDefault();

    auto offset_it = begin();
    for (size_t i = 0; i < _size; ++i, ++offset_it)
    {
        if (!offset_it.isDefault())
        {
            size_t replicate_size = replicate_offsets[i] - replicate_offsets[i - 1];
            res_offsets_data.reserve(res_offsets_data.size() + replicate_size);
            for (size_t row = replicate_offsets[i - 1]; row < replicate_offsets[i]; ++row)
            {
                res_offsets_data.push_back(row);
                res_values->insertFrom(*values, offset_it.getValueIndex());
            }
        }
    }

    return ColumnSparse::create(std::move(res_values), std::move(res_offsets), replicate_offsets.back());
}

void ColumnSparse::updateHashWithValue(size_t n, SipHash & hash) const
{
    values->updateHashWithValue(getValueIndex(n), hash);
}

void ColumnSparse::updateWeakHash32(WeakHash32 & hash) const
{
    if (hash.getData().size() != _size)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
            "column size is {}, hash size is {}", _size, hash.getData().size());

    auto offset_it = begin();
    auto & hash_data = hash.getData();
    for (size_t i = 0; i < _size; ++i, ++offset_it)
    {
        size_t value_index = offset_it.getValueIndex();
        auto data_ref = values->getDataAt(value_index);
        hash_data[i] = ::updateWeakHash32(reinterpret_cast<const UInt8 *>(data_ref.data), data_ref.size, hash_data[i]);
    }
}

void ColumnSparse::updateHashFast(SipHash & hash) const
{
    values->updateHashFast(hash);
    offsets->updateHashFast(hash);
    hash.update(_size);
}

void ColumnSparse::getExtremes(Field & min, Field & max) const
{
    if (_size == 0)
    {
        values->get(0, min);
        values->get(0, max);
        return;
    }

    if (getNumberOfDefaults() == 0)
    {
        size_t min_idx = 1;
        size_t max_idx = 1;

        for (size_t i = 2; i < values->size(); ++i)
        {
            if (values->compareAt(i, min_idx, *values, 1) < 0)
                min_idx = i;
            else if (values->compareAt(i, max_idx, *values, 1) > 0)
                max_idx = i;
        }

        values->get(min_idx, min);
        values->get(max_idx, max);
        return;
    }

    values->getExtremes(min, max);
}

void ColumnSparse::getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const
{
    const auto & offsets_data = getOffsetsData();
    const auto * start = from ? std::lower_bound(offsets_data.begin(), offsets_data.end(), from) : offsets_data.begin();
    const auto * end = limit ? std::lower_bound(offsets_data.begin(), offsets_data.end(), from + limit) : offsets_data.end();

    indices.insert(start, end);
}

double ColumnSparse::getRatioOfDefaultRows(double) const
{
    return static_cast<double>(getNumberOfDefaults()) / _size;
}

MutableColumns ColumnSparse::scatter(ColumnIndex num_columns, const Selector & selector) const
{
    return scatterImpl<ColumnSparse>(num_columns, selector);
}

void ColumnSparse::gather(ColumnGathererStream & gatherer_stream)
{
    gatherer_stream.gather(*this);
}

ColumnPtr ColumnSparse::compress() const
{
    auto values_compressed = values->compress();
    auto offsets_compressed = offsets->compress();

    size_t byte_size = values_compressed->byteSize() + offsets_compressed->byteSize();

    return ColumnCompressed::create(size(), byte_size,
        [values_compressed = std::move(values_compressed), offsets_compressed = std::move(offsets_compressed), size = size()]
        {
            return ColumnSparse::create(values_compressed->decompress(), offsets_compressed->decompress(), size);
        });
}

bool ColumnSparse::structureEquals(const IColumn & rhs) const
{
    if (const auto * rhs_sparse = typeid_cast<const ColumnSparse *>(&rhs))
        return values->structureEquals(*rhs_sparse->values);
    return false;
}

void ColumnSparse::forEachSubcolumn(ColumnCallback callback)
{
    callback(values);
    callback(offsets);
}

const IColumn::Offsets & ColumnSparse::getOffsetsData() const
{
    return assert_cast<const ColumnUInt64 &>(*offsets).getData();
}

IColumn::Offsets & ColumnSparse::getOffsetsData()
{
    return assert_cast<ColumnUInt64 &>(*offsets).getData();
}

size_t ColumnSparse::getValueIndex(size_t n) const
{
    assert(n < _size);

    const auto & offsets_data = getOffsetsData();
    const auto * it = std::lower_bound(offsets_data.begin(), offsets_data.end(), n);
    if (it == offsets_data.end() || *it != n)
        return 0;

    return it - offsets_data.begin() + 1;
}

ColumnPtr recursiveRemoveSparse(const ColumnPtr & column)
{
    if (!column)
        return column;

    if (const auto * column_tuple = typeid_cast<const ColumnTuple *>(column.get()))
    {
        auto columns = column_tuple->getColumns();
        for (auto & element : columns)
            element = recursiveRemoveSparse(element);

        return ColumnTuple::create(columns);
    }

    return column->convertToFullColumnIfSparse();
}

}
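
To make the layout concrete before the header below: a full column [0, 0, 5, 0, 7] is stored as values = [<default>, 5, 7] and offsets = [2, 4]. A minimal sketch of the row-to-value mapping that getValueIndex implements, on plain vectors (illustrative only, not code from the patch):

#include <algorithm>
#include <cstdint>
#include <vector>

/// Sketch of ColumnSparse::getValueIndex on plain vectors: returns 0
/// (the shared default) for default rows, otherwise the 1-based position
/// of the row's value in 'values', found by binary search over 'offsets'.
size_t valueIndex(const std::vector<uint64_t> & offsets, size_t n)
{
    const auto it = std::lower_bound(offsets.begin(), offsets.end(), n);
    if (it == offsets.end() || *it != n)
        return 0;
    return it - offsets.begin() + 1;
}

/// For offsets = {2, 4}: valueIndex(0) == 0, valueIndex(2) == 1, valueIndex(4) == 2.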

231 src/Columns/ColumnSparse.h Normal file
@ -0,0 +1,231 @@

#pragma once

#include <Columns/IColumn.h>
#include <Columns/IColumnImpl.h>
#include <Columns/ColumnsNumber.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>

class Collator;

namespace DB
{

/** Column for sparse representation.
 *  It stores a column with the non-default values and a column
 *  with their sorted positions in the original column. The column
 *  with values also contains one default value at position 0 to make
 *  the implementation of function execution and sorting more convenient.
 */
class ColumnSparse final : public COWHelper<IColumn, ColumnSparse>
{
private:
    friend class COWHelper<IColumn, ColumnSparse>;

    explicit ColumnSparse(MutableColumnPtr && values_);
    ColumnSparse(MutableColumnPtr && values_, MutableColumnPtr && offsets_, size_t size_);
    ColumnSparse(const ColumnSparse &) = default;

public:
    static constexpr auto DEFAULT_ROWS_SEARCH_SAMPLE_RATIO = 0.1;
    static constexpr auto DEFAULT_RATIO_FOR_SPARSE_SERIALIZATION = 0.95;

    using Base = COWHelper<IColumn, ColumnSparse>;
    static Ptr create(const ColumnPtr & values_, const ColumnPtr & offsets_, size_t size_)
    {
        return Base::create(values_->assumeMutable(), offsets_->assumeMutable(), size_);
    }

    template <typename TColumnPtr, typename = typename std::enable_if<IsMutableColumns<TColumnPtr>::value>::type>
    static MutablePtr create(TColumnPtr && values_, TColumnPtr && offsets_, size_t size_)
    {
        return Base::create(std::move(values_), std::move(offsets_), size_);
    }

    static Ptr create(const ColumnPtr & values_)
    {
        return Base::create(values_->assumeMutable());
    }

    template <typename TColumnPtr, typename = typename std::enable_if<IsMutableColumns<TColumnPtr>::value>::type>
    static MutablePtr create(TColumnPtr && values_)
    {
        return Base::create(std::forward<TColumnPtr>(values_));
    }

    bool isSparse() const override { return true; }
    const char * getFamilyName() const override { return "Sparse"; }
    std::string getName() const override { return "Sparse(" + values->getName() + ")"; }
    TypeIndex getDataType() const override { return values->getDataType(); }
    MutableColumnPtr cloneResized(size_t new_size) const override;
    size_t size() const override { return _size; }
    bool isDefaultAt(size_t n) const override;
    bool isNullAt(size_t n) const override;
    Field operator[](size_t n) const override;
    void get(size_t n, Field & res) const override;
    bool getBool(size_t n) const override;
    Float64 getFloat64(size_t n) const override;
    Float32 getFloat32(size_t n) const override;
    UInt64 getUInt(size_t n) const override;
    Int64 getInt(size_t n) const override;
    UInt64 get64(size_t n) const override;
    StringRef getDataAt(size_t n) const override;

    ColumnPtr convertToFullColumnIfSparse() const override;

    /// Will insert null value if pos=nullptr
    void insertData(const char * pos, size_t length) override;
    StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
    const char * deserializeAndInsertFromArena(const char * pos) override;
    const char * skipSerializedInArena(const char *) const override;
    void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
    void insert(const Field & x) override;
    void insertFrom(const IColumn & src, size_t n) override;
    void insertDefault() override;
    void insertManyDefaults(size_t length) override;

    void popBack(size_t n) override;
    ColumnPtr filter(const Filter & filt, ssize_t) const override;
    void expand(const Filter & mask, bool inverted) override;
    ColumnPtr permute(const Permutation & perm, size_t limit) const override;

    ColumnPtr index(const IColumn & indexes, size_t limit) const override;

    template <typename Type>
    ColumnPtr indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const;

    int compareAt(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint) const override;
    void compareColumn(const IColumn & rhs, size_t rhs_row_num,
        PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
        int direction, int nan_direction_hint) const override;

    int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int null_direction_hint, const Collator & collator) const override;
    bool hasEqualValues() const override;

    void getPermutationImpl(bool reverse, size_t limit, int null_direction_hint, Permutation & res, const Collator * collator) const;

    void getPermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override;
    void updatePermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_range) const override;
    void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override;
    void updatePermutationWithCollation(
        const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_range) const override;

    size_t byteSize() const override;
    size_t byteSizeAt(size_t n) const override;
    size_t allocatedBytes() const override;
    void protect() override;
    ColumnPtr replicate(const Offsets & replicate_offsets) const override;
    void updateHashWithValue(size_t n, SipHash & hash) const override;
    void updateWeakHash32(WeakHash32 & hash) const override;
    void updateHashFast(SipHash & hash) const override;
    void getExtremes(Field & min, Field & max) const override;

    void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override;
    double getRatioOfDefaultRows(double sample_ratio) const override;

    MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override;

    void gather(ColumnGathererStream & gatherer_stream) override;

    ColumnPtr compress() const override;

    void forEachSubcolumn(ColumnCallback callback) override;

    bool structureEquals(const IColumn & rhs) const override;

    bool isNullable() const override { return values->isNullable(); }
    bool isFixedAndContiguous() const override { return false; }
    bool valuesHaveFixedSize() const override { return values->valuesHaveFixedSize(); }
    size_t sizeOfValueIfFixed() const override { return values->sizeOfValueIfFixed() + values->sizeOfValueIfFixed(); }
    bool isCollationSupported() const override { return values->isCollationSupported(); }

    size_t getNumberOfDefaults() const { return _size - offsets->size(); }
    size_t getNumberOfTrailingDefaults() const
    {
        return offsets->empty() ? _size : _size - getOffsetsData().back() - 1;
    }

    /// Returns the position in the 'values' column
    /// that corresponds to the n-th element of the full column.
    /// O(log(offsets.size())) complexity.
    size_t getValueIndex(size_t n) const;

    const IColumn & getValuesColumn() const { return *values; }
    IColumn & getValuesColumn() { return *values; }

    const ColumnPtr & getValuesPtr() const { return values; }
    ColumnPtr & getValuesPtr() { return values; }

    const IColumn::Offsets & getOffsetsData() const;
    IColumn::Offsets & getOffsetsData();

    const ColumnPtr & getOffsetsPtr() const { return offsets; }
    ColumnPtr & getOffsetsPtr() { return offsets; }

    const IColumn & getOffsetsColumn() const { return *offsets; }
    IColumn & getOffsetsColumn() { return *offsets; }

    /// This class helps to iterate over all values in ColumnSparse.
    class Iterator
    {
    public:
        Iterator(const PaddedPODArray<UInt64> & offsets_, size_t size_, size_t current_offset_, size_t current_row_)
            : offsets(offsets_), size(size_), current_offset(current_offset_), current_row(current_row_)
        {
        }

        bool ALWAYS_INLINE isDefault() const { return current_offset == offsets.size() || current_row != offsets[current_offset]; }
        size_t ALWAYS_INLINE getValueIndex() const { return isDefault() ? 0 : current_offset + 1; }
        size_t ALWAYS_INLINE getCurrentRow() const { return current_row; }
        size_t ALWAYS_INLINE getCurrentOffset() const { return current_offset; }

        bool operator==(const Iterator & other) const
        {
            return size == other.size
                && current_offset == other.current_offset
                && current_row == other.current_row;
        }

        bool operator!=(const Iterator & other) const { return !(*this == other); }

        Iterator operator++()
        {
            if (!isDefault())
                ++current_offset;
            ++current_row;
            return *this;
        }

    private:
        const PaddedPODArray<UInt64> & offsets;
        const size_t size;
        size_t current_offset;
        size_t current_row;
    };

    Iterator begin() const { return Iterator(getOffsetsData(), _size, 0, 0); }
    Iterator end() const { return Iterator(getOffsetsData(), _size, getOffsetsData().size(), _size); }

private:
    using Inserter = std::function<void(IColumn &)>;

    /// Inserts a value into the 'values' column via callback.
    /// Properly handles the case when the inserted value is default.
    /// Used when it's unknown in advance whether the inserted value is default.
    void insertSingleValue(const Inserter & inserter);

    /// Contains the default value at position 0.
    /// It's convenient, because it allows to execute, e.g. functions or sorting,
    /// for this column without handling different cases.
    WrappedPtr values;

    /// Sorted offsets of non-default values in the full column.
    /// 'offsets[i]' corresponds to 'values[i + 1]'.
    WrappedPtr offsets;
    size_t _size;
};

ColumnPtr recursiveRemoveSparse(const ColumnPtr & column);

}
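
A sketch of how the Iterator above is typically driven (illustrative usage only, not code from the patch; dumpValueIndexes is a made-up helper):

#include <Columns/ColumnSparse.h>
#include <iostream>

/// Walk a ColumnSparse row by row; default rows resolve to value index 0,
/// non-default rows to their 1-based slot in the values column.
void dumpValueIndexes(const DB::ColumnSparse & sparse)
{
    for (auto it = sparse.begin(); it != sparse.end(); ++it)
        std::cerr << "row " << it.getCurrentRow()
                  << " -> values[" << it.getValueIndex() << "]\n";
}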

@ -107,6 +107,12 @@ public:
        return StringRef(&chars[offsetAt(n)], sizeAt(n));
    }

    bool isDefaultAt(size_t n) const override
    {
        assert(n < size());
        return sizeAt(n) == 1;
    }

    /// Suppress gcc 7.3.1 warning: '*((void*)&<anonymous> +8)' may be used uninitialized in this function
#if !defined(__clang__)
#pragma GCC diagnostic push
@ -278,6 +284,16 @@ public:
        return typeid(rhs) == typeid(ColumnString);
    }

    double getRatioOfDefaultRows(double sample_ratio) const override
    {
        return getRatioOfDefaultRowsImpl<ColumnString>(sample_ratio);
    }

    void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
    {
        return getIndicesOfNonDefaultRowsImpl<ColumnString>(indices, from, limit);
    }

    Chars & getChars() { return chars; }
    const Chars & getChars() const { return chars; }
@ -12,6 +12,7 @@
#include <base/sort.h>
#include <base/map.h>
#include <base/range.h>
#include <DataTypes/Serializations/SerializationInfoTuple.h>

namespace DB
@ -113,6 +114,15 @@ void ColumnTuple::get(size_t n, Field & res) const
    res = tuple;
}

bool ColumnTuple::isDefaultAt(size_t n) const
{
    const size_t tuple_size = columns.size();
    for (size_t i = 0; i < tuple_size; ++i)
        if (!columns[i]->isDefaultAt(n))
            return false;
    return true;
}

StringRef ColumnTuple::getDataAt(size_t) const
{
    throw Exception("Method getDataAt is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
@ -536,4 +546,25 @@ ColumnPtr ColumnTuple::compress() const
    });
}

double ColumnTuple::getRatioOfDefaultRows(double sample_ratio) const
{
    return getRatioOfDefaultRowsImpl<ColumnTuple>(sample_ratio);
}

void ColumnTuple::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const
{
    return getIndicesOfNonDefaultRowsImpl<ColumnTuple>(indices, from, limit);
}

SerializationInfoPtr ColumnTuple::getSerializationInfo() const
{
    MutableSerializationInfos infos;
    infos.reserve(columns.size());

    for (const auto & column : columns)
        infos.push_back(const_pointer_cast<SerializationInfo>(column->getSerializationInfo()));

    return std::make_shared<SerializationInfoTuple>(std::move(infos), SerializationInfo::Settings{});
}

}
@ -53,6 +53,7 @@ public:
    Field operator[](size_t n) const override;
    void get(size_t n, Field & res) const override;

    bool isDefaultAt(size_t n) const override;
    StringRef getDataAt(size_t n) const override;
    void insertData(const char * pos, size_t length) override;
    void insert(const Field & x) override;
@ -93,6 +94,9 @@ public:
    bool structureEquals(const IColumn & rhs) const override;
    bool isCollationSupported() const override;
    ColumnPtr compress() const override;
    double getRatioOfDefaultRows(double sample_ratio) const override;
    void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;
    SerializationInfoPtr getSerializationInfo() const override;

    size_t tupleSize() const { return columns.size(); }
@ -68,6 +68,7 @@ public:

    Field operator[](size_t n) const override { return (*getNestedColumn())[n]; }
    void get(size_t n, Field & res) const override { getNestedColumn()->get(n, res); }
    bool isDefaultAt(size_t n) const override { return n == 0; }
    StringRef getDataAt(size_t n) const override { return getNestedColumn()->getDataAt(n); }
    StringRef getDataAtWithTerminatingZero(size_t n) const override
    {
@ -122,6 +123,16 @@ public:
        return false;
    }

    double getRatioOfDefaultRows(double) const override
    {
        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'getRatioOfDefaultRows' not implemented for ColumnUnique");
    }

    void getIndicesOfNonDefaultRows(IColumn::Offsets &, size_t, size_t) const override
    {
        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'getIndicesOfNonDefaultRows' not implemented for ColumnUnique");
    }

    const UInt64 * tryGetSavedHash() const override { return reverse_index.tryGetSavedHash(); }

    UInt128 getHash() const override { return hash.getHash(*getRawColumnPtr()); }
@ -502,6 +502,24 @@ ColumnPtr ColumnVector<T>::compress() const
    });
}

template <typename T>
ColumnPtr ColumnVector<T>::createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const
{
    if (offsets.size() + shift != size())
        throw Exception(ErrorCodes::LOGICAL_ERROR,
            "Incompatible sizes of offsets ({}), shift ({}) and size of column {}", offsets.size(), shift, size());

    auto res = this->create();
    auto & res_data = res->getData();

    T default_value = safeGet<T>(default_field);
    res_data.resize_fill(total_rows, default_value);
    for (size_t i = 0; i < offsets.size(); ++i)
        res_data[offsets[i]] = data[i + shift];

    return res;
}

/// Explicit template instantiations - to avoid code bloat in headers.
template class ColumnVector<UInt8>;
template class ColumnVector<UInt16>;
@ -328,11 +328,25 @@ public:
        return StringRef(reinterpret_cast<const char *>(&data[n]), sizeof(data[n]));
    }

    bool isDefaultAt(size_t n) const override { return data[n] == T{}; }

    bool structureEquals(const IColumn & rhs) const override
    {
        return typeid(rhs) == typeid(ColumnVector<T>);
    }

    double getRatioOfDefaultRows(double sample_ratio) const override
    {
        return this->template getRatioOfDefaultRowsImpl<Self>(sample_ratio);
    }

    void getIndicesOfNonDefaultRows(IColumn::Offsets & indices, size_t from, size_t limit) const override
    {
        return this->template getIndicesOfNonDefaultRowsImpl<Self>(indices, from, limit);
    }

    ColumnPtr createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override;

    ColumnPtr compress() const override;

    /// Replace elements that match the filter with zeroes. If inverted replaces not matched elements.
@ -4,6 +4,7 @@
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnSparse.h>
#include <Core/ColumnWithTypeAndName.h>

@ -50,6 +51,9 @@ ConstantFilterDescription::ConstantFilterDescription(const IColumn & column)

FilterDescription::FilterDescription(const IColumn & column_)
{
    if (column_.isSparse())
        data_holder = recursiveRemoveSparse(column_.getPtr());

    if (column_.lowCardinality())
        data_holder = column_.convertToFullColumnIfLowCardinality();
@ -4,11 +4,17 @@
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnConst.h>
#include <Core/Field.h>
#include <DataTypes/Serializations/SerializationInfo.h>

namespace DB
{

namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
}

String IColumn::dumpStructure() const
{
    WriteBufferFromOwnString res;
@ -30,6 +36,39 @@ void IColumn::insertFrom(const IColumn & src, size_t n)
    insert(src[n]);
}

ColumnPtr IColumn::createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const
{
    if (offsets.size() + shift != size())
        throw Exception(ErrorCodes::LOGICAL_ERROR,
            "Incompatible sizes of offsets ({}), shift ({}) and size of column {}", offsets.size(), shift, size());

    auto res = cloneEmpty();
    res->reserve(total_rows);

    ssize_t current_offset = -1;
    for (size_t i = 0; i < offsets.size(); ++i)
    {
        ssize_t offsets_diff = static_cast<ssize_t>(offsets[i]) - current_offset;
        current_offset = offsets[i];

        if (offsets_diff > 1)
            res->insertMany(default_field, offsets_diff - 1);

        res->insertFrom(*this, i + shift);
    }

    ssize_t offsets_diff = static_cast<ssize_t>(total_rows) - current_offset;
    if (offsets_diff > 1)
        res->insertMany(default_field, offsets_diff - 1);

    return res;
}
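
A worked example of the expansion above (all numbers invented for illustration):

/// column = [5, 7], offsets = {2, 4}, default_field = 0, total_rows = 6, shift = 0
///
///   i = 0: offsets_diff = 3 -> insert 2 defaults, then 5 at row 2
///   i = 1: offsets_diff = 2 -> insert 1 default,  then 7 at row 4
///   tail:  offsets_diff = 2 -> insert 1 trailing default
///
/// result: [0, 0, 5, 0, 7, 0]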

SerializationInfoPtr IColumn::getSerializationInfo() const
{
    return std::make_shared<SerializationInfo>(ISerialization::getKind(*this), SerializationInfo::Settings{});
}

bool isColumnNullable(const IColumn & column)
{
    return checkColumn<ColumnNullable>(column);
@ -26,9 +26,8 @@ class ColumnGathererStream;
class Field;
class WeakHash32;

class ISerialization;
using SerializationPtr = std::shared_ptr<const ISerialization>;

class SerializationInfo;
using SerializationInfoPtr = std::shared_ptr<const SerializationInfo>;

/*
 * Represents a set of equal ranges in previous column to perform sorting in current column.
@ -64,9 +63,18 @@ public:
    virtual Ptr convertToFullColumnIfConst() const { return getPtr(); }

    /// If column isn't ColumnLowCardinality, return itself.
    /// If column is ColumnLowCardinality, transforms is to full column.
    /// If column is ColumnLowCardinality, transforms it to full column.
    virtual Ptr convertToFullColumnIfLowCardinality() const { return getPtr(); }

    /// If column isn't ColumnSparse, return itself.
    /// If column is ColumnSparse, transforms it to full column.
    virtual Ptr convertToFullColumnIfSparse() const { return getPtr(); }

    Ptr convertToFullIfNeeded() const
    {
        return convertToFullColumnIfSparse()->convertToFullColumnIfConst()->convertToFullColumnIfLowCardinality();
    }

    /// Creates empty column with the same type.
    virtual MutablePtr cloneEmpty() const { return cloneResized(0); }

@ -133,7 +141,7 @@ public:
        throw Exception("Method getInt is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
    }

    virtual bool isDefaultAt(size_t n) const { return get64(n) == 0; }
    virtual bool isDefaultAt(size_t n) const = 0;
    virtual bool isNullAt(size_t /*n*/) const { return false; }

    /** If column is numeric, return value of n-th element, casted to bool.
@ -173,6 +181,13 @@ public:
        insertFrom(src, position);
    }

    /// Appends one field multiple times. Can be optimized in inherited classes.
    virtual void insertMany(const Field & field, size_t length)
    {
        for (size_t i = 0; i < length; ++i)
            insert(field);
    }

    /// Appends data located in specified memory chunk if it is possible (throws an exception if it cannot be implemented).
    /// Is used to optimize some computations (in aggregation, for example).
    /// Parameter length could be ignored if column values have fixed size.
@ -375,6 +390,22 @@ public:
        throw Exception("Method structureEquals is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
    }

    /// Returns the ratio of values in the column that are equal to the default value of the column.
    /// Checks only a @sample_ratio fraction of rows.
    virtual double getRatioOfDefaultRows(double sample_ratio = 1.0) const = 0;

    /// Returns indices of values in the column that are not equal to the default value of the column.
    virtual void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const = 0;

    /// Returns a column with @total_rows elements.
    /// In the result column, values from the current column are at positions from @offsets.
    /// Other values are filled by @default_field.
    /// @shift means how many rows to skip from the beginning of the current column.
    /// Used to create a full column from a sparse one.
    virtual Ptr createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const;

    virtual SerializationInfoPtr getSerializationInfo() const;

    /// Compress column in memory to some representation that allows to decompress it back.
    /// Return itself if compression is not applicable for this column type.
    virtual Ptr compress() const
@ -457,6 +488,8 @@ public:

    virtual bool lowCardinality() const { return false; }

    virtual bool isSparse() const { return false; }

    virtual bool isCollationSupported() const { return false; }

    virtual ~IColumn() = default;
@ -468,7 +501,6 @@ public:
    String dumpStructure() const;

protected:
    /// Template is to devirtualize calls to insertFrom method.
    /// In derived classes (that use final keyword), implement scatter method as call to scatterImpl.
    template <typename Derived>
@ -489,6 +521,13 @@ protected:
    template <typename Derived>
    bool hasEqualValuesImpl() const;

    /// Template is to devirtualize calls to 'isDefaultAt' method.
    template <typename Derived>
    double getRatioOfDefaultRowsImpl(double sample_ratio) const;

    template <typename Derived>
    void getIndicesOfNonDefaultRowsImpl(Offsets & indices, size_t from, size_t limit) const;

    /// Uses std::sort and partial_sort as default algorithms.
    /// Implements 'less' and 'equals' via comparator.
    /// If 'less' and 'equals' can be implemented more optimal
@ -46,6 +46,7 @@ public:
    Field operator[](size_t) const override { throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED); }
    void get(size_t, Field &) const override { throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED); }
    void insert(const Field &) override { throw Exception("Cannot insert element into " + getName(), ErrorCodes::NOT_IMPLEMENTED); }
    bool isDefaultAt(size_t) const override { throw Exception("isDefaultAt is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED); }

    StringRef getDataAt(size_t) const override
    {
@ -161,6 +162,16 @@ public:
        return res;
    }

    double getRatioOfDefaultRows(double) const override
    {
        throw Exception("Method getRatioOfDefaultRows is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
    }

    void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override
    {
        throw Exception("Method getIndicesOfNonDefaultRows is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
    }

    void gather(ColumnGathererStream &) override
    {
        throw Exception("Method gather is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
@ -16,6 +16,7 @@ namespace DB
namespace ErrorCodes
{
    extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
    extern const int LOGICAL_ERROR;
}

template <typename Derived>
@ -141,6 +142,56 @@ bool IColumn::hasEqualValuesImpl() const
    return true;
}

template <typename Derived>
double IColumn::getRatioOfDefaultRowsImpl(double sample_ratio) const
{
    if (sample_ratio <= 0.0 || sample_ratio > 1.0)
        throw Exception(ErrorCodes::LOGICAL_ERROR,
            "Value of 'sample_ratio' must be in interval (0.0; 1.0], but got: {}", sample_ratio);

    /// Randomize a little to avoid boundary effects.
    std::uniform_int_distribution<size_t> dist(1, static_cast<size_t>(1.0 / sample_ratio));

    size_t num_rows = size();
    size_t num_sampled_rows = static_cast<size_t>(num_rows * sample_ratio);
    size_t num_checked_rows = dist(thread_local_rng);
    num_sampled_rows = std::min(num_sampled_rows + dist(thread_local_rng), num_rows);
    size_t res = 0;

    if (num_sampled_rows == num_rows)
    {
        for (size_t i = 0; i < num_rows; ++i)
            res += static_cast<const Derived &>(*this).isDefaultAt(i);
        num_checked_rows = num_rows;
    }
    else if (num_sampled_rows != 0)
    {
        for (size_t i = num_checked_rows; i < num_rows; ++i)
        {
            if (num_checked_rows * num_rows <= i * num_sampled_rows)
            {
                res += static_cast<const Derived &>(*this).isDefaultAt(i);
                ++num_checked_rows;
            }
        }
    }

    return static_cast<double>(res) / num_checked_rows;
}
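
The sampling loop above spaces the checked rows evenly: the invariant num_checked_rows * num_rows <= i * num_sampled_rows admits row i only while the fraction of rows checked so far lags the requested sample fraction. The same spacing rule in isolation (a self-contained sketch, not code from the patch):

#include <cstddef>
#include <vector>

/// Sketch: pick ~sample_size evenly spaced indexes out of n, using the same
/// proportionality test as getRatioOfDefaultRowsImpl.
std::vector<size_t> evenSample(size_t n, size_t sample_size)
{
    std::vector<size_t> picked;
    size_t checked = 0;
    for (size_t i = 0; i < n; ++i)
    {
        if (checked * n <= i * sample_size)
        {
            picked.push_back(i);
            ++checked;
        }
    }
    return picked; /// e.g. evenSample(10, 5) -> {0, 2, 4, 6, 8}
}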

template <typename Derived>
void IColumn::getIndicesOfNonDefaultRowsImpl(Offsets & indices, size_t from, size_t limit) const
{
    size_t to = limit && from + limit < size() ? from + limit : size();
    indices.reserve(indices.size() + to - from);

    for (size_t i = from; i < to; ++i)
    {
        if (!static_cast<const Derived &>(*this).isDefaultAt(i))
            indices.push_back(i);
    }
}

template <typename Comparator>
void IColumn::updatePermutationImpl(
    size_t limit,

327 src/Columns/tests/gtest_column_sparse.cpp Normal file
@ -0,0 +1,327 @@
#include <Columns/ColumnSparse.h>
#include <Columns/ColumnsNumber.h>

#include <Common/randomSeed.h>
#include <pcg_random.hpp>
#include <gtest/gtest.h>

#include <algorithm>
#include <numeric>

#include <Common/FieldVisitors.h>

using namespace DB;
pcg64 rng(randomSeed());

std::pair<MutableColumnPtr, MutableColumnPtr> createColumns(size_t n, size_t k)
{
    auto values = ColumnVector<UInt64>::create();
    auto offsets = ColumnVector<UInt64>::create();
    auto full = ColumnVector<UInt64>::create();

    auto & values_data = values->getData();
    auto & offsets_data = offsets->getData();
    auto & full_data = full->getData();

    values_data.push_back(0);

    for (size_t i = 0; i < n; ++i)
    {
        bool not_zero = rng() % k == 0;
        size_t value = not_zero ? rng() % 1000000 : 0;
        full_data.push_back(value);

        if (not_zero)
        {
            values_data.push_back(value);
            offsets_data.push_back(i);
        }
    }

    auto sparse = ColumnSparse::create(std::move(values), std::move(offsets), n);
    return std::make_pair(std::move(sparse), std::move(full));
}

bool checkEquals(const IColumn & lhs, const IColumn & rhs)
{
    if (lhs.size() != rhs.size())
        return false;

    for (size_t i = 0; i < lhs.size(); ++i)
        if (lhs.compareAt(i, i, rhs, 0) != 0)
            return false;

    return true;
}

// Can't use ErrorCodes, because of 'using namespace DB'.
constexpr int error_code = 12345;

constexpr size_t T = 5000;
constexpr size_t MAX_ROWS = 10000;
constexpr size_t sparse_ratios[] = {1, 2, 5, 10, 32, 50, 64, 100, 256, 500, 1000, 5000, 10000};
constexpr size_t K = sizeof(sparse_ratios) / sizeof(sparse_ratios[0]);

#define DUMP_COLUMN(column) std::cerr << #column << ": " << (column)->dumpStructure() << "\n"

TEST(ColumnSparse, InsertRangeFrom)
{
    auto test_case = [&](size_t n1, size_t k1, size_t n2, size_t k2, size_t from, size_t len)
    {
        auto [sparse_dst, full_dst] = createColumns(n1, k1);
        auto [sparse_src, full_src] = createColumns(n2, k2);

        sparse_dst->insertRangeFrom(*sparse_src, from, len);
        full_dst->insertRangeFrom(*full_src, from, len);

        if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst))
        {
            DUMP_COLUMN(sparse_src);
            DUMP_COLUMN(full_src);
            DUMP_COLUMN(sparse_dst);
            DUMP_COLUMN(full_dst);
            throw Exception(error_code, "Columns are unequal");
        }
    };

    try
    {
        for (size_t i = 0; i < T; ++i)
        {
            size_t n1 = rng() % MAX_ROWS + 1;
            size_t k1 = sparse_ratios[rng() % K];

            size_t n2 = rng() % MAX_ROWS + 1;
            size_t k2 = sparse_ratios[rng() % K];

            size_t from = rng() % n2;
            size_t to = rng() % n2;

            if (from > to)
                std::swap(from, to);

            test_case(n1, k1, n2, k2, from, to - from);
        }
    }
    catch (const Exception & e)
    {
        FAIL() << e.displayText();
    }
}

TEST(ColumnSparse, PopBack)
{
    auto test_case = [&](size_t n, size_t k, size_t m)
    {
        auto [sparse_dst, full_dst] = createColumns(n, k);

        sparse_dst->popBack(m);
        full_dst->popBack(m);

        if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst))
        {
            DUMP_COLUMN(sparse_dst);
            DUMP_COLUMN(full_dst);
            throw Exception(error_code, "Columns are unequal");
        }
    };

    try
    {
        for (size_t i = 0; i < T; ++i)
        {
            size_t n = rng() % MAX_ROWS + 1;
            size_t k = sparse_ratios[rng() % K];
            size_t m = rng() % n;

            test_case(n, k, m);
        }
    }
    catch (const Exception & e)
    {
        FAIL() << e.displayText();
    }
}

TEST(ColumnSparse, Filter)
{
    auto test_case = [&](size_t n, size_t k, size_t m)
    {
        auto [sparse_src, full_src] = createColumns(n, k);

        PaddedPODArray<UInt8> filt(n);
        for (size_t i = 0; i < n; ++i)
            filt[i] = rng() % m == 0;

        auto sparse_dst = sparse_src->filter(filt, -1);
        auto full_dst = full_src->filter(filt, -1);

        if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst))
        {
            DUMP_COLUMN(sparse_src);
            DUMP_COLUMN(full_src);
            DUMP_COLUMN(sparse_dst);
            DUMP_COLUMN(full_dst);
            throw Exception(error_code, "Columns are unequal");
        }
    };

    try
    {
        for (size_t i = 0; i < T; ++i)
        {
            size_t n = rng() % MAX_ROWS + 1;
            size_t k = sparse_ratios[rng() % K];
            size_t m = sparse_ratios[rng() % K];

            test_case(n, k, m);
        }
    }
    catch (const Exception & e)
    {
        FAIL() << e.displayText();
    }
}

TEST(ColumnSparse, Permute)
{
    auto test_case = [&](size_t n, size_t k, size_t limit)
    {
        auto [sparse_src, full_src] = createColumns(n, k);

        IColumn::Permutation perm(n);
        std::iota(perm.begin(), perm.end(), 0);
        std::shuffle(perm.begin(), perm.end(), rng);

        auto sparse_dst = sparse_src->permute(perm, limit);
        auto full_dst = full_src->permute(perm, limit);

        if (limit)
        {
            sparse_dst = sparse_dst->cut(0, limit);
            full_dst = full_dst->cut(0, limit);
        }

        if (!checkEquals(*sparse_dst->convertToFullColumnIfSparse(), *full_dst))
        {
            DUMP_COLUMN(sparse_src);
            DUMP_COLUMN(full_src);
            DUMP_COLUMN(sparse_dst);
            DUMP_COLUMN(full_dst);
            throw Exception(error_code, "Columns are unequal");
        }
    };

    try
    {
        for (size_t i = 0; i < T; ++i)
        {
            size_t n = rng() % MAX_ROWS + 1;
            size_t k = sparse_ratios[rng() % K];
            size_t limit = rng() % 2 ? 0 : rng() % n;

            test_case(n, k, limit);
        }
    }
    catch (const Exception & e)
    {
        FAIL() << e.displayText();
    }
}

TEST(ColumnSparse, CompareColumn)
{
    auto test_case = [&](size_t n1, size_t k1, size_t n2, size_t k2, size_t row_num)
    {
        auto [sparse_src1, full_src1] = createColumns(n1, k1);
        auto [sparse_src2, full_src2] = createColumns(n2, k2);

        PaddedPODArray<Int8> comp_sparse;
        PaddedPODArray<Int8> comp_full;

        sparse_src1->compareColumn(*sparse_src2, row_num, nullptr, comp_sparse, 1, 1);
        full_src1->compareColumn(*full_src2, row_num, nullptr, comp_full, 1, 1);

        if (comp_sparse != comp_full)
        {
            DUMP_COLUMN(sparse_src1);
            DUMP_COLUMN(full_src1);
            DUMP_COLUMN(sparse_src2);
            DUMP_COLUMN(full_src2);
            throw Exception(error_code, "Compare results are unequal");
        }
    };

    try
    {
        for (size_t i = 0; i < T; ++i)
        {
            size_t n1 = rng() % MAX_ROWS + 1;
            size_t k1 = sparse_ratios[rng() % K];

            size_t n2 = rng() % MAX_ROWS + 1;
            size_t k2 = sparse_ratios[rng() % K];

            size_t row_num = rng() % n2;

            test_case(n1, k1, n2, k2, row_num);
        }
    }
    catch (const Exception & e)
    {
        FAIL() << e.displayText();
    }
}

TEST(ColumnSparse, GetPermutation)
{
    auto test_case = [&](size_t n, size_t k, size_t limit, bool reverse)
    {
        auto [sparse_src, full_src] = createColumns(n, k);

        IColumn::Permutation perm_sparse;
        IColumn::Permutation perm_full;

        sparse_src->getPermutation(reverse, limit, 1, perm_sparse);
        full_src->getPermutation(reverse, limit, 1, perm_full);

        auto sparse_sorted = sparse_src->permute(perm_sparse, limit);
        auto full_sorted = full_src->permute(perm_full, limit);

        if (limit)
        {
            sparse_sorted = sparse_sorted->cut(0, limit);
            full_sorted = full_sorted->cut(0, limit);
        }

        if (!checkEquals(*sparse_sorted->convertToFullColumnIfSparse(), *full_sorted))
        {
            DUMP_COLUMN(sparse_src);
            DUMP_COLUMN(full_src);
            DUMP_COLUMN(sparse_sorted);
            DUMP_COLUMN(full_sorted);
            throw Exception(error_code, "Sorted columns are unequal");
        }
    };

    try
    {
        for (size_t i = 0; i < T; ++i)
        {
            size_t n = rng() % MAX_ROWS + 1;
            size_t k = sparse_ratios[rng() % K];

            size_t limit = rng() % 2 ? 0 : rng() % n;
            bool reverse = rng() % 2;

            test_case(n, k, limit, reverse);
        }
    }
    catch (const Exception & e)
    {
        FAIL() << e.displayText();
    }
}

#undef DUMP_COLUMN
#undef DUMP_NON_DEFAULTS
@ -470,7 +470,7 @@
    M(497, ACCESS_DENIED) \
    M(498, LIMIT_BY_WITH_TIES_IS_NOT_SUPPORTED) \
    M(499, S3_ERROR) \
    M(500, BLOB_STORAGE_ERROR) \
    M(500, AZURE_BLOB_STORAGE_ERROR) \
    M(501, CANNOT_CREATE_DATABASE) \
    M(502, CANNOT_SIGQUEUE) \
    M(503, AGGREGATE_FUNCTION_THROW) \
@ -117,7 +117,7 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST(
    };

    ISerialization::SubstreamPath path;
    column_type->getDefaultSerialization()->enumerateStreams(path, callback, column_type, nullptr);
    column_type->getDefaultSerialization()->enumerateStreams(path, callback, column_type);

    if (!result_codec)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find any substream with data type for type {}. It's a bug", column_type->getName());
@@ -55,7 +55,7 @@ namespace
        return "/";
    }

    void writeNode(const KeeperStorage::Node & node, WriteBuffer & out)
    void writeNode(const KeeperStorage::Node & node, SnapshotVersion version, WriteBuffer & out)
    {
        writeBinary(node.data, out);

@@ -76,6 +76,11 @@ namespace
        writeBinary(node.stat.pzxid, out);

        writeBinary(node.seq_num, out);

        if (version >= SnapshotVersion::V4)
        {
            writeBinary(node.size_bytes, out);
        }
    }

    void readNode(KeeperStorage::Node & node, ReadBuffer & in, SnapshotVersion version, ACLMap & acl_map)
@@ -124,6 +129,11 @@ namespace
        readBinary(node.stat.numChildren, in);
        readBinary(node.stat.pzxid, in);
        readBinary(node.seq_num, in);

        if (version >= SnapshotVersion::V4)
        {
            readBinary(node.size_bytes, in);
        }
    }

    void serializeSnapshotMetadata(const SnapshotMetadataPtr & snapshot_meta, WriteBuffer & out)
@@ -176,7 +186,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to serialize node with mzxid {}, but last snapshot index {}", node.stat.mzxid, snapshot.snapshot_meta->get_last_log_idx());

        writeBinary(path, out);
        writeNode(node, out);
        writeNode(node, snapshot.version, out);

        /// Last iteration: check and exit here without iterator increment. Otherwise
        /// false positive race condition on list end is possible.
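The hunks above gate the new size_bytes field on the snapshot version at both write and read time, which is what keeps pre-V4 snapshots readable after the format grows. A minimal standalone sketch of the same pattern (plain iostreams stand in for WriteBuffer/ReadBuffer; this is an illustration, not the Keeper API itself):

#include <cstdint>
#include <istream>
#include <ostream>

enum class SnapshotVersion : uint8_t { V3 = 3, V4 = 4 };

template <typename T>
void writeBinary(const T & value, std::ostream & out)
{
    out.write(reinterpret_cast<const char *>(&value), sizeof(value));
}

template <typename T>
void readBinary(T & value, std::istream & in)
{
    in.read(reinterpret_cast<char *>(&value), sizeof(value));
}

struct Node { int32_t seq_num = 0; uint64_t size_bytes = 0; };

void writeNode(const Node & node, SnapshotVersion version, std::ostream & out)
{
    writeBinary(node.seq_num, out);
    if (version >= SnapshotVersion::V4) /// The new field is written only into V4+ snapshots.
        writeBinary(node.size_bytes, out);
}

void readNode(Node & node, SnapshotVersion version, std::istream & in)
{
    readBinary(node.seq_num, in);
    if (version >= SnapshotVersion::V4) /// Old snapshots simply lack the field; the default value stays.
        readBinary(node.size_bytes, in);
}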
@@ -18,9 +18,10 @@ enum SnapshotVersion : uint8_t
    V1 = 1, /// with ACL map
    V2 = 2, /// with 64 bit buffer header
    V3 = 3, /// compress snapshots with ZSTD codec
    V4 = 4, /// add Node size to snapshots
};

static constexpr auto CURRENT_SNAPSHOT_VERSION = SnapshotVersion::V3;
static constexpr auto CURRENT_SNAPSHOT_VERSION = SnapshotVersion::V4;

/// What is stored in binary snapshot
struct SnapshotDeserializationResult
@@ -133,21 +133,6 @@ static bool fixupACL(
    return valid_found;
}

uint64_t KeeperStorage::Node::sizeInBytes() const
{
    uint64_t total_size{0};
    for (const auto & child : children)
        total_size += child.size();

    total_size += data.size();

    total_size += sizeof(acl_id);
    total_size += sizeof(is_sequental);
    total_size += sizeof(stat);
    total_size += sizeof(seq_num);
    return total_size;
}

static KeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, KeeperStorage::Watches & watches, KeeperStorage::Watches & list_watches, Coordination::Event event_type)
{
    KeeperStorage::ResponsesForSessions result;
@@ -354,6 +339,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
        {

            parent.children.insert(child_path);
            parent.size_bytes += child_path.size();
            prev_parent_cversion = parent.stat.cversion;
            prev_parent_zxid = parent.stat.pzxid;

@@ -391,6 +377,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
            undo_parent.stat.cversion = prev_parent_cversion;
            undo_parent.stat.pzxid = prev_parent_zxid;
            undo_parent.children.erase(child_path);
            undo_parent.size_bytes -= child_path.size();
        });
    };

@@ -524,6 +511,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
            --parent.stat.numChildren;
            ++parent.stat.cversion;
            parent.children.erase(child_basename);
            parent.size_bytes -= child_basename.size();
        });

        response.error = Coordination::Error::ZOK;
@@ -543,6 +531,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
            ++parent.stat.numChildren;
            --parent.stat.cversion;
            parent.children.insert(child_basename);
            parent.size_bytes += child_basename.size();
        });
    };
}
@@ -621,11 +610,11 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce

        auto itr = container.updateValue(request.path, [zxid, request] (KeeperStorage::Node & value)
        {
            value.data = request.data;
            value.stat.version++;
            value.stat.mzxid = zxid;
            value.stat.mtime = std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1);
            value.stat.dataLength = request.data.length();
            value.size_bytes = value.size_bytes + request.data.size() - value.data.size();
            value.data = request.data;
        });

@@ -1110,6 +1099,7 @@ KeeperStorage::ResponsesForSessions KeeperStorage::processRequest(const Coordina
            --parent.stat.numChildren;
            ++parent.stat.cversion;
            parent.children.erase(getBaseName(ephemeral_path));
            parent.size_bytes -= getBaseName(ephemeral_path).size();
        });

        auto responses = processWatchesImpl(ephemeral_path, watches, list_watches, Coordination::Event::DELETED);
@@ -35,9 +35,22 @@ public:
        Coordination::Stat stat{};
        int32_t seq_num = 0;
        ChildrenSet children{};
        uint64_t size_bytes; /// Cached size, to avoid recalculating it on every query

        Node()
        {
            size_bytes = sizeof(size_bytes);
            size_bytes += data.size();
            size_bytes += sizeof(acl_id);
            size_bytes += sizeof(is_sequental);
            size_bytes += sizeof(stat);
            size_bytes += sizeof(seq_num);
        }
        /// Object memory size
        uint64_t sizeInBytes() const;
        uint64_t sizeInBytes() const
        {
            return size_bytes;
        }
    };

    struct ResponseForSession
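Seeding size_bytes in the constructor and then adjusting it by a delta at every mutation (see the create/remove/set hunks above) turns sizeInBytes() from an O(children) walk into an O(1) read, at the price of having to touch the counter everywhere the node changes. A simplified self-contained sketch of that bookkeeping, with a string payload standing in for the real node state:

#include <cstdint>
#include <set>
#include <string>

/// Sketch: a node that keeps its approximate byte size up to date
/// incrementally instead of recomputing it on every query.
struct Node
{
    std::string data;
    std::set<std::string> children;
    uint64_t size_bytes = sizeof(uint64_t); /// account for the counter itself

    void setData(std::string new_data)
    {
        /// Adjust by the delta, like the updateValue lambda in the diff.
        size_bytes += new_data.size();
        size_bytes -= data.size();
        data = std::move(new_data);
    }

    void addChild(const std::string & name)
    {
        if (children.insert(name).second)
            size_bytes += name.size();
    }

    void removeChild(const std::string & name)
    {
        if (children.erase(name))
            size_bytes -= name.size();
    }

    uint64_t sizeInBytes() const { return size_bytes; } /// O(1) now
};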
@@ -977,24 +977,24 @@ TEST_P(CoordinationTest, SnapshotableHashMapDataSize)

    world.disableSnapshotMode();
    world.insert("world", n1);
    EXPECT_EQ(world.getApproximateDataSize(), 94);
    EXPECT_EQ(world.getApproximateDataSize(), 98);
    world.updateValue("world", [&](Node & value) { value = n2; });
    EXPECT_EQ(world.getApproximateDataSize(), 96);
    EXPECT_EQ(world.getApproximateDataSize(), 98);

    world.erase("world");
    EXPECT_EQ(world.getApproximateDataSize(), 0);

    world.enableSnapshotMode();
    world.insert("world", n1);
    EXPECT_EQ(world.getApproximateDataSize(), 94);
    EXPECT_EQ(world.getApproximateDataSize(), 98);
    world.updateValue("world", [&](Node & value) { value = n2; });
    EXPECT_EQ(world.getApproximateDataSize(), 190);
    EXPECT_EQ(world.getApproximateDataSize(), 196);

    world.clearOutdatedNodes();
    EXPECT_EQ(world.getApproximateDataSize(), 96);
    EXPECT_EQ(world.getApproximateDataSize(), 98);

    world.erase("world");
    EXPECT_EQ(world.getApproximateDataSize(), 96);
    EXPECT_EQ(world.getApproximateDataSize(), 98);

    world.clear();
    EXPECT_EQ(world.getApproximateDataSize(), 0);
@@ -9,6 +9,7 @@
#include <Common/assert_cast.h>

#include <Columns/ColumnConst.h>
#include <Columns/ColumnSparse.h>

#include <iterator>

@@ -37,7 +38,7 @@ static ReturnType onError(const std::string & message [[maybe_unused]], int code

template <typename ReturnType>
static ReturnType checkColumnStructure(const ColumnWithTypeAndName & actual, const ColumnWithTypeAndName & expected,
    const std::string & context_description, bool allow_remove_constants, int code)
    const std::string & context_description, bool allow_materialize, int code)
{
    if (actual.name != expected.name)
        return onError<ReturnType>("Block structure mismatch in " + context_description + " stream: different names of columns:\n"
@@ -52,11 +53,16 @@ static ReturnType checkColumnStructure(const ColumnWithTypeAndName & actual, con

    const IColumn * actual_column = actual.column.get();

    /// If we allow to remove constants, and expected column is not const, then unwrap actual constant column.
    if (allow_remove_constants && !isColumnConst(*expected.column))
    /// If we allow to materialize, and expected column is not const or sparse, then unwrap actual column.
    if (allow_materialize)
    {
        if (const auto * column_const = typeid_cast<const ColumnConst *>(actual_column))
            actual_column = &column_const->getDataColumn();
        if (!isColumnConst(*expected.column))
            if (const auto * column_const = typeid_cast<const ColumnConst *>(actual_column))
                actual_column = &column_const->getDataColumn();

        if (!expected.column->isSparse())
            if (const auto * column_sparse = typeid_cast<const ColumnSparse *>(actual_column))
                actual_column = &column_sparse->getValuesColumn();
    }

    if (actual_column->getName() != expected.column->getName())
@@ -79,7 +85,7 @@ static ReturnType checkColumnStructure(const ColumnWithTypeAndName & actual, con


template <typename ReturnType>
static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, const std::string & context_description, bool allow_remove_constants)
static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, const std::string & context_description, bool allow_materialize)
{
    size_t columns = rhs.columns();
    if (lhs.columns() != columns)
@@ -93,11 +99,11 @@ static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, cons

        if constexpr (std::is_same_v<ReturnType, bool>)
        {
            if (!checkColumnStructure<ReturnType>(actual, expected, context_description, allow_remove_constants, ErrorCodes::LOGICAL_ERROR))
            if (!checkColumnStructure<ReturnType>(actual, expected, context_description, allow_materialize, ErrorCodes::LOGICAL_ERROR))
                return false;
        }
        else
            checkColumnStructure<ReturnType>(actual, expected, context_description, allow_remove_constants, ErrorCodes::LOGICAL_ERROR);
            checkColumnStructure<ReturnType>(actual, expected, context_description, allow_materialize, ErrorCodes::LOGICAL_ERROR);
    }

    return ReturnType(true);
@@ -706,6 +712,11 @@ void Block::updateHash(SipHash & hash) const
        col.column->updateHashWithValue(row_no, hash);
}

void convertToFullIfSparse(Block & block)
{
    for (auto & column : block)
        column.column = recursiveRemoveSparse(column.column);
}

ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & column)
{
@@ -729,7 +740,7 @@ Block materializeBlock(const Block & block)
    for (size_t i = 0; i < columns; ++i)
    {
        auto & element = res.getByPosition(i);
        element.column = element.column->convertToFullColumnIfConst();
        element.column = recursiveRemoveSparse(element.column->convertToFullColumnIfConst());
    }

    return res;
@@ -738,7 +749,7 @@ Block materializeBlock(const Block & block)
void materializeBlockInplace(Block & block)
{
    for (size_t i = 0; i < block.columns(); ++i)
        block.getByPosition(i).column = block.getByPosition(i).column->convertToFullColumnIfConst();
        block.getByPosition(i).column = recursiveRemoveSparse(block.getByPosition(i).column->convertToFullColumnIfConst());
}

}
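In short, materializing a column is now a two-step normalization: expand a possible ColumnConst wrapper first, then strip sparse wrappers recursively. A one-function sketch of the per-column transformation used by materializeBlock and materializeBlockInplace above (materializeColumn is a hypothetical helper name; the two called functions are the ones shown in this diff):

ColumnPtr materializeColumn(const ColumnPtr & column)
{
    /// Order matters: a constant must be expanded to a full column before
    /// nested sparse columns can be flattened.
    return recursiveRemoveSparse(column->convertToFullColumnIfConst());
}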
@@ -193,6 +193,8 @@ void assertCompatibleHeader(const Block & actual, const Block & desired, const s
/// Calculate difference in structure of blocks and write description into output strings. NOTE It doesn't compare values of constant columns.
void getBlocksDifference(const Block & lhs, const Block & rhs, std::string & out_lhs_diff, std::string & out_rhs_diff);

void convertToFullIfSparse(Block & block);

/// Helps in-memory storages to extract columns from block.
/// Properly handles cases when the column is a subcolumn or is compressed.
ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & column);
@@ -8,5 +8,6 @@ namespace DB
{

using ColumnNumbers = std::vector<size_t>;
using ColumnNumbersList = std::vector<ColumnNumbers>;

}
@@ -7,6 +7,7 @@
#include <IO/WriteHelpers.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>


namespace DB
@@ -43,6 +44,17 @@ String NameAndTypePair::getSubcolumnName() const
    return name.substr(*subcolumn_delimiter_position + 1, name.size() - *subcolumn_delimiter_position);
}

String NameAndTypePair::dump() const
{
    WriteBufferFromOwnString out;
    out << "name: " << name << "\n"
        << "type: " << type->getName() << "\n"
        << "name in storage: " << getNameInStorage() << "\n"
        << "type in storage: " << getTypeInStorage()->getName();

    return out.str();
}

void NamesAndTypesList::readText(ReadBuffer & buf)
{
    const DataTypeFactory & data_type_factory = DataTypeFactory::instance();
@@ -40,6 +40,8 @@ public:
        return name == rhs.name && type->equals(*rhs.type);
    }

    String dump() const;

    String name;
    DataTypePtr type;

@@ -107,6 +109,8 @@ public:
    std::optional<NameAndTypePair> tryGetByName(const std::string & name) const;
};

using NamesAndTypesLists = std::vector<NamesAndTypesList>;

}

namespace std
@@ -44,6 +44,8 @@

#define DBMS_MIN_PROTOCOL_VERSION_WITH_INCREMENTAL_PROFILE_EVENTS 54451

#define DBMS_MIN_REVISION_WITH_CUSTOM_SERIALIZATION 54454

/// Version of ClickHouse TCP protocol.
///
/// Should be incremented manually on protocol changes.
@@ -51,7 +53,6 @@
/// NOTE: DBMS_TCP_PROTOCOL_VERSION has nothing in common with VERSION_REVISION;
/// the latter is just a number for server version (one number instead of commit SHA)
/// for simplicity (sometimes it may be more convenient in some use cases).

#define DBMS_TCP_PROTOCOL_VERSION 54453
#define DBMS_TCP_PROTOCOL_VERSION 54454

#define DBMS_MIN_PROTOCOL_VERSION_WITH_INITIAL_QUERY_START_TIME 54449
@@ -496,8 +496,12 @@ class IColumn;
    M(Bool, database_replicated_always_detach_permanently, false, "Execute DETACH TABLE as DETACH TABLE PERMANENTLY if database engine is Replicated", 0) \
    M(DistributedDDLOutputMode, distributed_ddl_output_mode, DistributedDDLOutputMode::THROW, "Format of distributed DDL query result", 0) \
    M(UInt64, distributed_ddl_entry_format_version, 1, "Version of DDL entry to write into ZooKeeper", 0) \
    \
    M(UInt64, external_storage_max_read_rows, 0, "Limit maximum number of rows when table with external engine should flush history data. Now supported only for MySQL table engine, database engine, dictionary and MaterializedMySQL. If equal to 0, this setting is disabled", 0) \
    M(UInt64, external_storage_max_read_bytes, 0, "Limit maximum number of bytes when table with external engine should flush history data. Now supported only for MySQL table engine, database engine, dictionary and MaterializedMySQL. If equal to 0, this setting is disabled", 0) \
    M(UInt64, external_storage_connect_timeout_sec, DBMS_DEFAULT_CONNECT_TIMEOUT_SEC, "Connect timeout in seconds. Now supported only for MySQL", 0) \
    M(UInt64, external_storage_rw_timeout_sec, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC, "Read/write timeout in seconds. Now supported only for MySQL", 0) \
    \
    M(UnionMode, union_default_mode, UnionMode::Unspecified, "Set default Union Mode in SelectWithUnion query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, query without Union Mode will throw exception.", 0) \
    M(Bool, optimize_aggregators_of_group_by_keys, true, "Eliminates min/max/any/anyLast aggregators of GROUP BY keys in SELECT section", 0) \
    M(Bool, optimize_group_by_function_keys, true, "Eliminates functions of other keys in GROUP BY section", 0) \
@@ -66,6 +66,7 @@ public:
    bool shouldAlignRightInPrettyFormats() const override { return false; }

    SerializationPtr doGetDefaultSerialization() const override;
    bool supportsSparseSerialization() const override { return false; }

    bool isVersioned() const { return function->isVersioned(); }

@@ -51,6 +51,7 @@ public:
    bool isNullable() const override { return false; }
    bool onlyNull() const override { return false; }
    bool lowCardinality() const override { return true; }
    bool supportsSparseSerialization() const override { return false; }
    bool isLowCardinalityNullable() const override { return dictionary_type->isNullable(); }

    static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type);
@@ -6,8 +6,10 @@
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/Serializations/SerializationInfo.h>
#include <DataTypes/Serializations/SerializationTuple.h>
#include <DataTypes/Serializations/SerializationNamed.h>
#include <DataTypes/Serializations/SerializationInfoTuple.h>
#include <DataTypes/NestedUtils.h>
#include <Parsers/IAST.h>
#include <Parsers/ASTNameTypePair.h>
@@ -152,6 +154,20 @@ MutableColumnPtr DataTypeTuple::createColumn() const
    return ColumnTuple::create(std::move(tuple_columns));
}

MutableColumnPtr DataTypeTuple::createColumn(const ISerialization & serialization) const
{
    const auto & element_serializations =
        assert_cast<const SerializationTuple &>(serialization).getElementsSerializations();

    size_t size = elems.size();
    assert(element_serializations.size() == size);
    MutableColumns tuple_columns(size);
    for (size_t i = 0; i < size; ++i)
        tuple_columns[i] = elems[i]->createColumn(*element_serializations[i]->getNested());

    return ColumnTuple::create(std::move(tuple_columns));
}

Field DataTypeTuple::getDefault() const
{
    return Tuple(collections::map<Tuple>(elems, [] (const DataTypePtr & elem) { return elem->getDefault(); }));
@@ -248,21 +264,33 @@ SerializationPtr DataTypeTuple::doGetDefaultSerialization() const
    return std::make_shared<SerializationTuple>(std::move(serializations), use_explicit_names);
}

SerializationPtr DataTypeTuple::getSerialization(const String & column_name, const StreamExistenceCallback & callback) const
SerializationPtr DataTypeTuple::getSerialization(const SerializationInfo & info) const
{
    SerializationTuple::ElementSerializations serializations(elems.size());
    const auto & info_tuple = assert_cast<const SerializationInfoTuple &>(info);
    bool use_explicit_names = have_explicit_names && serialize_names;

    for (size_t i = 0; i < elems.size(); ++i)
    {
        String elem_name = use_explicit_names ? names[i] : toString(i + 1);
        auto subcolumn_name = Nested::concatenateName(column_name, elem_name);
        auto serializaion = elems[i]->getSerialization(subcolumn_name, callback);
        serializations[i] = std::make_shared<SerializationNamed>(serializaion, elem_name);
        auto serialization = elems[i]->getSerialization(*info_tuple.getElementInfo(i));
        serializations[i] = std::make_shared<SerializationNamed>(serialization, elem_name);
    }

    return std::make_shared<SerializationTuple>(std::move(serializations), use_explicit_names);
}

MutableSerializationInfoPtr DataTypeTuple::createSerializationInfo(const SerializationInfo::Settings & settings) const
{
    MutableSerializationInfos infos;
    infos.reserve(elems.size());
    for (const auto & elem : elems)
        infos.push_back(elem->createSerializationInfo(settings));

    return std::make_shared<SerializationInfoTuple>(std::move(infos), settings);
}


static DataTypePtr create(const ASTPtr & arguments)
{
    if (!arguments || arguments->children.empty())
|
@ -36,8 +36,10 @@ public:
|
||||
const char * getFamilyName() const override { return "Tuple"; }
|
||||
|
||||
bool canBeInsideNullable() const override { return false; }
|
||||
bool supportsSparseSerialization() const override { return true; }
|
||||
|
||||
MutableColumnPtr createColumn() const override;
|
||||
MutableColumnPtr createColumn(const ISerialization & serialization) const override;
|
||||
|
||||
Field getDefault() const override;
|
||||
void insertDefaultInto(IColumn & column) const override;
|
||||
@ -52,9 +54,9 @@ public:
|
||||
size_t getMaximumSizeOfValueInMemory() const override;
|
||||
size_t getSizeOfValueInMemory() const override;
|
||||
|
||||
SerializationPtr getSerialization(const String & column_name, const StreamExistenceCallback & callback) const override;
|
||||
|
||||
SerializationPtr doGetDefaultSerialization() const override;
|
||||
SerializationPtr getSerialization(const SerializationInfo & info) const override;
|
||||
MutableSerializationInfoPtr createSerializationInfo(const SerializationInfo::Settings & settings) const override;
|
||||
|
||||
const DataTypePtr & getElement(size_t i) const { return elems[i]; }
|
||||
const DataTypes & getElements() const { return elems; }
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include <Columns/IColumn.h>
|
||||
#include <Columns/ColumnConst.h>
|
||||
#include <Columns/ColumnSparse.h>
|
||||
|
||||
#include <Common/Exception.h>
|
||||
#include <Common/SipHash.h>
|
||||
@ -10,6 +11,8 @@
|
||||
#include <DataTypes/IDataType.h>
|
||||
#include <DataTypes/DataTypeCustom.h>
|
||||
#include <DataTypes/NestedUtils.h>
|
||||
#include <DataTypes/Serializations/SerializationSparse.h>
|
||||
#include <DataTypes/Serializations/SerializationInfo.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -40,6 +43,15 @@ void IDataType::updateAvgValueSizeHint(const IColumn & column, double & avg_valu
|
||||
}
|
||||
}
|
||||
|
||||
MutableColumnPtr IDataType::createColumn(const ISerialization & serialization) const
|
||||
{
|
||||
auto column = createColumn();
|
||||
if (serialization.getKind() == ISerialization::Kind::SPARSE)
|
||||
return ColumnSparse::create(std::move(column));
|
||||
|
||||
return column;
|
||||
}
|
||||
|
||||
ColumnPtr IDataType::createColumnConst(size_t size, const Field & field) const
|
||||
{
|
||||
auto column = createColumn();
|
||||
@ -65,9 +77,7 @@ size_t IDataType::getSizeOfValueInMemory() const
|
||||
|
||||
void IDataType::forEachSubcolumn(
|
||||
const SubcolumnCallback & callback,
|
||||
const SerializationPtr & serialization,
|
||||
const DataTypePtr & type,
|
||||
const ColumnPtr & column)
|
||||
const SubstreamData & data)
|
||||
{
|
||||
ISerialization::StreamCallback callback_with_data = [&](const auto & subpath)
|
||||
{
|
||||
@ -76,66 +86,59 @@ void IDataType::forEachSubcolumn(
|
||||
if (!subpath[i].visited && ISerialization::hasSubcolumnForPath(subpath, i + 1))
|
||||
{
|
||||
auto name = ISerialization::getSubcolumnNameForStream(subpath, i + 1);
|
||||
auto data = ISerialization::createFromPath(subpath, i);
|
||||
callback(subpath, name, data);
|
||||
auto subdata = ISerialization::createFromPath(subpath, i);
|
||||
callback(subpath, name, subdata);
|
||||
}
|
||||
subpath[i].visited = true;
|
||||
}
|
||||
};
|
||||
|
||||
ISerialization::SubstreamPath path;
|
||||
serialization->enumerateStreams(path, callback_with_data, type, column);
|
||||
SubstreamPath path;
|
||||
data.serialization->enumerateStreams(path, callback_with_data, data);
|
||||
}
|
||||
|
||||
DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const
|
||||
template <typename Ptr>
|
||||
Ptr IDataType::getForSubcolumn(
|
||||
const String & subcolumn_name,
|
||||
const SubstreamData & data,
|
||||
Ptr SubstreamData::*member,
|
||||
bool throw_if_null) const
|
||||
{
|
||||
DataTypePtr res;
|
||||
forEachSubcolumn([&](const auto &, const auto & name, const auto & data)
|
||||
Ptr res;
|
||||
forEachSubcolumn([&](const auto &, const auto & name, const auto & subdata)
|
||||
{
|
||||
if (name == subcolumn_name)
|
||||
res = data.type;
|
||||
}, getDefaultSerialization(), getPtr(), nullptr);
|
||||
res = subdata.*member;
|
||||
}, data);
|
||||
|
||||
if (!res && throw_if_null)
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const
|
||||
{
|
||||
SubstreamData data = { getDefaultSerialization(), getPtr(), nullptr, nullptr };
|
||||
return getForSubcolumn<DataTypePtr>(subcolumn_name, data, &SubstreamData::type, false);
|
||||
}
|
||||
|
||||
DataTypePtr IDataType::getSubcolumnType(const String & subcolumn_name) const
|
||||
{
|
||||
auto subcolumn_type = tryGetSubcolumnType(subcolumn_name);
|
||||
if (subcolumn_type)
|
||||
return subcolumn_type;
|
||||
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
|
||||
SubstreamData data = { getDefaultSerialization(), getPtr(), nullptr, nullptr };
|
||||
return getForSubcolumn<DataTypePtr>(subcolumn_name, data, &SubstreamData::type);
|
||||
}
|
||||
|
||||
SerializationPtr IDataType::getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const
|
||||
{
|
||||
SerializationPtr res;
|
||||
forEachSubcolumn([&](const auto &, const auto & name, const auto & data)
|
||||
{
|
||||
if (name == subcolumn_name)
|
||||
res = data.serialization;
|
||||
}, serialization, nullptr, nullptr);
|
||||
|
||||
if (res)
|
||||
return res;
|
||||
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
|
||||
SubstreamData data = { serialization, nullptr, nullptr, nullptr };
|
||||
return getForSubcolumn<SerializationPtr>(subcolumn_name, data, &SubstreamData::serialization);
|
||||
}
|
||||
|
||||
ColumnPtr IDataType::getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const
|
||||
{
|
||||
ColumnPtr res;
|
||||
forEachSubcolumn([&](const auto &, const auto & name, const auto & data)
|
||||
{
|
||||
if (name == subcolumn_name)
|
||||
res = data.column;
|
||||
}, getDefaultSerialization(), nullptr, column);
|
||||
|
||||
if (res)
|
||||
return res;
|
||||
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
|
||||
SubstreamData data = { getDefaultSerialization(), nullptr, column, nullptr };
|
||||
return getForSubcolumn<ColumnPtr>(subcolumn_name, data, &SubstreamData::column);
|
||||
}
|
||||
|
||||
Names IDataType::getSubcolumnNames() const
|
||||
@ -144,7 +147,7 @@ Names IDataType::getSubcolumnNames() const
|
||||
forEachSubcolumn([&](const auto &, const auto & name, const auto &)
|
||||
{
|
||||
res.push_back(name);
|
||||
}, getDefaultSerialization(), nullptr, nullptr);
|
||||
}, { getDefaultSerialization(), nullptr, nullptr, nullptr });
|
||||
return res;
|
||||
}
|
||||
|
||||
@ -163,6 +166,12 @@ void IDataType::setCustomization(DataTypeCustomDescPtr custom_desc_) const
|
||||
custom_serialization = std::move(custom_desc_->serialization);
|
||||
}
|
||||
|
||||
MutableSerializationInfoPtr IDataType::createSerializationInfo(
|
||||
const SerializationInfo::Settings & settings) const
|
||||
{
|
||||
return std::make_shared<SerializationInfo>(ISerialization::Kind::DEFAULT, settings);
|
||||
}
|
||||
|
||||
SerializationPtr IDataType::getDefaultSerialization() const
|
||||
{
|
||||
if (custom_serialization)
|
||||
@ -171,22 +180,48 @@ SerializationPtr IDataType::getDefaultSerialization() const
|
||||
return doGetDefaultSerialization();
|
||||
}
|
||||
|
||||
SerializationPtr IDataType::getSparseSerialization() const
|
||||
{
|
||||
return std::make_shared<SerializationSparse>(getDefaultSerialization());
|
||||
}
|
||||
|
||||
SerializationPtr IDataType::getSerialization(ISerialization::Kind kind) const
|
||||
{
|
||||
if (supportsSparseSerialization() && kind == ISerialization::Kind::SPARSE)
|
||||
return getSparseSerialization();
|
||||
|
||||
return getDefaultSerialization();
|
||||
}
|
||||
|
||||
SerializationPtr IDataType::getSerialization(const SerializationInfo & info) const
|
||||
{
|
||||
return getSerialization(info.getKind());
|
||||
}
|
||||
|
||||
// static
|
||||
SerializationPtr IDataType::getSerialization(const NameAndTypePair & column, const IDataType::StreamExistenceCallback & callback)
|
||||
SerializationPtr IDataType::getSerialization(const NameAndTypePair & column, const SerializationInfo & info)
|
||||
{
|
||||
if (column.isSubcolumn())
|
||||
{
|
||||
const auto & type_in_storage = column.getTypeInStorage();
|
||||
auto default_serialization = type_in_storage->getDefaultSerialization();
|
||||
return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), default_serialization);
|
||||
auto serialization = type_in_storage->getSerialization(info);
|
||||
return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), serialization);
|
||||
}
|
||||
|
||||
return column.type->getSerialization(column.name, callback);
|
||||
return column.type->getSerialization(info);
|
||||
}
|
||||
|
||||
SerializationPtr IDataType::getSerialization(const String &, const StreamExistenceCallback &) const
|
||||
// static
|
||||
SerializationPtr IDataType::getSerialization(const NameAndTypePair & column)
|
||||
{
|
||||
return getDefaultSerialization();
|
||||
if (column.isSubcolumn())
|
||||
{
|
||||
const auto & type_in_storage = column.getTypeInStorage();
|
||||
auto serialization = type_in_storage->getDefaultSerialization();
|
||||
return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), serialization);
|
||||
}
|
||||
|
||||
return column.type->getDefaultSerialization();
|
||||
}
|
||||
|
||||
}
|
||||
|
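The getForSubcolumn refactoring above folds three nearly identical lookups into one template parameterized by a pointer to data member (Ptr SubstreamData::*member). A self-contained sketch of that idiom, with the ClickHouse types replaced by plain strings for illustration:

#include <iostream>
#include <string>

/// Stand-in for ISerialization::SubstreamData with simplified field types.
struct SubstreamData
{
    std::string type;
    std::string column;
    std::string serialization;
};

/// One generic accessor instead of three copies: the caller picks the field
/// by passing a pointer-to-member, e.g. &SubstreamData::type.
template <typename Ptr>
Ptr getMember(const SubstreamData & data, Ptr SubstreamData::*member)
{
    return data.*member; /// dereference the member pointer on a concrete object
}

int main()
{
    SubstreamData data{"UInt64", "ColumnVector(size = 3)", "SerializationNumber"};
    std::cout << getMember(data, &SubstreamData::type) << '\n';          /// UInt64
    std::cout << getMember(data, &SubstreamData::serialization) << '\n'; /// SerializationNumber
}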
@@ -6,7 +6,8 @@
#include <Core/Names.h>
#include <Core/TypeId.h>
#include <DataTypes/DataTypeCustom.h>

#include <DataTypes/Serializations/ISerialization.h>
#include <DataTypes/Serializations/SerializationInfo.h>

namespace DB
{
@@ -27,7 +28,6 @@ using DataTypePtr = std::shared_ptr<const IDataType>;
using DataTypes = std::vector<DataTypePtr>;

struct NameAndTypePair;
class SerializationInfo;

struct DataTypeWithConstInfo
{
@@ -84,45 +84,54 @@ public:
    SerializationPtr getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const;
    ColumnPtr getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const;

    using SubstreamData = ISerialization::SubstreamData;
    using SubstreamPath = ISerialization::SubstreamPath;

    using SubcolumnCallback = std::function<void(
        const ISerialization::SubstreamPath &,
        const SubstreamPath &,
        const String &,
        const ISerialization::SubstreamData &)>;
        const SubstreamData &)>;

    static void forEachSubcolumn(
        const SubcolumnCallback & callback,
        const SerializationPtr & serialization,
        const DataTypePtr & type,
        const ColumnPtr & column);
        const SubstreamData & data);

    Names getSubcolumnNames() const;

    /// Returns default serialization of data type.
    virtual MutableSerializationInfoPtr createSerializationInfo(
        const SerializationInfo::Settings & settings) const;

    /// TODO: support more types.
    virtual bool supportsSparseSerialization() const { return !haveSubtypes(); }

    SerializationPtr getDefaultSerialization() const;
    SerializationPtr getSparseSerialization() const;

    /// Asks whether the stream with given name exists in table.
    /// If callback returned true for all streams, which are required for
    /// one of serialization types, that serialization will be chosen for reading.
    /// If callback always returned false, the default serialization will be chosen.
    using StreamExistenceCallback = std::function<bool(const String &)>;
    /// Chooses serialization according to serialization kind.
    SerializationPtr getSerialization(ISerialization::Kind kind) const;

    /// Chooses serialization for reading of one column or subcolumns by
    /// checking existence of substreams using callback.
    static SerializationPtr getSerialization(
        const NameAndTypePair & column,
        const StreamExistenceCallback & callback = [](const String &) { return false; });
    /// Chooses serialization according to collected information about content of column.
    virtual SerializationPtr getSerialization(const SerializationInfo & info) const;

    virtual SerializationPtr getSerialization(const String & column_name, const StreamExistenceCallback & callback) const;
    /// Chooses between subcolumn serialization and regular serialization according to @column.
    /// This method typically should be used to get serialization for reading column or subcolumn.
    static SerializationPtr getSerialization(const NameAndTypePair & column, const SerializationInfo & info);

    static SerializationPtr getSerialization(const NameAndTypePair & column);

protected:
    virtual String doGetName() const { return getFamilyName(); }
    virtual SerializationPtr doGetDefaultSerialization() const = 0;

public:
    /** Create empty column for corresponding type.
    /** Create empty column for corresponding type and default serialization.
      */
    virtual MutableColumnPtr createColumn() const = 0;

    /** Create empty column for corresponding type and serialization.
      */
    virtual MutableColumnPtr createColumn(const ISerialization & serialization) const;

    /** Create ColumnConst for corresponding type, with specified size and value.
      */
    ColumnPtr createColumnConst(size_t size, const Field & field) const;
@@ -292,6 +301,14 @@ protected:
public:
    const IDataTypeCustomName * getCustomName() const { return custom_name.get(); }
    const ISerialization * getCustomSerialization() const { return custom_serialization.get(); }

private:
    template <typename Ptr>
    Ptr getForSubcolumn(
        const String & subcolumn_name,
        const SubstreamData & data,
        Ptr SubstreamData::*member,
        bool throw_if_null = true) const;
};

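With this interface, the choice between default and sparse serialization is driven by collected statistics rather than by probing which streams exist on disk. A hedged sketch of the intended call sequence, using only the methods declared above (`type` and `column` come from elsewhere; the settings values are illustrative, not defaults):

SerializationInfo::Settings settings{
    .ratio_of_defaults_for_sparse = 0.95, /// sparse pays off only for mostly-default data
    .choose_kind = true                   /// let add() re-evaluate the kind as data arrives
};

auto info = type->createSerializationInfo(settings);
info->add(*column);                                  /// collect default-row statistics
SerializationPtr serialization = type->getSerialization(*info);
/// serialization is SerializationSparse if the default ratio exceeded the
/// threshold and the type supports it; otherwise the default serialization.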
@@ -36,18 +36,18 @@ std::string concatenateName(const std::string & nested_table_name, const std::st

/** Name can be treated as compound if it contains dot (.) in the middle.
  */
std::pair<std::string, std::string> splitName(const std::string & name)
std::pair<std::string, std::string> splitName(const std::string & name, bool reverse)
{
    auto idx = name.find_first_of('.');
    auto idx = (reverse ? name.find_last_of('.') : name.find_first_of('.'));
    if (idx == std::string::npos || idx == 0 || idx + 1 == name.size())
        return {name, {}};

    return {name.substr(0, idx), name.substr(idx + 1)};
}

std::pair<std::string_view, std::string_view> splitName(const std::string_view & name)
std::pair<std::string_view, std::string_view> splitName(const std::string_view & name, bool reverse)
{
    auto idx = name.find_first_of('.');
    auto idx = (reverse ? name.find_last_of('.') : name.find_first_of('.'));
    if (idx == std::string::npos || idx == 0 || idx + 1 == name.size())
        return {name, {}};

@@ -11,8 +11,9 @@ namespace Nested
{
    std::string concatenateName(const std::string & nested_table_name, const std::string & nested_field_name);

    std::pair<std::string, std::string> splitName(const std::string & name);
    std::pair<std::string_view, std::string_view> splitName(const std::string_view & name);
    /// Splits name of compound identifier by first/last dot (depending on 'reverse' parameter).
    std::pair<std::string, std::string> splitName(const std::string & name, bool reverse = false);
    std::pair<std::string_view, std::string_view> splitName(const std::string_view & name, bool reverse = false);

    /// Returns the prefix of the name up to the first '.', or the name unchanged if there is no dot.
    std::string extractTableName(const std::string & nested_name);
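The new reverse flag splits at the last dot instead of the first, which matters for multi-level compound names. A runnable illustration of the behavior implied by the implementation above (the function body is copied from the diff):

#include <iostream>
#include <string>
#include <utility>

/// Illustrative copy of Nested::splitName as changed above.
std::pair<std::string, std::string> splitName(const std::string & name, bool reverse = false)
{
    auto idx = (reverse ? name.find_last_of('.') : name.find_first_of('.'));
    if (idx == std::string::npos || idx == 0 || idx + 1 == name.size())
        return {name, {}};
    return {name.substr(0, idx), name.substr(idx + 1)};
}

int main()
{
    auto [a, b] = splitName("nested.col.size0");       /// {"nested", "col.size0"}
    auto [c, d] = splitName("nested.col.size0", true); /// {"nested.col", "size0"}
    std::cout << a << " | " << b << '\n' << c << " | " << d << '\n';
}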
@@ -16,12 +16,43 @@ namespace ErrorCodes
{
    extern const int MULTIPLE_STREAMS_REQUIRED;
    extern const int UNEXPECTED_DATA_AFTER_PARSED_VALUE;
    extern const int LOGICAL_ERROR;
}

ISerialization::Kind ISerialization::getKind(const IColumn & column)
{
    if (column.isSparse())
        return Kind::SPARSE;

    return Kind::DEFAULT;
}

String ISerialization::kindToString(Kind kind)
{
    switch (kind)
    {
        case Kind::DEFAULT:
            return "Default";
        case Kind::SPARSE:
            return "Sparse";
    }
    __builtin_unreachable();
}

ISerialization::Kind ISerialization::stringToKind(const String & str)
{
    if (str == "Default")
        return Kind::DEFAULT;
    else if (str == "Sparse")
        return Kind::SPARSE;
    else
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown serialization kind '{}'", str);
}

String ISerialization::Substream::toString() const
{
    if (type == TupleElement)
        return fmt::format("TupleElement({}, escape_tuple_delimiter={})",
        return fmt::format("TupleElement({}, escape_tuple_delimiter = {})",
            tuple_element_name, escape_tuple_delimiter ? "true" : "false");

    return String(magic_enum::enum_name(type));
@@ -44,18 +75,22 @@ String ISerialization::SubstreamPath::toString() const
void ISerialization::enumerateStreams(
    SubstreamPath & path,
    const StreamCallback & callback,
    DataTypePtr type,
    ColumnPtr column) const
    const SubstreamData & data) const
{
    path.push_back(Substream::Regular);
    path.back().data = {type, column, getPtr(), nullptr};
    path.back().data = data;
    callback(path);
    path.pop_back();
}

void ISerialization::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
{
    enumerateStreams(path, callback, nullptr, nullptr);
    enumerateStreams(path, callback, {getPtr(), nullptr, nullptr, nullptr});
}

void ISerialization::enumerateStreams(SubstreamPath & path, const StreamCallback & callback, const DataTypePtr & type) const
{
    enumerateStreams(path, callback, {getPtr(), type, nullptr, nullptr});
}

void ISerialization::serializeBinaryBulk(const IColumn & column, WriteBuffer &, size_t, size_t) const
@@ -147,11 +182,23 @@ String ISerialization::getFileNameForStream(const NameAndTypePair & column, cons
    return getFileNameForStream(column.getNameInStorage(), path);
}

static size_t isOffsetsOfNested(const ISerialization::SubstreamPath & path)
{
    if (path.empty())
        return false;

    for (const auto & elem : path)
        if (elem.type == ISerialization::Substream::ArrayElements)
            return false;

    return path.back().type == ISerialization::Substream::ArraySizes;
}

String ISerialization::getFileNameForStream(const String & name_in_storage, const SubstreamPath & path)
{
    String stream_name;
    auto nested_storage_name = Nested::extractTableName(name_in_storage);
    if (name_in_storage != nested_storage_name && (path.size() == 1 && path[0].type == ISerialization::Substream::ArraySizes))
    if (name_in_storage != nested_storage_name && isOffsetsOfNested(path))
        stream_name = escapeForFileName(nested_storage_name);
    else
        stream_name = escapeForFileName(name_in_storage);
@@ -242,10 +289,9 @@ ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath
    assert(prefix_len < path.size());

    SubstreamData res = path[prefix_len].data;
    res.creator.reset();
    for (ssize_t i = static_cast<ssize_t>(prefix_len) - 1; i >= 0; --i)
    {
        const auto & creator = path[i].data.creator;
        const auto & creator = path[i].creator;
        if (creator)
        {
            res.type = res.type ? creator->create(res.type) : res.type;
@@ -2,15 +2,25 @@

#include <Common/COW.h>
#include <Core/Types.h>
#include <base/demangle.h>
#include <Common/typeid_cast.h>
#include <Columns/IColumn.h>

#include <boost/noncopyable.hpp>
#include <unordered_map>
#include <memory>
#include <variant>

namespace DB
{

namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
}

class IDataType;

class ReadBuffer;
class WriteBuffer;
class ProtobufReader;
@@ -22,19 +32,40 @@ using DataTypePtr = std::shared_ptr<const IDataType>;
class ISerialization;
using SerializationPtr = std::shared_ptr<const ISerialization>;

class SerializationInfo;
using SerializationInfoPtr = std::shared_ptr<const SerializationInfo>;

class Field;

struct FormatSettings;
struct NameAndTypePair;

/** Represents serialization of data type.
  * Has methods to serialize/deserialize column in binary and several text formats.
  * Every data type has a default serialization, but can be serialized in different representations.
  * The default serialization can be wrapped into one of the special kinds of serialization.
  * Currently there is only one special serialization: Sparse.
  * Each serialization has its own implementation of IColumn as its in-memory representation.
  */
class ISerialization : private boost::noncopyable, public std::enable_shared_from_this<ISerialization>
{
public:
    ISerialization() = default;
    virtual ~ISerialization() = default;

    enum class Kind : UInt8
    {
        DEFAULT = 0,
        SPARSE = 1,
    };

    virtual Kind getKind() const { return Kind::DEFAULT; }
    SerializationPtr getPtr() const { return shared_from_this(); }

    static Kind getKind(const IColumn & column);
    static String kindToString(Kind kind);
    static Kind stringToKind(const String & str);

    /** Binary serialization for range of values in column - for writing to disk/network, etc.
      *
      * Some data types are represented in multiple streams while being serialized.
@@ -70,10 +101,10 @@ public:

    struct SubstreamData
    {
        SerializationPtr serialization;
        DataTypePtr type;
        ColumnPtr column;
        SerializationPtr serialization;
        SubcolumnCreatorPtr creator;
        SerializationInfoPtr serialization_info;
    };

    struct Substream
@@ -108,6 +139,9 @@ public:
        /// Data for current substream.
        SubstreamData data;

        /// Creator of subcolumn for current substream.
        SubcolumnCreatorPtr creator = nullptr;

        /// Flag, that may help to traverse substream paths.
        mutable bool visited = false;

@@ -130,13 +164,14 @@ public:
    virtual void enumerateStreams(
        SubstreamPath & path,
        const StreamCallback & callback,
        DataTypePtr type,
        ColumnPtr column) const;
        const SubstreamData & data) const;

    void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const;
    void enumerateStreams(const StreamCallback & callback, SubstreamPath && path) const { enumerateStreams(callback, path); }
    void enumerateStreams(const StreamCallback & callback) const { enumerateStreams(callback, {}); }

    void enumerateStreams(SubstreamPath & path, const StreamCallback & callback, const DataTypePtr & type) const;

    using OutputStreamGetter = std::function<WriteBuffer*(const SubstreamPath &)>;
    using InputStreamGetter = std::function<ReadBuffer*(const SubstreamPath &)>;

@@ -300,16 +335,41 @@ public:
    static ColumnPtr getFromSubstreamsCache(SubstreamsCache * cache, const SubstreamPath & path);

    static bool isSpecialCompressionAllowed(const SubstreamPath & path);
    static size_t getArrayLevel(const SubstreamPath & path);

    static size_t getArrayLevel(const SubstreamPath & path);
    static bool hasSubcolumnForPath(const SubstreamPath & path, size_t prefix_len);
    static SubstreamData createFromPath(const SubstreamPath & path, size_t prefix_len);

protected:
    template <typename State, typename StatePtr>
    State * checkAndGetState(const StatePtr & state) const;

    [[noreturn]] void throwUnexpectedDataAfterParsedValue(IColumn & column, ReadBuffer & istr, const FormatSettings &, const String & type_name) const;
};

using SerializationPtr = std::shared_ptr<const ISerialization>;
using Serializations = std::vector<SerializationPtr>;
using SerializationByName = std::unordered_map<String, SerializationPtr>;

template <typename State, typename StatePtr>
State * ISerialization::checkAndGetState(const StatePtr & state) const
{
    if (!state)
        throw Exception(ErrorCodes::LOGICAL_ERROR,
            "Got empty state for {}", demangle(typeid(*this).name()));

    auto * state_concrete = typeid_cast<State *>(state.get());
    if (!state_concrete)
    {
        auto & state_ref = *state;
        throw Exception(ErrorCodes::LOGICAL_ERROR,
            "Invalid State for {}. Expected: {}, got {}",
            demangle(typeid(*this).name()),
            demangle(typeid(State).name()),
            demangle(typeid(state_ref).name()));
    }

    return state_concrete;
}

}
@@ -198,33 +198,38 @@ ColumnPtr SerializationArray::SubcolumnCreator::create(const ColumnPtr & prev) c
void SerializationArray::enumerateStreams(
    SubstreamPath & path,
    const StreamCallback & callback,
    DataTypePtr type,
    ColumnPtr column) const
    const SubstreamData & data) const
{
    const auto * type_array = type ? &assert_cast<const DataTypeArray &>(*type) : nullptr;
    const auto * column_array = column ? &assert_cast<const ColumnArray &>(*column) : nullptr;
    const auto * type_array = data.type ? &assert_cast<const DataTypeArray &>(*data.type) : nullptr;
    const auto * column_array = data.column ? &assert_cast<const ColumnArray &>(*data.column) : nullptr;
    auto offsets_column = column_array ? column_array->getOffsetsPtr() : nullptr;

    path.push_back(Substream::ArraySizes);
    path.back().data =
    {
        type ? std::make_shared<DataTypeUInt64>() : nullptr,
        offsets_column ? arrayOffsetsToSizes(*offsets_column) : nullptr,
        std::make_shared<SerializationNamed>(
            std::make_shared<SerializationNumber<UInt64>>(),
            "size" + std::to_string(getArrayLevel(path)), false),
        nullptr,
        data.type ? std::make_shared<DataTypeUInt64>() : nullptr,
        offsets_column ? arrayOffsetsToSizes(*offsets_column) : nullptr,
        data.serialization_info,
    };

    callback(path);

    path.back() = Substream::ArrayElements;
    path.back().data = {type, column, getPtr(), std::make_shared<SubcolumnCreator>(offsets_column)};
    path.back().data = data;
    path.back().creator = std::make_shared<SubcolumnCreator>(offsets_column);

    auto next_type = type_array ? type_array->getNestedType() : nullptr;
    auto next_column = column_array ? column_array->getDataPtr() : nullptr;
    SubstreamData next_data =
    {
        nested,
        type_array ? type_array->getNestedType() : nullptr,
        column_array ? column_array->getDataPtr() : nullptr,
        data.serialization_info,
    };

    nested->enumerateStreams(path, callback, next_type, next_column);
    nested->enumerateStreams(path, callback, next_data);
    path.pop_back();
}

@@ -38,8 +38,7 @@ public:
    void enumerateStreams(
        SubstreamPath & path,
        const StreamCallback & callback,
        DataTypePtr type,
        ColumnPtr column) const override;
        const SubstreamData & data) const override;

    void serializeBinaryBulkStatePrefix(
        SerializeBinaryBulkSettings & settings,
@@ -10,7 +10,7 @@ class WriteBuffer;
struct FormatSettings;
class IColumn;

/** Simple IDataTypeCustomTextSerialization that uses serializeText/deserializeText
/** Simple ISerialization that uses serializeText/deserializeText
  * for all serialization and deserialization. */
class SerializationCustomSimpleText : public SerializationWrapper
{
222 src/DataTypes/Serializations/SerializationInfo.cpp Normal file
@@ -0,0 +1,222 @@
#include <DataTypes/Serializations/SerializationInfo.h>
#include <DataTypes/NestedUtils.h>
#include <Columns/ColumnSparse.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/VarInt.h>
#include <base/EnumReflection.h>

#include <Poco/JSON/JSON.h>
#include <Poco/JSON/Object.h>
#include <Poco/JSON/Stringifier.h>
#include <Poco/JSON/Parser.h>


namespace DB
{

namespace ErrorCodes
{
    extern const int CORRUPTED_DATA;
}

namespace
{

constexpr auto KEY_VERSION = "version";
constexpr auto KEY_NUM_ROWS = "num_rows";
constexpr auto KEY_COLUMNS = "columns";
constexpr auto KEY_NUM_DEFAULTS = "num_defaults";
constexpr auto KEY_KIND = "kind";
constexpr auto KEY_NAME = "name";

}

void SerializationInfo::Data::add(const IColumn & column)
{
    size_t rows = column.size();
    double ratio = column.getRatioOfDefaultRows(ColumnSparse::DEFAULT_ROWS_SEARCH_SAMPLE_RATIO);

    num_rows += rows;
    num_defaults += static_cast<size_t>(ratio * rows);
}

void SerializationInfo::Data::add(const Data & other)
{
    num_rows += other.num_rows;
    num_defaults += other.num_defaults;
}

SerializationInfo::SerializationInfo(ISerialization::Kind kind_, const Settings & settings_)
    : settings(settings_)
    , kind(kind_)
{
}

void SerializationInfo::add(const IColumn & column)
{
    data.add(column);
    if (settings.choose_kind)
        kind = chooseKind(data, settings);
}

void SerializationInfo::add(const SerializationInfo & other)
{
    data.add(other.data);
    if (settings.choose_kind)
        kind = chooseKind(data, settings);
}

void SerializationInfo::replaceData(const SerializationInfo & other)
{
    data = other.data;
}

MutableSerializationInfoPtr SerializationInfo::clone() const
{
    auto res = std::make_shared<SerializationInfo>(kind, settings);
    res->data = data;
    return res;
}

void SerializationInfo::serialializeKindBinary(WriteBuffer & out) const
{
    writeBinary(static_cast<UInt8>(kind), out);
}

void SerializationInfo::deserializeFromKindsBinary(ReadBuffer & in)
{
    UInt8 kind_num;
    readBinary(kind_num, in);
    auto maybe_kind = magic_enum::enum_cast<ISerialization::Kind>(kind_num);
    if (!maybe_kind)
        throw Exception(ErrorCodes::CORRUPTED_DATA, "Unknown serialization kind " + std::to_string(kind_num));

    kind = *maybe_kind;
}

Poco::JSON::Object SerializationInfo::toJSON() const
{
    Poco::JSON::Object object;
    object.set(KEY_KIND, ISerialization::kindToString(kind));
    object.set(KEY_NUM_DEFAULTS, data.num_defaults);
    object.set(KEY_NUM_ROWS, data.num_rows);
    return object;
}

void SerializationInfo::fromJSON(const Poco::JSON::Object & object)
{
    if (!object.has(KEY_KIND) || !object.has(KEY_NUM_DEFAULTS) || !object.has(KEY_NUM_ROWS))
        throw Exception(ErrorCodes::CORRUPTED_DATA,
            "Missed field '{}' or '{}' or '{}' in SerializationInfo of columns",
            KEY_KIND, KEY_NUM_DEFAULTS, KEY_NUM_ROWS);

    data.num_rows = object.getValue<size_t>(KEY_NUM_ROWS);
    data.num_defaults = object.getValue<size_t>(KEY_NUM_DEFAULTS);
    kind = ISerialization::stringToKind(object.getValue<String>(KEY_KIND));
}

ISerialization::Kind SerializationInfo::chooseKind(const Data & data, const Settings & settings)
{
    double ratio = data.num_rows ? std::min(static_cast<double>(data.num_defaults) / data.num_rows, 1.0) : 0.0;
    return ratio > settings.ratio_of_defaults_for_sparse ? ISerialization::Kind::SPARSE : ISerialization::Kind::DEFAULT;
}

SerializationInfoByName::SerializationInfoByName(
    const NamesAndTypesList & columns,
    const SerializationInfo::Settings & settings)
{
    if (settings.isAlwaysDefault())
        return;

    for (const auto & column : columns)
        if (column.type->supportsSparseSerialization())
            emplace(column.name, column.type->createSerializationInfo(settings));
}

void SerializationInfoByName::add(const Block & block)
{
    for (const auto & column : block)
    {
        auto it = find(column.name);
        if (it == end())
            continue;

        it->second->add(*column.column);
    }
}

void SerializationInfoByName::add(const SerializationInfoByName & other)
{
    for (const auto & [name, info] : other)
    {
        auto it = find(name);
        if (it == end())
            continue;

        it->second->add(*info);
    }
}

void SerializationInfoByName::writeJSON(WriteBuffer & out) const
{
    Poco::JSON::Object object;
    object.set(KEY_VERSION, SERIALIZATION_INFO_VERSION);

    Poco::JSON::Array column_infos;
    for (const auto & [name, info] : *this)
    {
        auto info_json = info->toJSON();
        info_json.set(KEY_NAME, name);
        column_infos.add(std::move(info_json));
    }

    object.set(KEY_COLUMNS, std::move(column_infos));

    std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
    oss.exceptions(std::ios::failbit);
    Poco::JSON::Stringifier::stringify(object, oss);

    return writeString(oss.str(), out);
}

void SerializationInfoByName::readJSON(ReadBuffer & in)
{
    String json_str;
    readString(json_str, in);

    Poco::JSON::Parser parser;
    auto object = parser.parse(json_str).extract<Poco::JSON::Object::Ptr>();

    if (!object->has(KEY_VERSION))
        throw Exception(ErrorCodes::CORRUPTED_DATA, "Missed version of serialization infos");

    if (object->getValue<size_t>(KEY_VERSION) > SERIALIZATION_INFO_VERSION)
        throw Exception(ErrorCodes::CORRUPTED_DATA,
            "Unknown version of serialization infos ({}). Should be less or equal than {}",
            object->getValue<size_t>(KEY_VERSION), SERIALIZATION_INFO_VERSION);

    if (object->has(KEY_COLUMNS))
    {
        auto array = object->getArray(KEY_COLUMNS);
        for (const auto & elem : *array)
        {
            auto elem_object = elem.extract<Poco::JSON::Object::Ptr>();

            if (!elem_object->has(KEY_NAME))
                throw Exception(ErrorCodes::CORRUPTED_DATA,
                    "Missed field '{}' in SerializationInfo of columns", KEY_NAME);

            auto name = elem_object->getValue<String>(KEY_NAME);
            auto it = find(name);

            if (it == end())
                throw Exception(ErrorCodes::CORRUPTED_DATA,
                    "There is no column {} in serialization infos", name);

            it->second->fromJSON(*elem_object);
        }
    }
}

}
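For reference, chooseKind above makes sparse win only when num_defaults / num_rows exceeds ratio_of_defaults_for_sparse: with a threshold of 0.95, a column with 980 defaults out of 1000 rows (ratio 0.98) becomes Sparse, while 900 out of 1000 (ratio 0.9) stays Default. The payload emitted by writeJSON then looks roughly like this (column names and counts invented for illustration):

{
    "version": 0,
    "columns": [
        {"name": "id", "kind": "Default", "num_rows": 1000, "num_defaults": 0},
        {"name": "value", "kind": "Sparse", "num_rows": 1000, "num_defaults": 980}
    ]
}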
96
src/DataTypes/Serializations/SerializationInfo.h
Normal file
96
src/DataTypes/Serializations/SerializationInfo.h
Normal file
@ -0,0 +1,96 @@
#pragma once

#include <Core/Types.h>
#include <DataTypes/Serializations/ISerialization.h>
#include <Poco/JSON/Object.h>

namespace DB
{

class ReadBuffer;
class WriteBuffer;
class NamesAndTypesList;
class Block;

constexpr auto SERIALIZATION_INFO_VERSION = 0;

/** Contains information about the kind of serialization of a column and its subcolumns.
 *  Also contains information about the content of the column
 *  that helps to choose the kind of its serialization.
 *
 *  Currently it tracks only the number of default rows,
 *  which helps to choose sparse serialization.
 *
 *  Should be extended when new kinds of serialization are implemented.
 */
class SerializationInfo
{
public:
    struct Data
    {
        size_t num_rows = 0;
        size_t num_defaults = 0;

        void add(const IColumn & column);
        void add(const Data & other);
    };

    struct Settings
    {
        const double ratio_of_defaults_for_sparse = 1.0;
        const bool choose_kind = false;

        bool isAlwaysDefault() const { return ratio_of_defaults_for_sparse >= 1.0; }
    };

    SerializationInfo(ISerialization::Kind kind_, const Settings & settings_);

    virtual ~SerializationInfo() = default;

    virtual bool hasCustomSerialization() const { return kind != ISerialization::Kind::DEFAULT; }

    virtual void add(const IColumn & column);
    virtual void add(const SerializationInfo & other);
    virtual void replaceData(const SerializationInfo & other);
    virtual std::shared_ptr<SerializationInfo> clone() const;

    virtual void serialializeKindBinary(WriteBuffer & out) const;
    virtual void deserializeFromKindsBinary(ReadBuffer & in);

    virtual Poco::JSON::Object toJSON() const;
    virtual void fromJSON(const Poco::JSON::Object & object);

    const Settings & getSettings() const { return settings; }
    const Data & getData() const { return data; }
    ISerialization::Kind getKind() const { return kind; }

    static ISerialization::Kind chooseKind(const Data & data, const Settings & settings);

protected:
    const Settings settings;

    ISerialization::Kind kind;
    Data data;
};

using SerializationInfoPtr = std::shared_ptr<const SerializationInfo>;
using MutableSerializationInfoPtr = std::shared_ptr<SerializationInfo>;

using SerializationInfos = std::vector<SerializationInfoPtr>;
using MutableSerializationInfos = std::vector<MutableSerializationInfoPtr>;

class SerializationInfoByName : public std::unordered_map<String, MutableSerializationInfoPtr>
{
public:
    SerializationInfoByName() = default;
    SerializationInfoByName(const NamesAndTypesList & columns, const SerializationInfo::Settings & settings);

    void add(const Block & block);
    void add(const SerializationInfoByName & other);

    void writeJSON(WriteBuffer & out) const;
    void readJSON(ReadBuffer & in);
};

}
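The body of chooseKind is not part of this diff; given ratio_of_defaults_for_sparse above, the decision it presumably implements looks like the following sketch (an illustration, not the actual implementation):

    ISerialization::Kind chooseKindSketch(const SerializationInfo::Data & data, const SerializationInfo::Settings & settings)
    {
        if (data.num_rows == 0)
            return ISerialization::Kind::DEFAULT;

        /// Choose sparse serialization once the observed share of default
        /// values reaches the configured threshold.
        double default_ratio = static_cast<double>(data.num_defaults) / data.num_rows;
        return default_ratio >= settings.ratio_of_defaults_for_sparse
            ? ISerialization::Kind::SPARSE
            : ISerialization::Kind::DEFAULT;
    }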
src/DataTypes/Serializations/SerializationInfoTuple.cpp (new file, 114 lines)
@@ -0,0 +1,114 @@
#include <DataTypes/Serializations/SerializationInfoTuple.h>
#include <DataTypes/DataTypeTuple.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>

namespace DB
{

namespace ErrorCodes
{
    extern const int CORRUPTED_DATA;
    extern const int THERE_IS_NO_COLUMN;
}

SerializationInfoTuple::SerializationInfoTuple(
    MutableSerializationInfos elems_, const Settings & settings_)
    : SerializationInfo(ISerialization::Kind::DEFAULT, settings_)
    , elems(std::move(elems_))
{
}

bool SerializationInfoTuple::hasCustomSerialization() const
{
    return std::any_of(elems.begin(), elems.end(), [](const auto & elem) { return elem->hasCustomSerialization(); });
}

void SerializationInfoTuple::add(const IColumn & column)
{
    SerializationInfo::add(column);

    const auto & column_tuple = assert_cast<const ColumnTuple &>(column);
    const auto & right_elems = column_tuple.getColumns();
    assert(elems.size() == right_elems.size());

    for (size_t i = 0; i < elems.size(); ++i)
        elems[i]->add(*right_elems[i]);
}

void SerializationInfoTuple::add(const SerializationInfo & other)
{
    SerializationInfo::add(other);

    const auto & info_tuple = assert_cast<const SerializationInfoTuple &>(other);
    assert(elems.size() == info_tuple.elems.size());

    for (size_t i = 0; i < elems.size(); ++i)
        elems[i]->add(*info_tuple.elems[i]);
}

void SerializationInfoTuple::replaceData(const SerializationInfo & other)
{
    /// Delegate to replaceData, not add, so the stored counters are
    /// replaced rather than accumulated.
    SerializationInfo::replaceData(other);

    const auto & info_tuple = assert_cast<const SerializationInfoTuple &>(other);
    assert(elems.size() == info_tuple.elems.size());

    for (size_t i = 0; i < elems.size(); ++i)
        elems[i]->replaceData(*info_tuple.elems[i]);
}

MutableSerializationInfoPtr SerializationInfoTuple::clone() const
{
    MutableSerializationInfos elems_cloned;
    elems_cloned.reserve(elems.size());
    for (const auto & elem : elems)
        elems_cloned.push_back(elem->clone());

    return std::make_shared<SerializationInfoTuple>(std::move(elems_cloned), settings);
}

void SerializationInfoTuple::serialializeKindBinary(WriteBuffer & out) const
{
    SerializationInfo::serialializeKindBinary(out);
    for (const auto & elem : elems)
        elem->serialializeKindBinary(out);
}

void SerializationInfoTuple::deserializeFromKindsBinary(ReadBuffer & in)
{
    SerializationInfo::deserializeFromKindsBinary(in);
    for (const auto & elem : elems)
        elem->deserializeFromKindsBinary(in);
}

Poco::JSON::Object SerializationInfoTuple::toJSON() const
{
    auto object = SerializationInfo::toJSON();
    Poco::JSON::Array subcolumns;
    for (const auto & elem : elems)
        subcolumns.add(elem->toJSON());

    object.set("subcolumns", std::move(subcolumns));
    return object;
}

void SerializationInfoTuple::fromJSON(const Poco::JSON::Object & object)
{
    SerializationInfo::fromJSON(object);

    if (!object.has("subcolumns"))
        throw Exception(ErrorCodes::CORRUPTED_DATA,
            "Missing field 'subcolumns' in JSON of SerializationInfoTuple");

    auto subcolumns = object.getArray("subcolumns");
    if (elems.size() != subcolumns->size())
        throw Exception(ErrorCodes::THERE_IS_NO_COLUMN,
            "Mismatched number of subcolumns between JSON and SerializationInfoTuple. "
            "Expected: {}, got: {}", elems.size(), subcolumns->size());

    for (size_t i = 0; i < elems.size(); ++i)
        elems[i]->fromJSON(*subcolumns->getObject(i));
}

}
src/DataTypes/Serializations/SerializationInfoTuple.h (new file, 31 lines)
@@ -0,0 +1,31 @@
#pragma once

#include <DataTypes/Serializations/SerializationInfo.h>

namespace DB
{

class SerializationInfoTuple : public SerializationInfo
{
public:
    SerializationInfoTuple(MutableSerializationInfos elems_, const Settings & settings_);

    bool hasCustomSerialization() const override;
    void add(const IColumn & column) override;
    void add(const SerializationInfo & other) override;
    void replaceData(const SerializationInfo & other) override;

    MutableSerializationInfoPtr clone() const override;
    void serialializeKindBinary(WriteBuffer & out) const override;
    void deserializeFromKindsBinary(ReadBuffer & in) override;

    Poco::JSON::Object toJSON() const override;
    void fromJSON(const Poco::JSON::Object & object) override;

    MutableSerializationInfoPtr getElementInfo(size_t i) const { return elems[i]; }
    ISerialization::Kind getElementKind(size_t i) const { return elems[i]->getKind(); }

private:
    MutableSerializationInfos elems;
};

}
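For orientation, a hand-written sketch (not part of the diff) of how per-element infos are assembled for a tuple column, using only the constructors declared above:

    /// A minimal sketch, e.g. for Tuple(UInt64, String); the values are illustrative.
    MutableSerializationInfoPtr makeTupleInfoSketch()
    {
        /// 95% of values must be defaults before sparse serialization is chosen.
        SerializationInfo::Settings settings{0.95, true};

        MutableSerializationInfos elems;
        elems.push_back(std::make_shared<SerializationInfo>(ISerialization::Kind::DEFAULT, settings));
        elems.push_back(std::make_shared<SerializationInfo>(ISerialization::Kind::DEFAULT, settings));

        /// One info per tuple element.
        return std::make_shared<SerializationInfoTuple>(std::move(elems), settings);
    }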
src/DataTypes/Serializations/SerializationLowCardinality.cpp
@@ -43,23 +43,25 @@ SerializationLowCardinality::SerializationLowCardinality(const DataTypePtr & dic
 void SerializationLowCardinality::enumerateStreams(
     SubstreamPath & path,
     const StreamCallback & callback,
-    DataTypePtr type,
-    ColumnPtr column) const
+    const SubstreamData & data) const
 {
-    const auto * column_lc = column ? &getColumnLowCardinality(*column) : nullptr;
+    const auto * column_lc = data.column ? &getColumnLowCardinality(*data.column) : nullptr;

-    SubstreamData data;
-    data.type = type ? dictionary_type : nullptr;
-    data.column = column_lc ? column_lc->getDictionary().getNestedColumn() : nullptr;
-    data.serialization = dict_inner_serialization;
+    SubstreamData dict_data =
+    {
+        dict_inner_serialization,
+        data.type ? dictionary_type : nullptr,
+        column_lc ? column_lc->getDictionary().getNestedColumn() : nullptr,
+        data.serialization_info,
+    };

     path.push_back(Substream::DictionaryKeys);
-    path.back().data = data;
+    path.back().data = dict_data;

-    dict_inner_serialization->enumerateStreams(path, callback, data.type, data.column);
+    dict_inner_serialization->enumerateStreams(path, callback, dict_data);

     path.back() = Substream::DictionaryIndexes;
-    path.back().data = {type, column, getPtr(), nullptr};
+    path.back().data = data;

     callback(path);
     path.pop_back();
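This hunk, like the parallel ones in the files below, folds the old (type, column) argument pair into a single SubstreamData aggregate. Its definition lives in ISerialization.h, which this diff does not show; judging from the brace initializers, its field order is evidently as follows:

    /// Inferred shape of SubstreamData; an assumption based on the initializers
    /// in this diff. The authoritative definition is in ISerialization.h.
    struct SubstreamData
    {
        SerializationPtr serialization;
        DataTypePtr type;
        ColumnPtr column;
        SerializationInfoPtr serialization_info;
    };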
@@ -222,42 +224,6 @@ struct DeserializeStateLowCardinality : public ISerialization::DeserializeBinary
     explicit DeserializeStateLowCardinality(UInt64 key_version_) : key_version(key_version_) {}
 };

-static SerializeStateLowCardinality * checkAndGetLowCardinalitySerializeState(
-    ISerialization::SerializeBinaryBulkStatePtr & state)
-{
-    if (!state)
-        throw Exception("Got empty state for SerializationLowCardinality.", ErrorCodes::LOGICAL_ERROR);
-
-    auto * low_cardinality_state = typeid_cast<SerializeStateLowCardinality *>(state.get());
-    if (!low_cardinality_state)
-    {
-        auto & state_ref = *state;
-        throw Exception("Invalid SerializeBinaryBulkState for SerializationLowCardinality. Expected: "
-            + demangle(typeid(SerializeStateLowCardinality).name()) + ", got "
-            + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR);
-    }
-
-    return low_cardinality_state;
-}
-
-static DeserializeStateLowCardinality * checkAndGetLowCardinalityDeserializeState(
-    ISerialization::DeserializeBinaryBulkStatePtr & state)
-{
-    if (!state)
-        throw Exception("Got empty state for SerializationLowCardinality.", ErrorCodes::LOGICAL_ERROR);
-
-    auto * low_cardinality_state = typeid_cast<DeserializeStateLowCardinality *>(state.get());
-    if (!low_cardinality_state)
-    {
-        auto & state_ref = *state;
-        throw Exception("Invalid DeserializeBinaryBulkState for SerializationLowCardinality. Expected: "
-            + demangle(typeid(DeserializeStateLowCardinality).name()) + ", got "
-            + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR);
-    }
-
-    return low_cardinality_state;
-}
-
 void SerializationLowCardinality::serializeBinaryBulkStatePrefix(
     SerializeBinaryBulkSettings & settings,
     SerializeBinaryBulkStatePtr & state) const
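The two per-type helpers deleted above are superseded by a generic checkAndGetState<State>() used in the hunks below. Its definition is not part of this diff; a sketch reconstructed from the helpers it replaces (the real signature may differ):

    /// A sketch only; reproduces the checks of the removed helpers.
    template <typename State, typename StatePtr>
    State * checkAndGetStateSketch(StatePtr & state)
    {
        if (!state)
            throw Exception("Got empty state.", ErrorCodes::LOGICAL_ERROR);

        auto * concrete_state = typeid_cast<State *>(state.get());
        if (!concrete_state)
        {
            auto & state_ref = *state;
            throw Exception("Invalid state. Expected: "
                + demangle(typeid(State).name()) + ", got "
                + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR);
        }

        return concrete_state;
    }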
@@ -282,7 +248,7 @@ void SerializationLowCardinality::serializeBinaryBulkStateSuffix(
     SerializeBinaryBulkSettings & settings,
     SerializeBinaryBulkStatePtr & state) const
 {
-    auto * low_cardinality_state = checkAndGetLowCardinalitySerializeState(state);
+    auto * low_cardinality_state = checkAndGetState<SerializeStateLowCardinality>(state);
     KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value);

     if (low_cardinality_state->shared_dictionary && settings.low_cardinality_max_dictionary_size)
@@ -521,7 +487,7 @@ void SerializationLowCardinality::serializeBinaryBulkWithMultipleStreams(

     const ColumnLowCardinality & low_cardinality_column = typeid_cast<const ColumnLowCardinality &>(column);

-    auto * low_cardinality_state = checkAndGetLowCardinalitySerializeState(state);
+    auto * low_cardinality_state = checkAndGetState<SerializeStateLowCardinality>(state);
     auto & global_dictionary = low_cardinality_state->shared_dictionary;
     KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value);

@@ -620,7 +586,7 @@ void SerializationLowCardinality::deserializeBinaryBulkWithMultipleStreams(
     if (!indexes_stream)
         throw Exception("Got empty stream for SerializationLowCardinality indexes.", ErrorCodes::LOGICAL_ERROR);

-    auto * low_cardinality_state = checkAndGetLowCardinalityDeserializeState(state);
+    auto * low_cardinality_state = checkAndGetState<DeserializeStateLowCardinality>(state);
     KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value);

     auto read_dictionary = [this, low_cardinality_state, keys_stream]()
src/DataTypes/Serializations/SerializationLowCardinality.h
@@ -20,8 +20,7 @@ public:
     void enumerateStreams(
         SubstreamPath & path,
         const StreamCallback & callback,
-        DataTypePtr type,
-        ColumnPtr column) const override;
+        const SubstreamData & data) const override;

     void serializeBinaryBulkStatePrefix(
         SerializeBinaryBulkSettings & settings,
src/DataTypes/Serializations/SerializationMap.cpp
@@ -250,13 +250,17 @@ void SerializationMap::deserializeTextCSV(IColumn & column, ReadBuffer & istr, c
 void SerializationMap::enumerateStreams(
     SubstreamPath & path,
     const StreamCallback & callback,
-    DataTypePtr type,
-    ColumnPtr column) const
+    const SubstreamData & data) const
 {
-    auto next_type = type ? assert_cast<const DataTypeMap &>(*type).getNestedType() : nullptr;
-    auto next_column = column ? assert_cast<const ColumnMap &>(*column).getNestedColumnPtr() : nullptr;
+    SubstreamData next_data =
+    {
+        nested,
+        data.type ? assert_cast<const DataTypeMap &>(*data.type).getNestedType() : nullptr,
+        data.column ? assert_cast<const ColumnMap &>(*data.column).getNestedColumnPtr() : nullptr,
+        data.serialization_info,
+    };

-    nested->enumerateStreams(path, callback, next_type, next_column);
+    nested->enumerateStreams(path, callback, next_data);
 }

 void SerializationMap::serializeBinaryBulkStatePrefix(
src/DataTypes/Serializations/SerializationMap.h
@@ -34,8 +34,7 @@ public:
     void enumerateStreams(
         SubstreamPath & path,
         const StreamCallback & callback,
-        DataTypePtr type,
-        ColumnPtr column) const override;
+        const SubstreamData & data) const override;

     void serializeBinaryBulkStatePrefix(
         SerializeBinaryBulkSettings & settings,
src/DataTypes/Serializations/SerializationNamed.cpp
@@ -6,12 +6,13 @@ namespace DB
 void SerializationNamed::enumerateStreams(
     SubstreamPath & path,
     const StreamCallback & callback,
-    DataTypePtr type,
-    ColumnPtr column) const
+    const SubstreamData & data) const
 {
     addToPath(path);
-    path.back().data = {type, column, getPtr(), std::make_shared<SubcolumnCreator>(name, escape_delimiter)};
-    nested_serialization->enumerateStreams(path, callback, type, column);
+    path.back().data = data;
+    path.back().creator = std::make_shared<SubcolumnCreator>(name, escape_delimiter);
+
+    nested_serialization->enumerateStreams(path, callback, data);
     path.pop_back();
 }

src/DataTypes/Serializations/SerializationNamed.h
@@ -23,8 +23,7 @@ public:
     void enumerateStreams(
         SubstreamPath & path,
         const StreamCallback & callback,
-        DataTypePtr type,
-        ColumnPtr column) const override;
+        const SubstreamData & data) const override;

     void serializeBinaryBulkStatePrefix(
         SerializeBinaryBulkSettings & settings,
src/DataTypes/Serializations/SerializationNullable.cpp
@@ -40,30 +40,35 @@ ColumnPtr SerializationNullable::SubcolumnCreator::create(const ColumnPtr & prev
 void SerializationNullable::enumerateStreams(
     SubstreamPath & path,
     const StreamCallback & callback,
-    DataTypePtr type,
-    ColumnPtr column) const
+    const SubstreamData & data) const
 {
-    const auto * type_nullable = type ? &assert_cast<const DataTypeNullable &>(*type) : nullptr;
-    const auto * column_nullable = column ? &assert_cast<const ColumnNullable &>(*column) : nullptr;
+    const auto * type_nullable = data.type ? &assert_cast<const DataTypeNullable &>(*data.type) : nullptr;
+    const auto * column_nullable = data.column ? &assert_cast<const ColumnNullable &>(*data.column) : nullptr;

     path.push_back(Substream::NullMap);
     path.back().data =
     {
+        std::make_shared<SerializationNamed>(std::make_shared<SerializationNumber<UInt8>>(), "null", false),
         type_nullable ? std::make_shared<DataTypeUInt8>() : nullptr,
         column_nullable ? column_nullable->getNullMapColumnPtr() : nullptr,
-        std::make_shared<SerializationNamed>(std::make_shared<SerializationNumber<UInt8>>(), "null", false),
-        nullptr,
+        data.serialization_info,
     };

     callback(path);

     path.back() = Substream::NullableElements;
-    path.back().data = {type, column, getPtr(), std::make_shared<SubcolumnCreator>(path.back().data.column)};
+    path.back().creator = std::make_shared<SubcolumnCreator>(path.back().data.column);
+    path.back().data = data;

-    auto next_type = type_nullable ? type_nullable->getNestedType() : nullptr;
-    auto next_column = column_nullable ? column_nullable->getNestedColumnPtr() : nullptr;
+    SubstreamData next_data =
+    {
+        nested,
+        type_nullable ? type_nullable->getNestedType() : nullptr,
+        column_nullable ? column_nullable->getNestedColumnPtr() : nullptr,
+        data.serialization_info,
+    };

-    nested->enumerateStreams(path, callback, next_type, next_column);
+    nested->enumerateStreams(path, callback, next_data);
     path.pop_back();
 }

src/DataTypes/Serializations/SerializationNullable.h
@@ -16,8 +16,7 @@ public:
     void enumerateStreams(
         SubstreamPath & path,
         const StreamCallback & callback,
-        DataTypePtr type,
-        ColumnPtr column) const override;
+        const SubstreamData & data) const override;

     void serializeBinaryBulkStatePrefix(
         SerializeBinaryBulkSettings & settings,
src/DataTypes/Serializations/SerializationSparse.cpp (new file, 380 lines)
@@ -0,0 +1,380 @@
#include <DataTypes/Serializations/SerializationSparse.h>
#include <DataTypes/DataTypesNumber.h>
#include <Columns/IColumn.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnSparse.h>
#include <Common/assert_cast.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/VarInt.h>

namespace DB
{

namespace ErrorCodes
{
    extern const int NOT_IMPLEMENTED;
    extern const int LOGICAL_ERROR;
}

namespace
{

/// 2^62, because VarInt supports only values < 2^63.
constexpr auto END_OF_GRANULE_FLAG = 1ULL << 62;

struct DeserializeStateSparse : public ISerialization::DeserializeBinaryBulkState
{
    /// Number of default values that remain from the previous read.
    size_t num_trailing_defaults = 0;
    /// Do we have a non-default value after @num_trailing_defaults?
    bool has_value_after_defaults = false;
    ISerialization::DeserializeBinaryBulkStatePtr nested;

    void reset()
    {
        num_trailing_defaults = 0;
        has_value_after_defaults = false;
    }
};

void serializeOffsets(const IColumn::Offsets & offsets, WriteBuffer & ostr, size_t start, size_t end)
{
    size_t size = offsets.size();
    for (size_t i = 0; i < size; ++i)
    {
        size_t group_size = offsets[i] - start;
        writeVarUInt(group_size, ostr);
        start += group_size + 1;
    }

    size_t group_size = start < end ? end - start : 0;
    group_size |= END_OF_GRANULE_FLAG;
    writeVarUInt(group_size, ostr);
}
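A hand-traced example (not part of the diff) of the encoding: for a granule of 10 rows with non-default values at positions 2 and 5, serializeOffsets emits the run of defaults before each value, then the trailing run tagged with the granule flag:

    /// Illustrative check; assumes WriteBufferFromOwnString from <IO/WriteBufferFromString.h>.
    void exampleOffsets()
    {
        IColumn::Offsets offsets;
        offsets.push_back(2);   /// non-default value at row 2
        offsets.push_back(5);   /// non-default value at row 5

        WriteBufferFromOwnString ostr;
        serializeOffsets(offsets, ostr, 0, 10);

        /// The stream now holds VarUInts 2, 2 and (4 | END_OF_GRANULE_FLAG), i.e.
        /// "2 defaults, value, 2 defaults, value, 4 trailing defaults" = 10 rows.
        /// deserializeOffsets below inverts this, carrying a partially consumed
        /// trailing run in DeserializeStateSparse between calls.
    }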
/// Returns the number of rows read.
/// @start is the size of the column before reading offsets.
size_t deserializeOffsets(IColumn::Offsets & offsets,
    ReadBuffer & istr, size_t start, size_t limit, DeserializeStateSparse & state)
{
    if (limit && state.num_trailing_defaults >= limit)
    {
        state.num_trailing_defaults -= limit;
        return limit;
    }

    /// Just try to guess the number of offsets.
    offsets.reserve(offsets.size()
        + static_cast<size_t>(limit * (1.0 - ColumnSparse::DEFAULT_RATIO_FOR_SPARSE_SERIALIZATION)));

    bool first = true;
    size_t total_rows = state.num_trailing_defaults;
    if (state.has_value_after_defaults)
    {
        offsets.push_back(start + state.num_trailing_defaults);
        first = false;

        state.has_value_after_defaults = false;
        state.num_trailing_defaults = 0;
        ++total_rows;
    }

    size_t group_size;
    while (!istr.eof())
    {
        readVarUInt(group_size, istr);

        bool end_of_granule = group_size & END_OF_GRANULE_FLAG;
        group_size &= ~END_OF_GRANULE_FLAG;

        size_t next_total_rows = total_rows + group_size;
        group_size += state.num_trailing_defaults;

        if (limit && next_total_rows >= limit)
        {
            /// If it was not the last group in the granule,
            /// we have to add the current non-default value at further reads.
            state.num_trailing_defaults = next_total_rows - limit;
            state.has_value_after_defaults = !end_of_granule;
            return limit;
        }

        if (end_of_granule)
        {
            state.has_value_after_defaults = false;
            state.num_trailing_defaults = group_size;
        }
        else
        {
            /// If we add a value to the column for the first time in the current read,
            /// start from the column's current size, because it can have some defaults after the last offset;
            /// otherwise just start from the previous offset.
            size_t start_of_group = start;
            if (!first && !offsets.empty())
                start_of_group = offsets.back() + 1;
            if (first)
                first = false;

            offsets.push_back(start_of_group + group_size);

            state.num_trailing_defaults = 0;
            state.has_value_after_defaults = false;
            ++next_total_rows;
        }

        total_rows = next_total_rows;
    }

    return total_rows;
}

}

SerializationSparse::SerializationSparse(const SerializationPtr & nested_)
    : nested(nested_)
{
}

SerializationPtr SerializationSparse::SubcolumnCreator::create(const SerializationPtr & prev) const
{
    return std::make_shared<SerializationSparse>(prev);
}

ColumnPtr SerializationSparse::SubcolumnCreator::create(const ColumnPtr & prev) const
{
    return ColumnSparse::create(prev, offsets, size);
}

void SerializationSparse::enumerateStreams(
    SubstreamPath & path,
    const StreamCallback & callback,
    const SubstreamData & data) const
{
    const auto * column_sparse = data.column ? &assert_cast<const ColumnSparse &>(*data.column) : nullptr;

    size_t column_size = column_sparse ? column_sparse->size() : 0;

    path.push_back(Substream::SparseOffsets);
    path.back().data =
    {
        std::make_shared<SerializationNumber<UInt64>>(),
        data.type ? std::make_shared<DataTypeUInt64>() : nullptr,
        column_sparse ? column_sparse->getOffsetsPtr() : nullptr,
        data.serialization_info,
    };

    callback(path);

    path.back() = Substream::SparseElements;
    path.back().creator = std::make_shared<SubcolumnCreator>(path.back().data.column, column_size);
    path.back().data = data;

    SubstreamData next_data =
    {
        nested,
        data.type,
        column_sparse ? column_sparse->getValuesPtr() : nullptr,
        data.serialization_info,
    };

    nested->enumerateStreams(path, callback, next_data);
    path.pop_back();
}

void SerializationSparse::serializeBinaryBulkStatePrefix(
    SerializeBinaryBulkSettings & settings,
    SerializeBinaryBulkStatePtr & state) const
{
    settings.path.push_back(Substream::SparseElements);
    nested->serializeBinaryBulkStatePrefix(settings, state);
    settings.path.pop_back();
}

void SerializationSparse::serializeBinaryBulkWithMultipleStreams(
    const IColumn & column,
    size_t offset,
    size_t limit,
    SerializeBinaryBulkSettings & settings,
    SerializeBinaryBulkStatePtr & state) const
{
    size_t size = column.size();

    auto offsets_column = DataTypeNumber<IColumn::Offset>().createColumn();
    auto & offsets_data = assert_cast<ColumnVector<IColumn::Offset> &>(*offsets_column).getData();
    column.getIndicesOfNonDefaultRows(offsets_data, offset, limit);

    settings.path.push_back(Substream::SparseOffsets);
    if (auto * stream = settings.getter(settings.path))
    {
        size_t end = limit && offset + limit < size ? offset + limit : size;
        serializeOffsets(offsets_data, *stream, offset, end);
    }

    if (!offsets_data.empty())
    {
        settings.path.back() = Substream::SparseElements;
        if (const auto * column_sparse = typeid_cast<const ColumnSparse *>(&column))
        {
            const auto & values = column_sparse->getValuesColumn();
            size_t begin = column_sparse->getValueIndex(offsets_data[0]);
            size_t end = column_sparse->getValueIndex(offsets_data.back());
            nested->serializeBinaryBulkWithMultipleStreams(values, begin, end - begin + 1, settings, state);
        }
        else
        {
            auto values = column.index(*offsets_column, 0);
            nested->serializeBinaryBulkWithMultipleStreams(*values, 0, values->size(), settings, state);
        }
    }

    settings.path.pop_back();
}

void SerializationSparse::serializeBinaryBulkStateSuffix(
    SerializeBinaryBulkSettings & settings,
    SerializeBinaryBulkStatePtr & state) const
{
    settings.path.push_back(Substream::SparseElements);
    nested->serializeBinaryBulkStateSuffix(settings, state);
    settings.path.pop_back();
}

void SerializationSparse::deserializeBinaryBulkStatePrefix(
    DeserializeBinaryBulkSettings & settings,
    DeserializeBinaryBulkStatePtr & state) const
{
    auto state_sparse = std::make_shared<DeserializeStateSparse>();

    settings.path.push_back(Substream::SparseElements);
    nested->deserializeBinaryBulkStatePrefix(settings, state_sparse->nested);
    settings.path.pop_back();

    state = std::move(state_sparse);
}

void SerializationSparse::deserializeBinaryBulkWithMultipleStreams(
    ColumnPtr & column,
    size_t limit,
    DeserializeBinaryBulkSettings & settings,
    DeserializeBinaryBulkStatePtr & state,
    SubstreamsCache * cache) const
{
    auto * state_sparse = checkAndGetState<DeserializeStateSparse>(state);

    if (!settings.continuous_reading)
        state_sparse->reset();

    auto mutable_column = column->assumeMutable();
    auto & column_sparse = assert_cast<ColumnSparse &>(*mutable_column);
    auto & offsets_data = column_sparse.getOffsetsData();

    size_t old_size = offsets_data.size();

    size_t read_rows = 0;
    settings.path.push_back(Substream::SparseOffsets);
    if (auto * stream = settings.getter(settings.path))
        read_rows = deserializeOffsets(offsets_data, *stream, column_sparse.size(), limit, *state_sparse);

    auto & values_column = column_sparse.getValuesPtr();
    size_t values_limit = offsets_data.size() - old_size;

    settings.path.back() = Substream::SparseElements;
    nested->deserializeBinaryBulkWithMultipleStreams(values_column, values_limit, settings, state_sparse->nested, cache);
    settings.path.pop_back();

    if (offsets_data.size() + 1 != values_column->size())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Inconsistent sizes of values and offsets in SerializationSparse."
            " Offsets size: {}, values size: {}", offsets_data.size(), values_column->size());

    /// 'insertManyDefaults' just increases the size of the column.
    column_sparse.insertManyDefaults(read_rows);
    column = std::move(mutable_column);
}

/// All methods below just wrap the nested serialization.

void SerializationSparse::serializeBinary(const Field & field, WriteBuffer & ostr) const
{
    nested->serializeBinary(field, ostr);
}

void SerializationSparse::deserializeBinary(Field & field, ReadBuffer & istr) const
{
    nested->deserializeBinary(field, istr);
}

void SerializationSparse::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const
{
    const auto & column_sparse = assert_cast<const ColumnSparse &>(column);
    nested->serializeBinary(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr);
}

void SerializationSparse::deserializeBinary(IColumn &, ReadBuffer &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeBinary' is not implemented for SerializationSparse");
}

void SerializationSparse::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    const auto & column_sparse = assert_cast<const ColumnSparse &>(column);
    nested->serializeTextEscaped(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings);
}

void SerializationSparse::deserializeTextEscaped(IColumn &, ReadBuffer &, const FormatSettings &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeTextEscaped' is not implemented for SerializationSparse");
}

void SerializationSparse::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    const auto & column_sparse = assert_cast<const ColumnSparse &>(column);
    nested->serializeTextQuoted(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings);
}

void SerializationSparse::deserializeTextQuoted(IColumn &, ReadBuffer &, const FormatSettings &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeTextQuoted' is not implemented for SerializationSparse");
}

void SerializationSparse::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    const auto & column_sparse = assert_cast<const ColumnSparse &>(column);
    nested->serializeTextCSV(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings);
}

void SerializationSparse::deserializeTextCSV(IColumn &, ReadBuffer &, const FormatSettings &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeTextCSV' is not implemented for SerializationSparse");
}

void SerializationSparse::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    const auto & column_sparse = assert_cast<const ColumnSparse &>(column);
    nested->serializeText(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings);
}

void SerializationSparse::deserializeWholeText(IColumn &, ReadBuffer &, const FormatSettings &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeWholeText' is not implemented for SerializationSparse");
}

void SerializationSparse::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    const auto & column_sparse = assert_cast<const ColumnSparse &>(column);
    nested->serializeTextJSON(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings);
}

void SerializationSparse::deserializeTextJSON(IColumn &, ReadBuffer &, const FormatSettings &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'deserializeTextJSON' is not implemented for SerializationSparse");
}

void SerializationSparse::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    const auto & column_sparse = assert_cast<const ColumnSparse &>(column);
    nested->serializeTextXML(column_sparse.getValuesColumn(), column_sparse.getValueIndex(row_num), ostr, settings);
}

}
src/DataTypes/Serializations/SerializationSparse.h (new file, 103 lines)
@@ -0,0 +1,103 @@
#pragma once

#include <DataTypes/Serializations/ISerialization.h>

namespace DB
{

/** Serialization for sparse representation.
 *  Only '{serialize,deserialize}BinaryBulk' makes sense.
 *  Format:
 *  Values and offsets are written to separate substreams.
 *  Only non-default values are written.
 *
 *  Offsets have a position-independent format: the i-th offset is written
 *  as the number of default values that precede the i-th non-default value.
 *  Offsets are written in VarInt encoding.
 *  Additionally, at the end of every call of 'serializeBinaryBulkWithMultipleStreams'
 *  the number of default values in the suffix of the currently written part of the
 *  column is written, marked with a flag that denotes the end of this portion of data.
 *  This value is used, e.g., to allow independent reading of granules in MergeTree.
 */
class SerializationSparse final : public ISerialization
{
public:
    SerializationSparse(const SerializationPtr & nested_);

    Kind getKind() const override { return Kind::SPARSE; }

    void enumerateStreams(
        SubstreamPath & path,
        const StreamCallback & callback,
        const SubstreamData & data) const override;

    void serializeBinaryBulkStatePrefix(
        SerializeBinaryBulkSettings & settings,
        SerializeBinaryBulkStatePtr & state) const override;

    void serializeBinaryBulkStateSuffix(
        SerializeBinaryBulkSettings & settings,
        SerializeBinaryBulkStatePtr & state) const override;

    void deserializeBinaryBulkStatePrefix(
        DeserializeBinaryBulkSettings & settings,
        DeserializeBinaryBulkStatePtr & state) const override;

    /// Allows to write ColumnSparse and other columns in sparse serialization.
    void serializeBinaryBulkWithMultipleStreams(
        const IColumn & column,
        size_t offset,
        size_t limit,
        SerializeBinaryBulkSettings & settings,
        SerializeBinaryBulkStatePtr & state) const override;

    /// Allows to read only ColumnSparse.
    void deserializeBinaryBulkWithMultipleStreams(
        ColumnPtr & column,
        size_t limit,
        DeserializeBinaryBulkSettings & settings,
        DeserializeBinaryBulkStatePtr & state,
        SubstreamsCache * cache) const override;

    void serializeBinary(const Field & field, WriteBuffer & ostr) const override;
    void deserializeBinary(Field & field, ReadBuffer & istr) const override;

    void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override;
    void deserializeBinary(IColumn & column, ReadBuffer & istr) const override;

    void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
    void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;

    void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
    void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;

    void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
    void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;

    void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
    void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;

    void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
    void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;

    void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;

private:
    struct SubcolumnCreator : public ISubcolumnCreator
    {
        const ColumnPtr offsets;
        const size_t size;

        SubcolumnCreator(const ColumnPtr & offsets_, size_t size_)
            : offsets(offsets_), size(size_) {}

        DataTypePtr create(const DataTypePtr & prev) const override { return prev; }
        SerializationPtr create(const SerializationPtr & prev) const override;
        ColumnPtr create(const ColumnPtr & prev) const override;
    };

    SerializationPtr nested;
};

}
src/DataTypes/Serializations/SerializationTuple.cpp
@@ -1,5 +1,6 @@
 #include <base/range.h>
 #include <DataTypes/Serializations/SerializationTuple.h>
+#include <DataTypes/Serializations/SerializationInfoTuple.h>
 #include <DataTypes/DataTypeTuple.h>
 #include <Core/Field.h>
 #include <Columns/ColumnTuple.h>
@@ -16,7 +17,6 @@ namespace ErrorCodes
 {
     extern const int SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH;
     extern const int NOT_FOUND_COLUMN_IN_BLOCK;
-    extern const int LOGICAL_ERROR;
 }

@@ -286,18 +286,23 @@ void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr,
 void SerializationTuple::enumerateStreams(
     SubstreamPath & path,
     const StreamCallback & callback,
-    DataTypePtr type,
-    ColumnPtr column) const
+    const SubstreamData & data) const
 {
-    const auto * type_tuple = type ? &assert_cast<const DataTypeTuple &>(*type) : nullptr;
-    const auto * column_tuple = column ? &assert_cast<const ColumnTuple &>(*column) : nullptr;
+    const auto * type_tuple = data.type ? &assert_cast<const DataTypeTuple &>(*data.type) : nullptr;
+    const auto * column_tuple = data.column ? &assert_cast<const ColumnTuple &>(*data.column) : nullptr;
+    const auto * info_tuple = data.serialization_info ? &assert_cast<const SerializationInfoTuple &>(*data.serialization_info) : nullptr;

     for (size_t i = 0; i < elems.size(); ++i)
     {
-        auto next_type = type_tuple ? type_tuple->getElement(i) : nullptr;
-        auto next_column = column_tuple ? column_tuple->getColumnPtr(i) : nullptr;
+        SubstreamData next_data =
+        {
+            elems[i],
+            type_tuple ? type_tuple->getElement(i) : nullptr,
+            column_tuple ? column_tuple->getColumnPtr(i) : nullptr,
+            info_tuple ? info_tuple->getElementInfo(i) : nullptr,
+        };

-        elems[i]->enumerateStreams(path, callback, next_type, next_column);
+        elems[i]->enumerateStreams(path, callback, next_data);
     }
 }
@@ -311,39 +316,6 @@ struct DeserializeBinaryBulkStateTuple : public ISerialization::DeserializeBinar
     std::vector<ISerialization::DeserializeBinaryBulkStatePtr> states;
 };

-static SerializeBinaryBulkStateTuple * checkAndGetTupleSerializeState(ISerialization::SerializeBinaryBulkStatePtr & state)
-{
-    if (!state)
-        throw Exception("Got empty state for DataTypeTuple.", ErrorCodes::LOGICAL_ERROR);
-
-    auto * tuple_state = typeid_cast<SerializeBinaryBulkStateTuple *>(state.get());
-    if (!tuple_state)
-    {
-        auto & state_ref = *state;
-        throw Exception("Invalid SerializeBinaryBulkState for DataTypeTuple. Expected: "
-            + demangle(typeid(SerializeBinaryBulkStateTuple).name()) + ", got "
-            + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR);
-    }
-
-    return tuple_state;
-}
-
-static DeserializeBinaryBulkStateTuple * checkAndGetTupleDeserializeState(ISerialization::DeserializeBinaryBulkStatePtr & state)
-{
-    if (!state)
-        throw Exception("Got empty state for DataTypeTuple.", ErrorCodes::LOGICAL_ERROR);
-
-    auto * tuple_state = typeid_cast<DeserializeBinaryBulkStateTuple *>(state.get());
-    if (!tuple_state)
-    {
-        auto & state_ref = *state;
-        throw Exception("Invalid DeserializeBinaryBulkState for DataTypeTuple. Expected: "
-            + demangle(typeid(DeserializeBinaryBulkStateTuple).name()) + ", got "
-            + demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR);
-    }
-
-    return tuple_state;
-}
-
 void SerializationTuple::serializeBinaryBulkStatePrefix(
     SerializeBinaryBulkSettings & settings,
@@ -362,7 +334,7 @@ void SerializationTuple::serializeBinaryBulkStateSuffix(
     SerializeBinaryBulkSettings & settings,
     SerializeBinaryBulkStatePtr & state) const
 {
-    auto * tuple_state = checkAndGetTupleSerializeState(state);
+    auto * tuple_state = checkAndGetState<SerializeBinaryBulkStateTuple>(state);

     for (size_t i = 0; i < elems.size(); ++i)
         elems[i]->serializeBinaryBulkStateSuffix(settings, tuple_state->states[i]);
@@ -388,7 +360,7 @@ void SerializationTuple::serializeBinaryBulkWithMultipleStreams(
     SerializeBinaryBulkSettings & settings,
     SerializeBinaryBulkStatePtr & state) const
 {
-    auto * tuple_state = checkAndGetTupleSerializeState(state);
+    auto * tuple_state = checkAndGetState<SerializeBinaryBulkStateTuple>(state);

     for (const auto i : collections::range(0, elems.size()))
     {
@@ -404,7 +376,7 @@ void SerializationTuple::deserializeBinaryBulkWithMultipleStreams(
     DeserializeBinaryBulkStatePtr & state,
     SubstreamsCache * cache) const
 {
-    auto * tuple_state = checkAndGetTupleDeserializeState(state);
+    auto * tuple_state = checkAndGetState<DeserializeBinaryBulkStateTuple>(state);

     auto mutable_column = column->assumeMutable();
     auto & column_tuple = assert_cast<ColumnTuple &>(*mutable_column);
src/DataTypes/Serializations/SerializationTuple.h
@@ -13,7 +13,9 @@ public:
     using ElementSerializations = std::vector<ElementSerializationPtr>;

     SerializationTuple(const ElementSerializations & elems_, bool have_explicit_names_)
-        : elems(elems_), have_explicit_names(have_explicit_names_) {}
+        : elems(elems_), have_explicit_names(have_explicit_names_)
+    {
+    }

     void serializeBinary(const Field & field, WriteBuffer & ostr) const override;
     void deserializeBinary(Field & field, ReadBuffer & istr) const override;
@@ -34,8 +36,7 @@ public:
     void enumerateStreams(
         SubstreamPath & path,
         const StreamCallback & callback,
-        DataTypePtr type,
-        ColumnPtr column) const override;
+        const SubstreamData & data) const override;

     void serializeBinaryBulkStatePrefix(
         SerializeBinaryBulkSettings & settings,
@@ -63,6 +64,8 @@ public:
         DeserializeBinaryBulkStatePtr & state,
         SubstreamsCache * cache) const override;

+    const ElementSerializations & getElementsSerializations() const { return elems; }
+
 private:
     ElementSerializations elems;
     bool have_explicit_names;
src/DataTypes/Serializations/SerializationWrapper.cpp
@@ -7,10 +7,9 @@ namespace DB
 void SerializationWrapper::enumerateStreams(
     SubstreamPath & path,
     const StreamCallback & callback,
-    DataTypePtr type,
-    ColumnPtr column) const
+    const SubstreamData & data) const
 {
-    nested_serialization->enumerateStreams(path, callback, type, column);
+    nested_serialization->enumerateStreams(path, callback, data);
 }

 void SerializationWrapper::serializeBinaryBulkStatePrefix(

src/DataTypes/Serializations/SerializationWrapper.h
@@ -16,11 +16,14 @@ protected:
 public:
     SerializationWrapper(const SerializationPtr & nested_serialization_) : nested_serialization(nested_serialization_) {}

+    const SerializationPtr & getNested() const { return nested_serialization; }
+
+    Kind getKind() const override { return nested_serialization->getKind(); }
+
     void enumerateStreams(
         SubstreamPath & path,
         const StreamCallback & callback,
-        DataTypePtr type,
-        ColumnPtr column) const override;
+        const SubstreamData & data) const override;

     void serializeBinaryBulkStatePrefix(
         SerializeBinaryBulkSettings & settings,
src/DataTypes/tests/gtest_split_name.cpp (new file, 32 lines)
@@ -0,0 +1,32 @@
#include <DataTypes/NestedUtils.h>

#include <gtest/gtest.h>

using namespace DB;

TEST(SplitName, forward)
{
    ASSERT_EQ(Nested::splitName(String("abc")), (std::pair<std::string, std::string>{"abc", ""}));
    ASSERT_EQ(Nested::splitName(String("a.b")), (std::pair<std::string, std::string>{"a", "b"}));
    ASSERT_EQ(Nested::splitName(String("a.b.c")), (std::pair<std::string, std::string>{"a", "b.c"}));
    ASSERT_EQ(Nested::splitName(String("a.1")), (std::pair<std::string, std::string>{"a", "1"}));
    ASSERT_EQ(Nested::splitName(String("a.1.b")), (std::pair<std::string, std::string>{"a", "1.b"}));
    ASSERT_EQ(Nested::splitName(String("1.a")), (std::pair<std::string, std::string>{"1", "a"}));
    ASSERT_EQ(Nested::splitName(String("a.b1.b2")), (std::pair<std::string, std::string>{"a", "b1.b2"}));
    ASSERT_EQ(Nested::splitName(String("a.b1.2a.3a")), (std::pair<std::string, std::string>{"a", "b1.2a.3a"}));
    ASSERT_EQ(Nested::splitName(String("..")), (std::pair<std::string, std::string>{"..", ""}));
}

TEST(SplitName, reverse)
{
    ASSERT_EQ(Nested::splitName(String("abc"), true), (std::pair<std::string, std::string>{"abc", ""}));
    ASSERT_EQ(Nested::splitName(String("a.b"), true), (std::pair<std::string, std::string>{"a", "b"}));
    ASSERT_EQ(Nested::splitName(String("a.b.c"), true), (std::pair<std::string, std::string>{"a.b", "c"}));
    ASSERT_EQ(Nested::splitName(String("a.1"), true), (std::pair<std::string, std::string>{"a", "1"}));
    ASSERT_EQ(Nested::splitName(String("a.1a.b"), true), (std::pair<std::string, std::string>{"a.1a", "b"}));
    ASSERT_EQ(Nested::splitName(String("1a.b"), true), (std::pair<std::string, std::string>{"1a", "b"}));
    ASSERT_EQ(Nested::splitName(String("a.b1.b2"), true), (std::pair<std::string, std::string>{"a.b1", "b2"}));
    ASSERT_EQ(Nested::splitName(String("a.b1.2a.3a"), true), (std::pair<std::string, std::string>{"a.b1.2a", "3a"}));
    ASSERT_EQ(Nested::splitName(String("a.b1.b2.b3"), true), (std::pair<std::string, std::string>{"a.b1.b2", "b3"}));
    ASSERT_EQ(Nested::splitName(String(".."), true), (std::pair<std::string, std::string>{"..", ""}));
}
src/Databases/DatabaseFactory.cpp
@@ -23,6 +23,8 @@
 # include <Databases/MySQL/ConnectionMySQLSettings.h>
 # include <Databases/MySQL/DatabaseMySQL.h>
 # include <Databases/MySQL/MaterializedMySQLSettings.h>
+# include <Storages/MySQL/MySQLHelpers.h>
+# include <Storages/MySQL/MySQLSettings.h>
 # include <Databases/MySQL/DatabaseMaterializedMySQL.h>
 # include <mysqlxx/Pool.h>
 #endif
@@ -198,13 +200,15 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String
     if (engine_name == "MySQL")
     {
         auto mysql_database_settings = std::make_unique<ConnectionMySQLSettings>();
-        auto mysql_pool = mysqlxx::PoolWithFailover(configuration.database, configuration.addresses, configuration.username, configuration.password);
+        MySQLSettings mysql_settings;
+        auto mysql_pool = createMySQLPoolWithFailover(configuration, mysql_settings);

         mysql_database_settings->loadFromQueryContext(context);
         mysql_database_settings->loadFromQuery(*engine_define); /// higher priority

         return std::make_shared<DatabaseMySQL>(
-            context, database_name, metadata_path, engine_define, configuration.database, std::move(mysql_database_settings), std::move(mysql_pool));
+            context, database_name, metadata_path, engine_define, configuration.database,
+            std::move(mysql_database_settings), std::move(mysql_pool), create.attach);
     }

     MySQLClient client(configuration.host, configuration.port, configuration.username, configuration.password);
Some files were not shown because too many files have changed in this diff.