Merge branch 'master' into fix_loading_dependencies

This commit is contained in:
Alexander Tokmakov 2022-10-07 18:58:11 +02:00
commit 014784a9ca
584 changed files with 5064 additions and 2132 deletions

1
.gitattributes vendored
View File

@ -1,3 +1,4 @@
contrib/* linguist-vendored
*.h linguist-language=C++
tests/queries/0_stateless/data_json/* binary
tests/queries/0_stateless/*.reference -crlf

3
.gitmodules vendored
View File

@ -284,3 +284,6 @@
[submodule "contrib/llvm-project"]
path = contrib/llvm-project
url = https://github.com/ClickHouse/llvm-project.git
[submodule "contrib/corrosion"]
path = contrib/corrosion
url = https://github.com/corrosion-rs/corrosion.git

View File

@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.15)
cmake_minimum_required(VERSION 3.20)
project(ClickHouse LANGUAGES C CXX ASM)
@ -495,6 +495,14 @@ endif ()
enable_testing() # Enable for tests without binary
# Opt-in switch (default OFF) for building against OpenSSL instead of the bundled BoringSSL.
# The option text itself states that this configuration is insecure and not recommended.
option(ENABLE_EXTERNAL_OPENSSL "This option is insecure and not recommended for any occasions. If it is enabled, it allows building with alternative OpenSSL library. By default, ClickHouse is using BoringSSL, which is better. Do not use this option." OFF)
if (ENABLE_EXTERNAL_OPENSSL)
message (STATUS "Build and uses OpenSSL library instead of BoringSSL. This is strongly discouraged. Your build of ClickHouse will be unsupported.")
# Selecting OpenSSL forces SSL support on.
set(ENABLE_SSL 1)
# Silence deprecation warnings for everything that links global-group.
# NOTE(review): presumably needed because OpenSSL marks APIs used by the code base as deprecated - confirm.
target_compile_options(global-group INTERFACE "-Wno-deprecated-declarations")
endif ()
# when installing to /usr - place configs to /etc but for /usr/local place to /usr/local/etc
if (CMAKE_INSTALL_PREFIX STREQUAL "/usr")
set (CLICKHOUSE_ETC_DIR "/etc")
@ -557,9 +565,9 @@ macro (clickhouse_add_executable target)
endif()
endmacro()
# With cross-compiling, all targets are built for the target platform, which is usually different from the host
# platform. This is problematic if a build artifact X (e.g. a file or an executable) is generated by running
# another executable Y previously produced in the build. This is solved by compiling and running Y for/on
# With cross-compiling, all targets are built for the target platform, which is usually different from the host
# platform. This is problematic if a build artifact X (e.g. a file or an executable) is generated by running
# another executable Y previously produced in the build. This is solved by compiling and running Y for/on
# the host platform. Add target to the list:
# add_native_target(<target> ...)
set_property (GLOBAL PROPERTY NATIVE_BUILD_TARGETS)
@ -567,13 +575,17 @@ function (add_native_target)
set_property (GLOBAL APPEND PROPERTY NATIVE_BUILD_TARGETS ${ARGV})
endfunction (add_native_target)
set(ConfigIncludePath ${CMAKE_CURRENT_BINARY_DIR}/includes/configs CACHE INTERNAL "Path to generated configuration files.")
include_directories(${ConfigIncludePath})
set(CONFIG_INCLUDE_PATH ${CMAKE_CURRENT_BINARY_DIR}/includes/configs CACHE INTERNAL "Path to generated configuration files.")
include_directories(${CONFIG_INCLUDE_PATH})
# Add as many warnings as possible for our own code.
include (cmake/warnings.cmake)
include (cmake/print_flags.cmake)
if (ENABLE_RUST)
add_subdirectory (rust)
endif()
add_subdirectory (base)
add_subdirectory (src)
add_subdirectory (programs)
@ -584,7 +596,7 @@ include (cmake/sanitize_target_link_libraries.cmake)
# Build native targets if necessary
get_property(NATIVE_BUILD_TARGETS GLOBAL PROPERTY NATIVE_BUILD_TARGETS)
if (NATIVE_BUILD_TARGETS
if (NATIVE_BUILD_TARGETS
AND NOT(
CMAKE_HOST_SYSTEM_NAME STREQUAL CMAKE_SYSTEM_NAME
AND CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL CMAKE_SYSTEM_PROCESSOR

View File

@ -5,6 +5,7 @@ ClickHouse® is an open-source column-oriented database management system that a
## Useful Links
* [Official website](https://clickhouse.com/) has a quick high-level overview of ClickHouse on the main page.
* [ClickHouse Cloud](https://clickhouse.com/cloud) ClickHouse as a service, built by the creators and maintainers.
* [Tutorial](https://clickhouse.com/docs/en/getting_started/tutorial/) shows how to set up and query a small ClickHouse cluster.
* [Documentation](https://clickhouse.com/docs/en/) provides more in-depth information.
* [YouTube channel](https://www.youtube.com/c/ClickHouseDB) has a lot of content about ClickHouse in video format.

View File

@ -176,6 +176,249 @@ void __explicit_bzero_chk(void * buf, size_t len, size_t unused)
}
#include <unistd.h>
#include "syscall.h"
/* Thin wrapper: forwards all arguments unchanged to the copy_file_range system call
 * and returns whatever syscall() returns. */
ssize_t copy_file_range(int fd_in, off_t *off_in, int fd_out, off_t *off_out, size_t len, unsigned flags)
{
    const long rc = syscall(SYS_copy_file_range, fd_in, off_in, fd_out, off_out, len, flags);
    return rc;
}
/* Thin wrapper: forwards all arguments unchanged to the splice system call
 * and returns whatever syscall() returns. */
long splice(int fd_in, off_t *off_in, int fd_out, off_t *off_out, size_t len, unsigned flags)
{
    const long rc = syscall(SYS_splice, fd_in, off_in, fd_out, off_out, len, flags);
    return rc;
}
#define _BSD_SOURCE
#include <sys/stat.h>
#include <stdint.h>
/* Local definition of struct statx, the buffer type that the statx() wrapper in this file
 * hands to SYS_statx. It is only fixed-width integer fields plus four timestamp sub-structs.
 * NOTE(review): the definition is skipped on aarch64, presumably because the headers used
 * for that target already declare struct statx - confirm. */
#if !defined(__aarch64__)
struct statx {
uint32_t stx_mask;
uint32_t stx_blksize;
uint64_t stx_attributes;
uint32_t stx_nlink;
uint32_t stx_uid;
uint32_t stx_gid;
uint16_t stx_mode;
uint16_t pad1;
uint64_t stx_ino;
uint64_t stx_size;
uint64_t stx_blocks;
uint64_t stx_attributes_mask;
/* One timestamp record; four instances are declared (atime, btime, ctime, mtime). */
struct {
int64_t tv_sec;
uint32_t tv_nsec;
int32_t pad;
} stx_atime, stx_btime, stx_ctime, stx_mtime;
uint32_t stx_rdev_major;
uint32_t stx_rdev_minor;
uint32_t stx_dev_major;
uint32_t stx_dev_minor;
/* Trailing spare space (14 x 64-bit words). */
uint64_t spare[14];
};
#endif
/* Thin wrapper: forwards all arguments unchanged to the statx system call.
 * The long result of syscall() is narrowed to int, as in the implicit conversion it replaces. */
int statx(int fd, const char *restrict path, int flag,
          unsigned int mask, struct statx *restrict statxbuf)
{
    return (int)syscall(SYS_statx, fd, path, flag, mask, statxbuf);
}
#include <syscall.h>
/* Thin wrapper: forwards buf, buflen and flags unchanged to the getrandom system call. */
ssize_t getrandom(void *buf, size_t buflen, unsigned flags)
{
    /// An earlier version used the cancellable variant (syscall_cp); cancellation is not
    /// a concern here, so the plain syscall is used.
    const long rc = syscall(SYS_getrandom, buf, buflen, flags);
    return rc;
}
#include <errno.h>
#include <limits.h>
/* Helpers for scanning a string one size_t word at a time. */
/* Word size in bytes; also the alignment required before word-wise reads start. */
#define ALIGN (sizeof(size_t))
/* A word with the value 1 in every byte (0x0101...01). */
#define ONES ((size_t)-1/UCHAR_MAX)
/* A word with only the top bit of every byte set (0x8080...80). */
#define HIGHS (ONES * (UCHAR_MAX/2+1))
/* Non-zero if and only if at least one byte of x is zero. */
#define HASZERO(x) ((x)-ONES & ~(x) & HIGHS)
/* Returns a pointer to the first byte in s equal to (unsigned char)c, or to the
 * terminating NUL byte of s if c does not occur. Never returns a null pointer. */
char *__strchrnul(const char *s, int c)
{
c = (unsigned char)c;
/* Searching for NUL itself: the answer is simply the end of the string. */
if (!c) return (char *)s + strlen(s);
#ifdef __GNUC__
typedef size_t __attribute__((__may_alias__)) word;
const word *w;
/* Byte-wise scan until s is aligned to a word boundary. */
for (; (uintptr_t)s % ALIGN; s++)
if (!*s || *(unsigned char *)s == c) return (char *)s;
/* k holds c replicated into every byte, so *w^k has a zero byte wherever *w has byte c. */
size_t k = ONES * c;
/* Skip whole words that contain neither a NUL byte nor a byte equal to c. */
for (w = (void *)s; !HASZERO(*w) && !HASZERO(*w^k); w++);
s = (void *)w;
#endif
/* Byte-wise scan to locate the exact position (the only scan when __GNUC__ is not defined). */
for (; *s && *(unsigned char *)s != c; s++);
return (char *)s;
}
/* Executes `file` with the given argv/envp, searching the directories listed in the PATH
 * environment variable when `file` contains no '/'.
 * Returns -1 with errno set on failure (execve does not return on success). */
int __execvpe(const char *file, char *const argv[], char *const envp[])
{
const char *p, *z, *path = getenv("PATH");
size_t l, k;
int seen_eacces = 0;
/* Default error if nothing below overrides it (e.g. empty file name). */
errno = ENOENT;
if (!*file) return -1;
/* A name containing a slash is used as-is, without a PATH search. */
if (strchr(file, '/'))
return execve(file, argv, envp);
/* Fallback search path when PATH is not set. */
if (!path) path = "/usr/local/bin:/bin:/usr/bin";
/* k = length of file; names longer than NAME_MAX are rejected. */
k = strnlen(file, NAME_MAX+1);
if (k > NAME_MAX) {
errno = ENAMETOOLONG;
return -1;
}
/* l bounds the length of a single PATH entry (including the '/' separator). */
l = strnlen(path, PATH_MAX-1)+1;
/* Walk PATH one ':'-separated entry at a time; p is the entry start, z its end. */
for(p=path; ; p=z) {
/* Scratch buffer large enough for "<entry>/<file>\0". */
char b[l+k+1];
z = __strchrnul(p, ':');
/* Entry too long for the buffer: skip it (or stop if it was the last one). */
if (z-p >= l) {
if (!*z++) break;
continue;
}
memcpy(b, p, z-p);
b[z-p] = '/';
/* For an empty entry (z == p) the file name overwrites the '/', giving a path
 * relative to the current directory; otherwise it is appended after the '/'. */
memcpy(b+(z-p)+(z>p), file, k+1);
execve(b, argv, envp);
/* execve returned, so it failed; decide whether to keep searching. */
switch (errno) {
case EACCES:
/* Remember the permission error, then keep searching (intentional fallthrough). */
seen_eacces = 1;
case ENOENT:
case ENOTDIR:
break;
default:
/* Any other error aborts the search immediately. */
return -1;
}
/* Advance past the ':'; stop when the end of PATH was reached. */
if (!*z++) break;
}
/* If any candidate failed with EACCES, report that instead of the last error. */
if (seen_eacces) errno = EACCES;
return -1;
}
#include "spawn.h"
/* PATH-searching variant of posix_spawn.
 * Copies the caller's attributes (or starts from zeroed ones when attr is null), stores
 * __execvpe in the copy's __fn slot and delegates to posix_spawn with that copy.
 * NOTE(review): presumably posix_spawn invokes __fn in place of execve - confirm. */
int posix_spawnp(pid_t *restrict res, const char *restrict file,
                 const posix_spawn_file_actions_t *fa,
                 const posix_spawnattr_t *restrict attr,
                 char *const argv[restrict], char *const envp[restrict])
{
    posix_spawnattr_t path_search_attr = { 0 };

    if (attr != 0)
        path_search_attr = *attr;
    path_search_attr.__fn = (void *)__execvpe;

    return posix_spawn(res, file, fa, &path_search_attr, argv, envp);
}
/* Command codes stored in fdop.cmd; each posix_spawn_file_actions_add* function records one. */
#define FDOP_CLOSE 1
#define FDOP_DUP2 2
#define FDOP_OPEN 3
#define FDOP_CHDIR 4
#define FDOP_FCHDIR 5
/* Error codes returned by the posix_spawn_file_actions_add* functions. */
#define ENOMEM 12
#define EBADF 9
/* One recorded file action: a node of the doubly linked list whose head is kept in
 * posix_spawn_file_actions_t.__actions. Which fields are filled in depends on cmd. */
struct fdop {
struct fdop *next, *prev;
int cmd, fd, srcfd, oflag;
mode_t mode;
/* Flexible array member: NUL-terminated path, used by the open and chdir actions. */
char path[];
};
/* Initializes fa to an empty action list. Always returns 0. */
int posix_spawn_file_actions_init(posix_spawn_file_actions_t *fa)
{
    fa->__actions = 0;

    return 0;
}
/* Records a "chdir to path" action at the head of fa's action list.
 * The path is copied into the new node. Returns 0 on success, ENOMEM if allocation fails. */
int posix_spawn_file_actions_addchdir_np(posix_spawn_file_actions_t *restrict fa, const char *restrict path)
{
    struct fdop *action = malloc(sizeof *action + strlen(path) + 1);
    if (action == 0)
        return ENOMEM;

    action->cmd = FDOP_CHDIR;
    action->fd = -1;
    strcpy(action->path, path);

    /* Link the new node in front of the current head. */
    struct fdop *old_head = fa->__actions;
    action->next = old_head;
    if (old_head)
        old_head->prev = action;
    action->prev = 0;
    fa->__actions = action;

    return 0;
}
/* Records a "close fd" action at the head of fa's action list.
 * Returns 0 on success, EBADF for a negative fd, ENOMEM if allocation fails. */
int posix_spawn_file_actions_addclose(posix_spawn_file_actions_t *fa, int fd)
{
    if (fd < 0)
        return EBADF;

    struct fdop *action = malloc(sizeof *action);
    if (action == 0)
        return ENOMEM;

    action->cmd = FDOP_CLOSE;
    action->fd = fd;

    /* Link the new node in front of the current head. */
    struct fdop *old_head = fa->__actions;
    action->next = old_head;
    if (old_head)
        old_head->prev = action;
    action->prev = 0;
    fa->__actions = action;

    return 0;
}
/* Records a "dup2(srcfd, fd)" action at the head of fa's action list.
 * Returns 0 on success, EBADF if either descriptor is negative, ENOMEM if allocation fails. */
int posix_spawn_file_actions_adddup2(posix_spawn_file_actions_t *fa, int srcfd, int fd)
{
    if (srcfd < 0 || fd < 0)
        return EBADF;

    struct fdop *action = malloc(sizeof *action);
    if (action == 0)
        return ENOMEM;

    action->cmd = FDOP_DUP2;
    action->srcfd = srcfd;
    action->fd = fd;

    /* Link the new node in front of the current head. */
    struct fdop *old_head = fa->__actions;
    action->next = old_head;
    if (old_head)
        old_head->prev = action;
    action->prev = 0;
    fa->__actions = action;

    return 0;
}
/* Records an "fchdir(fd)" action at the head of fa's action list.
 * Returns 0 on success, EBADF for a negative fd, ENOMEM if allocation fails. */
int posix_spawn_file_actions_addfchdir_np(posix_spawn_file_actions_t *fa, int fd)
{
    if (fd < 0)
        return EBADF;

    struct fdop *action = malloc(sizeof *action);
    if (action == 0)
        return ENOMEM;

    action->cmd = FDOP_FCHDIR;
    action->fd = fd;

    /* Link the new node in front of the current head. */
    struct fdop *old_head = fa->__actions;
    action->next = old_head;
    if (old_head)
        old_head->prev = action;
    action->prev = 0;
    fa->__actions = action;

    return 0;
}
/* Records an "open path with flags/mode as descriptor fd" action at the head of fa's
 * action list. The path is copied into the new node.
 * Returns 0 on success, EBADF for a negative fd, ENOMEM if allocation fails. */
int posix_spawn_file_actions_addopen(posix_spawn_file_actions_t *restrict fa, int fd, const char *restrict path, int flags, mode_t mode)
{
    if (fd < 0)
        return EBADF;

    struct fdop *action = malloc(sizeof *action + strlen(path) + 1);
    if (action == 0)
        return ENOMEM;

    action->cmd = FDOP_OPEN;
    action->fd = fd;
    action->oflag = flags;
    action->mode = mode;
    strcpy(action->path, path);

    /* Link the new node in front of the current head. */
    struct fdop *old_head = fa->__actions;
    action->next = old_head;
    if (old_head)
        old_head->prev = action;
    action->prev = 0;
    fa->__actions = action;

    return 0;
}
/* Frees every node of fa's action list. Always returns 0.
 * fa->__actions itself is left untouched. */
int posix_spawn_file_actions_destroy(posix_spawn_file_actions_t *fa)
{
    struct fdop *following;

    for (struct fdop *cur = fa->__actions; cur; cur = following) {
        /* Read the link before the node is released. */
        following = cur->next;
        free(cur);
    }

    return 0;
}
#if defined (__cplusplus)
}
#endif

View File

@ -0,0 +1,32 @@
#ifndef _SPAWN_H
#define _SPAWN_H

/* Minimal <spawn.h> replacement: declares the attribute and file-action objects and
 * posix_spawn() itself. */

#include <features.h>
/* The declarations below use pid_t and sigset_t; include their defining headers so this
 * header is self-contained and does not depend on what the includer pulled in first. */
#include <sys/types.h>
#include <signal.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Spawn attributes. __fn holds a pointer to the exec function to use; posix_spawnp stores
 * its PATH-searching exec routine there. The struct is padded with 64 - sizeof(void *) bytes. */
typedef struct {
    int __flags;
    pid_t __pgrp;
    sigset_t __def, __mask;
    int __prio, __pol;
    void *__fn;
    char __pad[64-sizeof(void *)];
} posix_spawnattr_t;

/* File actions object. __actions is the head of the list of recorded actions
 * (null when the list is empty). */
typedef struct {
    int __pad0[2];
    void *__actions;
    int __pad[16];
} posix_spawn_file_actions_t;

int posix_spawn(pid_t *__restrict, const char *__restrict, const posix_spawn_file_actions_t *,
                const posix_spawnattr_t *__restrict, char *const *__restrict, char *const *__restrict);

#ifdef __cplusplus
}
#endif

#endif

2
contrib/AMQP-CPP vendored

@ -1 +1 @@
Subproject commit 1a6c51f4ac51ac56610fa95081bd2f349911375a
Subproject commit 818c2d8ad96a08a5d20fece7d1e1e8855a2b0860

View File

@ -74,7 +74,11 @@ add_contrib (re2-cmake re2)
add_contrib (xz-cmake xz)
add_contrib (brotli-cmake brotli)
add_contrib (double-conversion-cmake double-conversion)
add_contrib (boringssl-cmake boringssl)
# Build exactly one TLS library from contrib: BoringSSL by default,
# OpenSSL only when ENABLE_EXTERNAL_OPENSSL is set.
if (NOT ENABLE_EXTERNAL_OPENSSL)
add_contrib (boringssl-cmake boringssl)
else ()
add_contrib (openssl-cmake openssl)
endif ()
add_contrib (poco-cmake poco)
add_contrib (croaring-cmake croaring)
add_contrib (zstd-cmake zstd)
@ -92,6 +96,8 @@ add_contrib (openldap-cmake openldap)
add_contrib (grpc-cmake grpc)
add_contrib (msgpack-c-cmake msgpack-c)
add_contrib (corrosion-cmake corrosion)
if (ENABLE_FUZZING)
add_contrib (libprotobuf-mutator-cmake libprotobuf-mutator)
endif()

View File

@ -4,6 +4,11 @@ if (NOT ENABLE_AMQPCPP)
message(STATUS "Not using AMQP-CPP")
return()
endif()
# On FreeBSD, stop processing this file early: per the message below, libuv is disabled
# there, so AMQP-CPP is not built.
if (OS_FREEBSD)
message(STATUS "Not using AMQP-CPP because libuv is disabled")
return()
endif()
# can be removed once libuv build on MacOS with GCC is possible
if (NOT TARGET ch_contrib::uv)

1
contrib/corrosion vendored Submodule

@ -0,0 +1 @@
Subproject commit d9dfdefaa3d9ec4ba1245c7070727359c65c7869

View File

@ -0,0 +1,46 @@
# Decides whether Rust support is enabled and, if so, selects the cargo target triple
# matching the CMake toolchain file and loads Corrosion (which provides corrosion_import_crate()).

# Default for ENABLE_RUST: off without libraries or on aarch64-apple-darwin toolchains,
# otherwise on exactly when a Rust toolchain is found.
if (NOT ENABLE_LIBRARIES)
    set(DEFAULT_ENABLE_RUST FALSE)
elseif((CMAKE_TOOLCHAIN_FILE MATCHES "darwin") AND (CMAKE_TOOLCHAIN_FILE MATCHES "aarch64"))
    message(STATUS "Rust is not available on aarch64-apple-darwin")
    set(DEFAULT_ENABLE_RUST FALSE)
else()
    list (APPEND CMAKE_MODULE_PATH "${ClickHouse_SOURCE_DIR}/contrib/corrosion/cmake")
    find_package(Rust)
    set(DEFAULT_ENABLE_RUST ${Rust_FOUND})
endif()

option(ENABLE_RUST "Enable rust" ${DEFAULT_ENABLE_RUST})

# Label the value so the configure log does not contain a bare "ON"/"OFF" line.
message(STATUS "ENABLE_RUST: ${ENABLE_RUST}")

if(NOT ENABLE_RUST)
    message(STATUS "Not using rust")
    return()
endif()

message(STATUS "Checking Rust toolchain for current target")

# Map the toolchain file name to a cargo target triple. The checks stay independent
# (not an elseif chain) so that, as before, the last matching pattern wins.
set(rust_target_override "")

if(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-x86_64")
    set(rust_target_override "x86_64-unknown-linux-gnu")
endif()

if(CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-aarch64")
    set(rust_target_override "aarch64-unknown-linux-gnu")
endif()

if((CMAKE_TOOLCHAIN_FILE MATCHES "darwin") AND (CMAKE_TOOLCHAIN_FILE MATCHES "x86_64"))
    set(rust_target_override "x86_64-apple-darwin")
endif()

if((CMAKE_TOOLCHAIN_FILE MATCHES "freebsd") AND (CMAKE_TOOLCHAIN_FILE MATCHES "x86_64"))
    set(rust_target_override "x86_64-unknown-freebsd")
endif()

if(CMAKE_TOOLCHAIN_FILE MATCHES "ppc64le")
    set(rust_target_override "powerpc64le-unknown-linux-gnu")
endif()

# Only claim a switch when one actually happened; otherwise Rust_CARGO_TARGET is left untouched.
if(rust_target_override)
    set(Rust_CARGO_TARGET "${rust_target_override}")
    message(STATUS "Switched Rust target to ${Rust_CARGO_TARGET}")
else()
    message(STATUS "No Rust target override for the current toolchain file")
endif()

# Define function corrosion_import_crate()
include ("${ClickHouse_SOURCE_DIR}/contrib/corrosion/cmake/Corrosion.cmake")

View File

@ -578,6 +578,12 @@ if(CMAKE_SYSTEM_NAME MATCHES "Darwin")
list(APPEND ALL_SRCS "${CMAKE_CURRENT_BINARY_DIR}/include_private/kcmrpc.c")
endif()
# With external OpenSSL, replace the upstream OpenSSL AES encryption provider source with
# the aes.c that sits next to this CMake file.
# NOTE(review): presumably the upstream file does not build against that OpenSSL - confirm.
if (ENABLE_EXTERNAL_OPENSSL)
list(REMOVE_ITEM ALL_SRCS "${KRB5_SOURCE_DIR}/lib/crypto/openssl/enc_provider/aes.c")
list(APPEND ALL_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/aes.c")
endif ()
target_sources(_krb5 PRIVATE
${ALL_SRCS}
)

View File

@ -59,6 +59,12 @@ set(SRCS
add_library(_libpq ${SRCS})
# With external OpenSSL, tell libpq's sources that the BIO_meth_new / HMAC_CTX_new /
# HMAC_CTX_free APIs are available. The macros are attached to the _libpq target only
# (instead of directory-wide add_definitions) so they cannot leak into other targets.
if (ENABLE_EXTERNAL_OPENSSL)
    target_compile_definitions(_libpq PRIVATE
        HAVE_BIO_METH_NEW
        HAVE_HMAC_CTX_NEW
        HAVE_HMAC_CTX_FREE
    )
endif ()
target_include_directories (_libpq SYSTEM PUBLIC ${LIBPQ_SOURCE_DIR})
target_include_directories (_libpq SYSTEM PUBLIC "${LIBPQ_SOURCE_DIR}/include")
target_include_directories (_libpq SYSTEM PRIVATE "${LIBPQ_SOURCE_DIR}/configs")

@ -1 +1 @@
Subproject commit 6ca2b5b3927226f6bcf6c656f502ff5d012ad9b6
Subproject commit 328e4602120ddd6b2c1fb91bf2d50bd7bc249711

View File

@ -14,7 +14,7 @@ endif()
# TODO: Enable shared library build
# TODO: Enable compilation on AArch64
set (LLVM_VERSION "13.0.0bundled")
set (LLVM_VERSION "14.0.0bundled")
set (LLVM_INCLUDE_DIRS
"${ClickHouse_SOURCE_DIR}/contrib/llvm-project/llvm/include"
"${ClickHouse_BINARY_DIR}/contrib/llvm-project/llvm/include"

View File

@ -3,6 +3,33 @@
ARG FROM_TAG=latest
FROM clickhouse/test-util:$FROM_TAG
# Rust toolchain and libraries
# rustup and cargo live under /rust.
ENV RUSTUP_HOME=/rust/rustup
ENV CARGO_HOME=/rust/cargo
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
# NOTE(review): world-writable tree, presumably so builds running as a non-root user
# can write to it - confirm.
RUN chmod 777 -R /rust
# Only the bin directory belongs on PATH: $CARGO_HOME/env is a shell script meant to be
# sourced, not a directory, so listing it in PATH has no effect.
ENV PATH="/rust/cargo/bin:${PATH}"
# Extra targets for cross-compilation.
RUN rustup target add aarch64-unknown-linux-gnu && \
    rustup target add x86_64-apple-darwin && \
    rustup target add x86_64-unknown-freebsd && \
    rustup target add aarch64-apple-darwin && \
    rustup target add powerpc64le-unknown-linux-gnu
# Refresh the package index in the same layer as the install, so the install does not
# depend on (possibly stale or removed) package lists from the base image.
RUN apt-get update \
    && apt-get install \
        gcc-aarch64-linux-gnu \
        build-essential \
        libc6 \
        libc6-dev \
        libc6-dev-arm64-cross \
        --yes
# Install CMake 3.20+ for Rust compilation
# Used https://askubuntu.com/a/1157132 as reference
RUN apt purge cmake --yes
RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
RUN apt-add-repository 'deb https://apt.kitware.com/ubuntu/ focal main'
RUN apt update && apt install cmake --yes
ENV CC=clang-${LLVM_VERSION}
ENV CXX=clang++-${LLVM_VERSION}

View File

@ -19,6 +19,12 @@ RUN apt-get update \
pv \
--yes --no-install-recommends
# Install CMake 3.20+ for Rust compilation
# Steps: remove the already-installed cmake package, register the Kitware APT signing key,
# add the Kitware repository (Ubuntu focal), then install cmake.
RUN apt purge cmake --yes
RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
RUN apt-add-repository 'deb https://apt.kitware.com/ubuntu/ focal main'
RUN apt update && apt install cmake --yes
RUN pip3 install numpy scipy pandas Jinja2
ARG odbc_driver_url="https://github.com/ClickHouse/clickhouse-odbc/releases/download/v1.1.4.20200302/clickhouse-odbc-1.1.4-Linux.tar.gz"

View File

@ -0,0 +1,6 @@
<clickhouse>
<!-- Allow nullable key to avoid errors while fuzzing definitions of tables -->
<merge_tree>
<allow_nullable_key>1</allow_nullable_key>
</merge_tree>
</clickhouse>

View File

@ -94,6 +94,7 @@ function configure
# TODO figure out which ones are needed
cp -av --dereference "$repo_dir"/tests/config/config.d/listen.xml db/config.d
cp -av --dereference "$script_dir"/query-fuzzer-tweaks-users.xml db/users.d
cp -av --dereference "$script_dir"/allow-nullable-key.xml db/config.d
cat > db/config.d/core.xml <<EOL
<clickhouse>
@ -240,6 +241,7 @@ quit
--receive_data_timeout_ms=10000 \
--stacktrace \
--query-fuzzer-runs=1000 \
--create-query-fuzzer-runs=50 \
--queries-file $(ls -1 ch/tests/queries/0_stateless/*.sql | sort -R) \
$NEW_TESTS_OPT \
> >(tail -n 100000 > fuzzer.log) \

View File

@ -35,6 +35,8 @@ RUN apt-get update \
tzdata \
vim \
wget \
rustc \
cargo \
&& pip3 --no-cache-dir install 'clickhouse-driver==0.2.1' scipy \
&& apt-get purge --yes python3-dev g++ \
&& apt-get autoremove --yes \

View File

@ -11,6 +11,7 @@ RUN apt-get update -y \
apt-get install --yes --no-install-recommends \
awscli \
brotli \
lz4 \
expect \
golang \
lsof \
@ -35,12 +36,13 @@ RUN apt-get update -y \
tree \
unixodbc \
wget \
rustc \
cargo \
zstd \
file \
pv \
&& apt-get clean
RUN pip3 install numpy scipy pandas Jinja2
RUN mkdir -p /tmp/clickhouse-odbc-tmp \

View File

@ -0,0 +1,23 @@
---
sidebar_position: 1
sidebar_label: 2022
---
# 2022 Changelog
### ClickHouse release v22.6.9.11-stable (9ec61dcac49) FIXME as compared to v22.6.8.35-stable (b91dc59a565)
#### Improvement
* Backported in [#42089](https://github.com/ClickHouse/ClickHouse/issues/42089): Replace back `clickhouse su` command with `sudo -u` in start in order to respect limits in `/etc/security/limits.conf`. [#41847](https://github.com/ClickHouse/ClickHouse/pull/41847) ([Eugene Konkov](https://github.com/ekonkov)).
#### Build/Testing/Packaging Improvement
* Backported in [#41558](https://github.com/ClickHouse/ClickHouse/issues/41558): Add `source` field to deb packages, update `nfpm`. [#41531](https://github.com/ClickHouse/ClickHouse/pull/41531) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
#### Bug Fix (user-visible misbehavior in official stable or prestable release)
* Backported in [#41504](https://github.com/ClickHouse/ClickHouse/issues/41504): Writing data in Apache `ORC` format might lead to a buffer overrun. [#41458](https://github.com/ClickHouse/ClickHouse/pull/41458) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Build latest tags ONLY from master branch [#41567](https://github.com/ClickHouse/ClickHouse/pull/41567) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).

View File

@ -1,8 +1,7 @@
position: 10
position: 1
label: 'Example Datasets'
collapsible: true
collapsed: true
link:
type: generated-index
title: Example Datasets
slug: /en/getting-started/example-datasets
type: doc
id: en/getting-started/example-datasets/

View File

@ -1,9 +1,16 @@
---
slug: /en/getting-started/example-datasets/cell-towers
sidebar_label: Cell Towers
sidebar_position: 3
title: "Cell Towers"
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import CodeBlock from '@theme/CodeBlock';
import ActionsMenu from '@site/docs/en/_snippets/_service_actions_menu.md';
import SQLConsoleDetail from '@site/docs/en/_snippets/_launch_sql_console.md';
This dataset is from [OpenCellid](https://www.opencellid.org/) - The world's largest Open Database of Cell Towers.
As of 2021, it contains more than 40 million records about cell towers (GSM, LTE, UMTS, etc.) around the world with their geographical coordinates and metadata (country code, network, etc).
@ -13,6 +20,26 @@ OpenCelliD Project is licensed under a Creative Commons Attribution-ShareAlike 4
## Get the Dataset {#get-the-dataset}
<Tabs groupId="deployMethod">
<TabItem value="serverless" label="ClickHouse Cloud" default>
ClickHouse Cloud provides an easy-button for uploading this dataset from S3. Log in to your ClickHouse Cloud organization, or create a free trial at [ClickHouse.cloud](https://clickhouse.cloud).
<ActionsMenu menu="Load Data" />
Choose the **Cell Towers** dataset from the **Sample data** tab, and **Load data**:
![Load cell towers dataset](@site/docs/en/_snippets/images/cloud-load-data-sample.png)
Examine the schema of the cell_towers table:
```sql
DESCRIBE TABLE cell_towers
```
<SQLConsoleDetail />
</TabItem>
<TabItem value="selfmanaged" label="Self-managed">
1. Download the snapshot of the dataset from February 2021: [cell_towers.csv.xz](https://datasets.clickhouse.com/cell_towers.csv.xz) (729 MB).
2. Validate the integrity (optional step):
@ -56,7 +83,10 @@ ENGINE = MergeTree ORDER BY (radio, mcc, net, created);
clickhouse-client --query "INSERT INTO cell_towers FORMAT CSVWithNames" < cell_towers.csv
```
## Examples {#examples}
</TabItem>
</Tabs>
## Example queries {#examples}
1. A number of cell towers by type:
@ -101,18 +131,31 @@ So, the top countries are: the USA, Germany, and Russia.
You may want to create an [External Dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) in ClickHouse to decode these values.
## Use case {#use-case}
## Use case: Incorporate geo data {#use-case}
Using `pointInPolygon` function.
1. Create a table where we will store polygons:
<Tabs groupId="deployMethod">
<TabItem value="serverless" label="ClickHouse Cloud" default>
```sql
CREATE TABLE moscow (polygon Array(Tuple(Float64, Float64)))
ORDER BY polygon;
```
</TabItem>
<TabItem value="selfmanaged" label="Self-managed">
```sql
CREATE TEMPORARY TABLE
moscow (polygon Array(Tuple(Float64, Float64)));
```
</TabItem>
</Tabs>
2. This is a rough shape of Moscow (without "new Moscow"):
```sql

File diff suppressed because one or more lines are too long

View File

@ -13,16 +13,6 @@ Description of the fields: https://www.gov.uk/guidance/about-the-price-paid-data
Contains HM Land Registry data © Crown copyright and database right 2021. This data is licensed under the Open Government Licence v3.0.
## Download the Dataset {#download-dataset}
Run the command:
```bash
wget http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv
```
Download will take about 2 minutes with good internet connection.
## Create the Table {#create-table}
```sql
@ -41,31 +31,49 @@ CREATE TABLE uk_price_paid
locality LowCardinality(String),
town LowCardinality(String),
district LowCardinality(String),
county LowCardinality(String),
category UInt8
) ENGINE = MergeTree ORDER BY (postcode1, postcode2, addr1, addr2);
county LowCardinality(String)
)
ENGINE = MergeTree
ORDER BY (postcode1, postcode2, addr1, addr2);
```
## Preprocess and Import Data {#preprocess-import-data}
## Preprocess and Insert the Data {#preprocess-import-data}
We will use `clickhouse-local` tool for data preprocessing and `clickhouse-client` to upload it.
We will use the `url` function to stream the data into ClickHouse. We need to preprocess some of the incoming data first, which includes:
- splitting the `postcode` into two different columns - `postcode1` and `postcode2`, which is better for storage and queries
- converting the `time` field to date as it only contains 00:00 time
- ignoring the [UUID](../../sql-reference/data-types/uuid.md) field because we don't need it for analysis
- transforming `type` and `duration` to more readable `Enum` fields using the [transform](../../sql-reference/functions/other-functions.md#transform) function
- transforming the `is_new` field from a single-character string (`Y`/`N`) to a [UInt8](../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-uint256-int8-int16-int32-int64-int128-int256) field with 0 or 1
- dropping the last two columns since they all have the same value (which is 0)
In this example, we define the structure of source data from the CSV file and specify a query to preprocess the data with `clickhouse-local`.
The `url` function streams the data from the web server into your ClickHouse table. The following command inserts 5 million rows into the `uk_price_paid` table:
The preprocessing is:
- splitting the postcode to two different columns `postcode1` and `postcode2` that is better for storage and queries;
- converting the `time` field to date as it only contains 00:00 time;
- ignoring the [UUID](../../sql-reference/data-types/uuid.md) field because we don't need it for analysis;
- transforming `type` and `duration` to more readable Enum fields with function [transform](../../sql-reference/functions/other-functions.md#transform);
- transforming `is_new` and `category` fields from single-character string (`Y`/`N` and `A`/`B`) to [UInt8](../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-uint256-int8-int16-int32-int64-int128-int256) field with 0 and 1.
Preprocessed data is piped directly to `clickhouse-client` to be inserted into ClickHouse table in streaming fashion.
```bash
clickhouse-local --input-format CSV --structure '
uuid String,
price UInt32,
time DateTime,
```sql
INSERT INTO uk_price_paid
WITH
splitByChar(' ', postcode) AS p
SELECT
toUInt32(price_string) AS price,
parseDateTimeBestEffortUS(time) AS date,
p[1] AS postcode1,
p[2] AS postcode2,
transform(a, ['T', 'S', 'D', 'F', 'O'], ['terraced', 'semi-detached', 'detached', 'flat', 'other']) AS type,
b = 'Y' AS is_new,
transform(c, ['F', 'L', 'U'], ['freehold', 'leasehold', 'unknown']) AS duration,
addr1,
addr2,
street,
locality,
town,
district,
county
FROM url(
'http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv',
'CSV',
'uuid_string String,
price_string String,
time String,
postcode String,
a String,
b String,
@ -78,154 +86,136 @@ clickhouse-local --input-format CSV --structure '
district String,
county String,
d String,
e String
' --query "
WITH splitByChar(' ', postcode) AS p
SELECT
price,
toDate(time) AS date,
p[1] AS postcode1,
p[2] AS postcode2,
transform(a, ['T', 'S', 'D', 'F', 'O'], ['terraced', 'semi-detached', 'detached', 'flat', 'other']) AS type,
b = 'Y' AS is_new,
transform(c, ['F', 'L', 'U'], ['freehold', 'leasehold', 'unknown']) AS duration,
addr1,
addr2,
street,
locality,
town,
district,
county,
d = 'B' AS category
FROM table" --date_time_input_format best_effort < pp-complete.csv | clickhouse-client --query "INSERT INTO uk_price_paid FORMAT TSV"
e String'
) SETTINGS max_http_get_redirects=10;
```
It will take about 40 seconds.
Wait for the data to insert - it will take a minute or two depending on the network speed.
## Validate the Data {#validate-data}
Query:
Let's verify it worked by seeing how many rows were inserted:
```sql
SELECT count() FROM uk_price_paid;
SELECT count()
FROM uk_price_paid
```
Result:
```text
┌──count()─┐
│ 26321785 │
└──────────┘
```
The size of dataset in ClickHouse is just 278 MiB, check it.
Query:
At the time this query was executed, the dataset had 27,450,499 rows. Let's see what the storage size of the table is in ClickHouse:
```sql
SELECT formatReadableSize(total_bytes) FROM system.tables WHERE name = 'uk_price_paid';
SELECT formatReadableSize(total_bytes)
FROM system.tables
WHERE name = 'uk_price_paid'
```
Result:
```text
┌─formatReadableSize(total_bytes)─┐
│ 278.80 MiB │
└─────────────────────────────────┘
```
Notice the size of the table is just 221.43 MiB!
## Run Some Queries {#run-queries}
Let's run some queries to analyze the data:
### Query 1. Average Price Per Year {#average-price}
Query:
```sql
SELECT toYear(date) AS year, round(avg(price)) AS price, bar(price, 0, 1000000, 80) FROM uk_price_paid GROUP BY year ORDER BY year;
SELECT
toYear(date) AS year,
round(avg(price)) AS price,
bar(price, 0, 1000000, 80
)
FROM uk_price_paid
GROUP BY year
ORDER BY year
```
Result:
The result looks like:
```text
```response
┌─year─┬──price─┬─bar(round(avg(price)), 0, 1000000, 80)─┐
│ 1995 │ 67932 │ █████▍ │
│ 1996 │ 71505 │ █████▋ │
│ 1997 │ 78532 │ ██████▎ │
│ 1998 │ 85436 │ ██████▋ │
│ 1999 │ 96037 │ ███████▋ │
│ 2000 │ 107479 │ ████████▌ │
│ 2001 │ 118885 │ █████████▌ │
│ 2002 │ 137941 │ ███████████ │
│ 2003 │ 155889 │ ████████████▍ │
│ 2004 │ 178885 │ ██████████████▎ │
│ 2005 │ 189351 │ ███████████████▏ │
│ 2006 │ 203528 │ ████████████████▎ │
│ 2007 │ 219378 │ █████████████████▌ │
│ 1995 │ 67934 │ █████▍ │
│ 1996 │ 71508 │ █████▋ │
│ 1997 │ 78536 │ ██████▎ │
│ 1998 │ 85441 │ ██████▋ │
│ 1999 │ 96038 │ ███████▋ │
│ 2000 │ 107487 │ ████████▌ │
│ 2001 │ 118888 │ █████████▌ │
│ 2002 │ 137948 │ ███████████ │
│ 2003 │ 155893 │ ████████████▍ │
│ 2004 │ 178888 │ ██████████████▎ │
│ 2005 │ 189359 │ ███████████████▏ │
│ 2006 │ 203532 │ ████████████████▎ │
│ 2007 │ 219375 │ █████████████████▌ │
│ 2008 │ 217056 │ █████████████████▎ │
│ 2009 │ 213419 │ █████████████████ │
│ 2010 │ 236109 │ ██████████████████▊ │
│ 2010 │ 236110 │ ██████████████████▊ │
│ 2011 │ 232805 │ ██████████████████▌ │
│ 2012 │ 238367 │ ███████████████████ │
│ 2013 │ 256931 │ ████████████████████▌ │
│ 2014 │ 279915 │ ██████████████████████▍ │
│ 2015 │ 297266 │ ███████████████████████▋ │
│ 2016 │ 313201 │ █████████████████████████ │
│ 2017 │ 346097 │ ███████████████████████████▋ │
│ 2018 │ 350116 │ ████████████████████████████ │
│ 2019 │ 351013 │ ████████████████████████████ │
│ 2020 │ 369420 │ █████████████████████████████▌ │
│ 2021 │ 386903 │ ██████████████████████████████▊ │
│ 2012 │ 238381 │ ███████████████████ │
│ 2013 │ 256927 │ ████████████████████▌ │
│ 2014 │ 280008 │ ██████████████████████▍ │
│ 2015 │ 297263 │ ███████████████████████▋ │
│ 2016 │ 313518 │ █████████████████████████ │
│ 2017 │ 346371 │ ███████████████████████████▋ │
│ 2018 │ 350556 │ ████████████████████████████ │
│ 2019 │ 352184 │ ████████████████████████████▏ │
│ 2020 │ 375808 │ ██████████████████████████████ │
│ 2021 │ 381105 │ ██████████████████████████████▍ │
│ 2022 │ 362572 │ █████████████████████████████ │
└──────┴────────┴────────────────────────────────────────┘
```
### Query 2. Average Price per Year in London {#average-price-london}
Query:
```sql
SELECT toYear(date) AS year, round(avg(price)) AS price, bar(price, 0, 2000000, 100) FROM uk_price_paid WHERE town = 'LONDON' GROUP BY year ORDER BY year;
SELECT
toYear(date) AS year,
round(avg(price)) AS price,
bar(price, 0, 2000000, 100
)
FROM uk_price_paid
WHERE town = 'LONDON'
GROUP BY year
ORDER BY year
```
Result:
The result looks like:
```text
```response
┌─year─┬───price─┬─bar(round(avg(price)), 0, 2000000, 100)───────────────┐
│ 1995 │ 109116 │ █████▍ │
│ 1996 │ 118667 │ █████▊ │
│ 1997 │ 136518 │ ██████▋ │
│ 1998 │ 152983 │ ███████▋ │
│ 1999 │ 180637 │ █████████ │
│ 2000 │ 215838 │ ██████████▋ │
│ 2001 │ 232994 │ ███████████▋ │
│ 2002 │ 263670 │ █████████████▏ │
│ 2003 │ 278394 │ █████████████▊ │
│ 2004 │ 304666 │ ███████████████▏ │
│ 2005 │ 322875 │ ████████████████▏ │
│ 2006 │ 356191 │ █████████████████▋ │
│ 2007 │ 404054 │ ████████████████████▏ │
│ 1995 │ 109110 │ █████▍ │
│ 1996 │ 118659 │ █████▊ │
│ 1997 │ 136526 │ ██████▋ │
│ 1998 │ 153002 │ ███████▋ │
│ 1999 │ 180633 │ █████████ │
│ 2000 │ 215849 │ ██████████▋ │
│ 2001 │ 232987 │ ███████████▋ │
│ 2002 │ 263668 │ █████████████▏ │
│ 2003 │ 278424 │ █████████████▊ │
│ 2004 │ 304664 │ ███████████████▏ │
│ 2005 │ 322887 │ ████████████████▏ │
│ 2006 │ 356195 │ █████████████████▋ │
│ 2007 │ 404062 │ ████████████████████▏ │
│ 2008 │ 420741 │ █████████████████████ │
│ 2009 │ 427753 │ █████████████████████▍ │
│ 2010 │ 480306 │ ████████████████████████ │
│ 2011 │ 496274 │ ████████████████████████▋ │
│ 2012 │ 519442 │ █████████████████████████▊ │
│ 2013 │ 616212 │ ██████████████████████████████▋ │
│ 2014 │ 724154 │ ████████████████████████████████████▏ │
│ 2015 │ 792129 │ ███████████████████████████████████████▌ │
│ 2016 │ 843655 │ ██████████████████████████████████████████▏ │
│ 2017 │ 982642 │ █████████████████████████████████████████████████▏ │
│ 2018 │ 1016835 │ ██████████████████████████████████████████████████▋ │
│ 2019 │ 1042849 │ ████████████████████████████████████████████████████▏ │
│ 2020 │ 1011889 │ ██████████████████████████████████████████████████▌ │
│ 2021 │ 960343 │ ████████████████████████████████████████████████ │
│ 2009 │ 427754 │ █████████████████████▍ │
│ 2010 │ 480322 │ ████████████████████████ │
│ 2011 │ 496278 │ ████████████████████████▋ │
│ 2012 │ 519482 │ █████████████████████████▊ │
│ 2013 │ 616195 │ ██████████████████████████████▋ │
│ 2014 │ 724121 │ ████████████████████████████████████▏ │
│ 2015 │ 792101 │ ███████████████████████████████████████▌ │
│ 2016 │ 843589 │ ██████████████████████████████████████████▏ │
│ 2017 │ 983523 │ █████████████████████████████████████████████████▏ │
│ 2018 │ 1016753 │ ██████████████████████████████████████████████████▋ │
│ 2019 │ 1041673 │ ████████████████████████████████████████████████████ │
│ 2020 │ 1060027 │ █████████████████████████████████████████████████████ │
│ 2021 │ 958249 │ ███████████████████████████████████████████████▊ │
│ 2022 │ 902596 │ █████████████████████████████████████████████▏ │
└──────┴─────────┴───────────────────────────────────────────────────────┘
```
Something happened in 2013. I don't have a clue. Maybe you have a clue what happened in 2020?
Something happened to home prices in 2020! But that is probably not a surprise...
### Query 3. The Most Expensive Neighborhoods {#most-expensive-neighborhoods}
Query:
```sql
SELECT
town,
@ -240,124 +230,123 @@ GROUP BY
district
HAVING c >= 100
ORDER BY price DESC
LIMIT 100;
LIMIT 100
```
Result:
The result looks like:
```text
┌─town─────────────────┬─district───────────────┬────c─┬───price─┬─bar(round(avg(price)), 0, 5000000, 100)────────────────────────────┐
│ LONDON │ CITY OF WESTMINSTER │ 3606 │ 3280239 │ █████████████████████████████████████████████████████████████████▌ │
│ LONDON │ CITY OF LONDON │ 274 │ 3160502 │ ███████████████████████████████████████████████████████████████▏ │
│ LONDON │ KENSINGTON AND CHELSEA │ 2550 │ 2308478 │ ██████████████████████████████████████████████▏ │
│ LEATHERHEAD │ ELMBRIDGE │ 114 │ 1897407 │ █████████████████████████████████████▊ │
│ LONDON │ CAMDEN │ 3033 │ 1805404 │ ████████████████████████████████████ │
│ VIRGINIA WATER │ RUNNYMEDE │ 156 │ 1753247 │ ███████████████████████████████████ │
│ WINDLESHAM │ SURREY HEATH │ 108 │ 1677613 │ █████████████████████████████████▌ │
│ THORNTON HEATH │ CROYDON │ 546 │ 1671721 │ █████████████████████████████████▍ │
│ BARNET │ ENFIELD │ 124 │ 1505840 │ ██████████████████████████████ │
│ COBHAM │ ELMBRIDGE │ 387 │ 1237250 │ ████████████████████████▋ │
│ LONDON │ ISLINGTON │ 2668 │ 1236980 │ ████████████████████████▋ │
│ OXFORD │ SOUTH OXFORDSHIRE │ 321 │ 1220907 │ ████████████████████████▍ │
│ LONDON │ RICHMOND UPON THAMES │ 704 │ 1215551 │ ████████████████████████▎ │
│ LONDON │ HOUNSLOW │ 671 │ 1207493 │ ████████████████████████▏ │
│ ASCOT │ WINDSOR AND MAIDENHEAD │ 407 │ 1183299 │ ███████████████████████▋ │
│ BEACONSFIELD │ BUCKINGHAMSHIRE │ 330 │ 1175615 │ ███████████████████████▌ │
│ RICHMOND │ RICHMOND UPON THAMES │ 874 │ 1110444 │ ██████████████████████▏ │
│ LONDON │ HAMMERSMITH AND FULHAM │ 3086 │ 1053983 │ █████████████████████ │
│ SURBITON │ ELMBRIDGE │ 100 │ 1011800 │ ████████████████████▏ │
│ RADLETT │ HERTSMERE │ 283 │ 1011712 │ ████████████████████▏ │
│ SALCOMBE │ SOUTH HAMS │ 127 │ 1011624 │ ████████████████████▏ │
│ WEYBRIDGE │ ELMBRIDGE │ 655 │ 1007265 │ ████████████████████▏ │
│ ESHER │ ELMBRIDGE │ 485 │ 986581 │ ███████████████████▋ │
│ LEATHERHEAD │ GUILDFORD │ 202 │ 977320 │ ███████████████████▌ │
│ BURFORD │ WEST OXFORDSHIRE │ 111 │ 966893 │ ███████████████████▎ │
│ BROCKENHURST │ NEW FOREST │ 129 │ 956675 │ ███████████████████▏ │
│ HINDHEAD │ WAVERLEY │ 137 │ 953753 │ ███████████████████ │
│ GERRARDS CROSS │ BUCKINGHAMSHIRE │ 419 │ 951121 │ ███████████████████ │
│ EAST MOLESEY │ ELMBRIDGE │ 192 │ 936769 │ ██████████████████▋ │
│ CHALFONT ST GILES │ BUCKINGHAMSHIRE │ 146 │ 925515 │ ██████████████████▌ │
│ LONDON │ TOWER HAMLETS │ 4388 │ 918304 │ ██████████████████▎ │
│ OLNEY │ MILTON KEYNES │ 235 │ 910646 │ ██████████████████▏ │
│ HENLEY-ON-THAMES │ SOUTH OXFORDSHIRE │ 540 │ 902418 │ ██████████████████ │
│ LONDON │ SOUTHWARK │ 3885 │ 892997 │ █████████████████▋ │
│ KINGSTON UPON THAMES │ KINGSTON UPON THAMES │ 960 │ 885969 │ █████████████████▋ │
│ LONDON │ EALING │ 2658 │ 871755 │ █████████████████▍ │
│ CRANBROOK │ TUNBRIDGE WELLS │ 431 │ 862348 │ █████████████████▏ │
│ LONDON │ MERTON │ 2099 │ 859118 │ █████████████████▏ │
│ BELVEDERE │ BEXLEY │ 346 │ 842423 │ ████████████████▋ │
│ GUILDFORD │ WAVERLEY │ 143 │ 841277 │ ████████████████▋ │
│ HARPENDEN │ ST ALBANS │ 657 │ 841216 │ ████████████████▋ │
│ LONDON │ HACKNEY │ 3307 │ 837090 │ ████████████████▋ │
│ LONDON │ WANDSWORTH │ 6566 │ 832663 │ ████████████████▋ │
│ MAIDENHEAD │ BUCKINGHAMSHIRE │ 123 │ 824299 │ ████████████████▍ │
│ KINGS LANGLEY │ DACORUM │ 145 │ 821331 │ ████████████████▍ │
│ BERKHAMSTED │ DACORUM │ 543 │ 818415 │ ████████████████▎ │
│ GREAT MISSENDEN │ BUCKINGHAMSHIRE │ 226 │ 802807 │ ████████████████ │
│ BILLINGSHURST │ CHICHESTER │ 144 │ 797829 │ ███████████████▊ │
│ WOKING │ GUILDFORD │ 176 │ 793494 │ ███████████████▋ │
│ STOCKBRIDGE │ TEST VALLEY │ 178 │ 793269 │ ███████████████▋ │
│ EPSOM │ REIGATE AND BANSTEAD │ 172 │ 791862 │ ███████████████▋ │
│ TONBRIDGE │ TUNBRIDGE WELLS │ 360 │ 787876 │ ███████████████▋ │
│ TEDDINGTON │ RICHMOND UPON THAMES │ 595 │ 786492 │ ███████████████▋ │
│ TWICKENHAM │ RICHMOND UPON THAMES │ 1155 │ 786193 │ ███████████████▋ │
│ LYNDHURST │ NEW FOREST │ 102 │ 785593 │ ███████████████▋ │
│ LONDON │ LAMBETH │ 5228 │ 774574 │ ███████████████▍ │
│ LONDON │ BARNET │ 3955 │ 773259 │ ███████████████▍ │
│ OXFORD │ VALE OF WHITE HORSE │ 353 │ 772088 │ ███████████████▍ │
│ TONBRIDGE │ MAIDSTONE │ 305 │ 770740 │ ███████████████▍ │
│ LUTTERWORTH │ HARBOROUGH │ 538 │ 768634 │ ███████████████▎ │
│ WOODSTOCK │ WEST OXFORDSHIRE │ 140 │ 766037 │ ███████████████▎ │
│ MIDHURST │ CHICHESTER │ 257 │ 764815 │ ███████████████▎ │
│ MARLOW │ BUCKINGHAMSHIRE │ 327 │ 761876 │ ███████████████▏ │
│ LONDON │ NEWHAM │ 3237 │ 761784 │ ███████████████▏ │
│ ALDERLEY EDGE │ CHESHIRE EAST │ 178 │ 757318 │ ███████████████▏ │
│ LUTON │ CENTRAL BEDFORDSHIRE │ 212 │ 754283 │ ███████████████ │
│ PETWORTH │ CHICHESTER │ 154 │ 754220 │ ███████████████ │
│ ALRESFORD │ WINCHESTER │ 219 │ 752718 │ ███████████████ │
│ POTTERS BAR │ WELWYN HATFIELD │ 174 │ 748465 │ ██████████████▊ │
│ HASLEMERE │ CHICHESTER │ 128 │ 746907 │ ██████████████▊ │
│ TADWORTH │ REIGATE AND BANSTEAD │ 502 │ 743252 │ ██████████████▋ │
│ THAMES DITTON │ ELMBRIDGE │ 244 │ 741913 │ ██████████████▋ │
│ REIGATE │ REIGATE AND BANSTEAD │ 581 │ 738198 │ ██████████████▋ │
│ BOURNE END │ BUCKINGHAMSHIRE │ 138 │ 735190 │ ██████████████▋ │
│ SEVENOAKS │ SEVENOAKS │ 1156 │ 730018 │ ██████████████▌ │
│ OXTED │ TANDRIDGE │ 336 │ 729123 │ ██████████████▌ │
│ INGATESTONE │ BRENTWOOD │ 166 │ 728103 │ ██████████████▌ │
│ LONDON │ BRENT │ 2079 │ 720605 │ ██████████████▍ │
│ LONDON │ HARINGEY │ 3216 │ 717780 │ ██████████████▎ │
│ PURLEY │ CROYDON │ 575 │ 716108 │ ██████████████▎ │
│ WELWYN │ WELWYN HATFIELD │ 222 │ 710603 │ ██████████████▏ │
│ RICKMANSWORTH │ THREE RIVERS │ 798 │ 704571 │ ██████████████ │
│ BANSTEAD │ REIGATE AND BANSTEAD │ 401 │ 701293 │ ██████████████ │
│ CHIGWELL │ EPPING FOREST │ 261 │ 701203 │ ██████████████ │
│ PINNER │ HARROW │ 528 │ 698885 │ █████████████▊ │
│ HASLEMERE │ WAVERLEY │ 280 │ 696659 │ █████████████▊ │
│ SLOUGH │ BUCKINGHAMSHIRE │ 396 │ 694917 │ █████████████▊ │
│ WALTON-ON-THAMES │ ELMBRIDGE │ 946 │ 692395 │ █████████████▋ │
│ READING │ SOUTH OXFORDSHIRE │ 318 │ 691988 │ █████████████▋ │
│ NORTHWOOD │ HILLINGDON │ 271 │ 690643 │ █████████████▋ │
│ FELTHAM │ HOUNSLOW │ 763 │ 688595 │ █████████████▋ │
│ ASHTEAD │ MOLE VALLEY │ 303 │ 687923 │ █████████████▋ │
│ BARNET │ BARNET │ 975 │ 686980 │ █████████████▋ │
│ WOKING │ SURREY HEATH │ 283 │ 686669 │ █████████████▋ │
│ MALMESBURY │ WILTSHIRE │ 323 │ 683324 │ █████████████▋ │
│ AMERSHAM │ BUCKINGHAMSHIRE │ 496 │ 680962 │ █████████████▌ │
│ CHISLEHURST │ BROMLEY │ 430 │ 680209 │ █████████████▌ │
│ HYTHE │ FOLKESTONE AND HYTHE │ 490 │ 676908 │ █████████████▌ │
│ MAYFIELD │ WEALDEN │ 101 │ 676210 │ █████████████▌ │
│ ASCOT │ BRACKNELL FOREST │ 168 │ 676004 │ █████████████▌ │
└──────────────────────┴────────────────────────┴──────┴─────────┴────────────────────────────────────────────────────────────────────┘
```response
┌─town─────────────────┬─district───────────────┬─────c─┬───price─┬─bar(round(avg(price)), 0, 5000000, 100)─────────────────────────┐
│ LONDON │ CITY OF LONDON │ 578 │ 3149590 │ ██████████████████████████████████████████████████████████████▊ │
│ LONDON │ CITY OF WESTMINSTER │ 7083 │ 2903794 │ ██████████████████████████████████████████████████████████ │
│ LONDON │ KENSINGTON AND CHELSEA │ 4986 │ 2333782 │ ██████████████████████████████████████████████▋ │
│ LEATHERHEAD │ ELMBRIDGE │ 203 │ 2071595 │ █████████████████████████████████████████▍ │
│ VIRGINIA WATER │ RUNNYMEDE │ 308 │ 1939465 │ ██████████████████████████████████████▋ │
│ LONDON │ CAMDEN │ 5750 │ 1673687 │ █████████████████████████████████▍ │
│ WINDLESHAM │ SURREY HEATH │ 182 │ 1428358 │ ████████████████████████████▌ │
│ NORTHWOOD │ THREE RIVERS │ 112 │ 1404170 │ ████████████████████████████ │
│ BARNET │ ENFIELD │ 259 │ 1338299 │ ██████████████████████████▋ │
│ LONDON │ ISLINGTON │ 5504 │ 1275520 │ █████████████████████████▌ │
│ LONDON │ RICHMOND UPON THAMES │ 1345 │ 1261935 │ █████████████████████████▏ │
│ COBHAM │ ELMBRIDGE │ 727 │ 1251403 │ █████████████████████████ │
│ BEACONSFIELD │ BUCKINGHAMSHIRE │ 680 │ 1199970 │ ███████████████████████▊ │
│ LONDON │ TOWER HAMLETS │ 10012 │ 1157827 │ ███████████████████████▏ │
│ LONDON │ HOUNSLOW │ 1278 │ 1144389 │ ██████████████████████▊ │
│ BURFORD │ WEST OXFORDSHIRE │ 182 │ 1139393 │ ██████████████████████▋ │
│ RICHMOND │ RICHMOND UPON THAMES │ 1649 │ 1130076 │ ██████████████████████▌ │
│ KINGSTON UPON THAMES │ RICHMOND UPON THAMES │ 147 │ 1126111 │ ██████████████████████▌ │
│ ASCOT │ WINDSOR AND MAIDENHEAD │ 773 │ 1106109 │ ██████████████████████ │
│ LONDON │ HAMMERSMITH AND FULHAM │ 6162 │ 1056198 │ █████████████████████ │
│ RADLETT │ HERTSMERE │ 513 │ 1045758 │ ████████████████████▊ │
│ LEATHERHEAD │ GUILDFORD │ 354 │ 1045175 │ ████████████████████▊ │
│ WEYBRIDGE │ ELMBRIDGE │ 1275 │ 1036702 │ ████████████████████▋ │
│ FARNHAM │ EAST HAMPSHIRE │ 107 │ 1033682 │ ████████████████████▋ │
│ ESHER │ ELMBRIDGE │ 915 │ 1032753 │ ████████████████████▋ │
│ FARNHAM │ HART │ 102 │ 1002692 │ ████████████████████ │
│ GERRARDS CROSS │ BUCKINGHAMSHIRE │ 845 │ 983639 │ ███████████████████▋ │
│ CHALFONT ST GILES │ BUCKINGHAMSHIRE │ 286 │ 973993 │ ███████████████████▍ │
│ SALCOMBE │ SOUTH HAMS │ 215 │ 965724 │ ███████████████████▎ │
│ SURBITON │ ELMBRIDGE │ 181 │ 960346 │ ███████████████████▏ │
│ BROCKENHURST │ NEW FOREST │ 226 │ 951278 │ ███████████████████ │
│ SUTTON COLDFIELD │ LICHFIELD │ 110 │ 930757 │ ██████████████████▌ │
│ EAST MOLESEY │ ELMBRIDGE │ 372 │ 927026 │ ██████████████████▌ │
│ LLANGOLLEN │ WREXHAM │ 127 │ 925681 │ ██████████████████▌ │
│ OXFORD │ SOUTH OXFORDSHIRE │ 638 │ 923830 │ ██████████████████▍ │
│ LONDON │ MERTON │ 4383 │ 923194 │ ██████████████████▍ │
│ GUILDFORD │ WAVERLEY │ 261 │ 905733 │ ██████████████████ │
│ TEDDINGTON │ RICHMOND UPON THAMES │ 1147 │ 894856 │ █████████████████▊ │
│ HARPENDEN │ ST ALBANS │ 1271 │ 893079 │ █████████████████▋ │
│ HENLEY-ON-THAMES │ SOUTH OXFORDSHIRE │ 1042 │ 887557 │ █████████████████▋ │
│ POTTERS BAR │ WELWYN HATFIELD │ 314 │ 863037 │ █████████████████▎ │
│ LONDON │ WANDSWORTH │ 13210 │ 857318 │ █████████████████▏ │
│ BILLINGSHURST │ CHICHESTER │ 255 │ 856508 │ █████████████████▏ │
│ LONDON │ SOUTHWARK │ 7742 │ 843145 │ ████████████████▋ │
│ LONDON │ HACKNEY │ 6656 │ 839716 │ ████████████████▋ │
│ LUTTERWORTH │ HARBOROUGH │ 1096 │ 836546 │ ████████████████▋ │
│ KINGSTON UPON THAMES │ KINGSTON UPON THAMES │ 1846 │ 828990 │ ████████████████▌ │
│ LONDON │ EALING │ 5583 │ 820135 │ ████████████████▍ │
│ INGATESTONE │ CHELMSFORD │ 120 │ 815379 │ ████████████████▎ │
│ MARLOW │ BUCKINGHAMSHIRE │ 718 │ 809943 │ ████████████████▏ │
│ EAST GRINSTEAD │ TANDRIDGE │ 105 │ 809461 │ ████████████████▏ │
│ CHIGWELL │ EPPING FOREST │ 484 │ 809338 │ ████████████████▏ │
│ EGHAM │ RUNNYMEDE │ 989 │ 807858 │ ████████████████▏ │
│ HASLEMERE │ CHICHESTER │ 223 │ 804173 │ ████████████████ │
│ PETWORTH │ CHICHESTER │ 288 │ 803206 │ ████████████████ │
│ TWICKENHAM │ RICHMOND UPON THAMES │ 2194 │ 802616 │ ████████████████ │
│ WEMBLEY │ BRENT │ 1698 │ 801733 │ ████████████████ │
│ HINDHEAD │ WAVERLEY │ 233 │ 801482 │ ████████████████ │
│ LONDON │ BARNET │ 8083 │ 792066 │ ███████████████▋ │
│ WOKING │ GUILDFORD │ 343 │ 789360 │ ███████████████▋ │
│ STOCKBRIDGE │ TEST VALLEY │ 318 │ 777909 │ ███████████████▌ │
│ BERKHAMSTED │ DACORUM │ 1049 │ 776138 │ ███████████████▌ │
│ MAIDENHEAD │ BUCKINGHAMSHIRE │ 236 │ 775572 │ ███████████████▌ │
│ SOLIHULL │ STRATFORD-ON-AVON │ 142 │ 770727 │ ███████████████▍ │
│ GREAT MISSENDEN │ BUCKINGHAMSHIRE │ 431 │ 764493 │ ███████████████▎ │
│ TADWORTH │ REIGATE AND BANSTEAD │ 920 │ 757511 │ ███████████████▏ │
│ LONDON │ BRENT │ 4124 │ 757194 │ ███████████████▏ │
│ THAMES DITTON │ ELMBRIDGE │ 470 │ 750828 │ ███████████████ │
│ LONDON │ LAMBETH │ 10431 │ 750532 │ ███████████████ │
│ RICKMANSWORTH │ THREE RIVERS │ 1500 │ 747029 │ ██████████████▊ │
│ KINGS LANGLEY │ DACORUM │ 281 │ 746536 │ ██████████████▊ │
│ HARLOW │ EPPING FOREST │ 172 │ 739423 │ ██████████████▋ │
│ TONBRIDGE │ SEVENOAKS │ 103 │ 738740 │ ██████████████▋ │
│ BELVEDERE │ BEXLEY │ 686 │ 736385 │ ██████████████▋ │
│ CRANBROOK │ TUNBRIDGE WELLS │ 769 │ 734328 │ ██████████████▋ │
│ SOLIHULL │ WARWICK │ 116 │ 733286 │ ██████████████▋ │
│ ALDERLEY EDGE │ CHESHIRE EAST │ 357 │ 732882 │ ██████████████▋ │
│ WELWYN │ WELWYN HATFIELD │ 404 │ 730281 │ ██████████████▌ │
│ CHISLEHURST │ BROMLEY │ 870 │ 730279 │ ██████████████▌ │
│ LONDON │ HARINGEY │ 6488 │ 726715 │ ██████████████▌ │
│ AMERSHAM │ BUCKINGHAMSHIRE │ 965 │ 725426 │ ██████████████▌ │
│ SEVENOAKS │ SEVENOAKS │ 2183 │ 725102 │ ██████████████▌ │
│ BOURNE END │ BUCKINGHAMSHIRE │ 269 │ 724595 │ ██████████████▍ │
│ NORTHWOOD │ HILLINGDON │ 568 │ 722436 │ ██████████████▍ │
│ PURFLEET │ THURROCK │ 143 │ 722205 │ ██████████████▍ │
│ SLOUGH │ BUCKINGHAMSHIRE │ 832 │ 721529 │ ██████████████▍ │
│ INGATESTONE │ BRENTWOOD │ 301 │ 718292 │ ██████████████▎ │
│ EPSOM │ REIGATE AND BANSTEAD │ 315 │ 709264 │ ██████████████▏ │
│ ASHTEAD │ MOLE VALLEY │ 524 │ 708646 │ ██████████████▏ │
│ BETCHWORTH │ MOLE VALLEY │ 155 │ 708525 │ ██████████████▏ │
│ OXTED │ TANDRIDGE │ 645 │ 706946 │ ██████████████▏ │
│ READING │ SOUTH OXFORDSHIRE │ 593 │ 705466 │ ██████████████ │
│ FELTHAM │ HOUNSLOW │ 1536 │ 703815 │ ██████████████ │
│ TUNBRIDGE WELLS │ WEALDEN │ 207 │ 703296 │ ██████████████ │
│ LEWES │ WEALDEN │ 116 │ 701349 │ ██████████████ │
│ OXFORD │ OXFORD │ 3656 │ 700813 │ ██████████████ │
│ MAYFIELD │ WEALDEN │ 177 │ 698158 │ █████████████▊ │
│ PINNER │ HARROW │ 997 │ 697876 │ █████████████▊ │
│ LECHLADE │ COTSWOLD │ 155 │ 696262 │ █████████████▊ │
│ WALTON-ON-THAMES │ ELMBRIDGE │ 1850 │ 690102 │ █████████████▋ │
└──────────────────────┴────────────────────────┴───────┴─────────┴─────────────────────────────────────────────────────────────────┘
```
## Let's Speed Up Queries Using Projections {#speedup-with-projections}
[Projections](../../sql-reference/statements/alter/projection.md) allow to improve queries speed by storing pre-aggregated data.
[Projections](../../sql-reference/statements/alter/projection.md) allow you to improve query speeds by storing pre-aggregated data in whatever format you want. In this example, we create a projection that keeps track of the average price, total price, and count of properties grouped by the year, district and town. At execution time, ClickHouse will use your projection if it thinks the projection can improve the performance of the query (you don't have to do anything special to use the projection - ClickHouse decides for you when the projection will be useful).
### Build a Projection {#build-projection}
Create an aggregate projection by dimensions `toYear(date)`, `district`, `town`:
Let's create an aggregate projection by the dimensions `toYear(date)`, `district`, and `town`:
```sql
ALTER TABLE uk_price_paid
@ -374,25 +363,23 @@ ALTER TABLE uk_price_paid
toYear(date),
district,
town
);
)
```
Populate the projection for existing data (without it projection will be created for only newly inserted data):
Populate the projection for existing data (without materializing it, the projection will be created only for newly inserted data):
```sql
ALTER TABLE uk_price_paid
MATERIALIZE PROJECTION projection_by_year_district_town
SETTINGS mutations_sync = 1;
SETTINGS mutations_sync = 1
```
## Test Performance {#test-performance}
Let's run the same 3 queries.
Let's run the same 3 queries again:
### Query 1. Average Price Per Year {#average-price-projections}
Query:
```sql
SELECT
toYear(date) AS year,
@ -400,47 +387,18 @@ SELECT
bar(price, 0, 1000000, 80)
FROM uk_price_paid
GROUP BY year
ORDER BY year ASC;
ORDER BY year ASC
```
Result:
```text
┌─year─┬──price─┬─bar(round(avg(price)), 0, 1000000, 80)─┐
│ 1995 │ 67932 │ █████▍ │
│ 1996 │ 71505 │ █████▋ │
│ 1997 │ 78532 │ ██████▎ │
│ 1998 │ 85436 │ ██████▋ │
│ 1999 │ 96037 │ ███████▋ │
│ 2000 │ 107479 │ ████████▌ │
│ 2001 │ 118885 │ █████████▌ │
│ 2002 │ 137941 │ ███████████ │
│ 2003 │ 155889 │ ████████████▍ │
│ 2004 │ 178885 │ ██████████████▎ │
│ 2005 │ 189351 │ ███████████████▏ │
│ 2006 │ 203528 │ ████████████████▎ │
│ 2007 │ 219378 │ █████████████████▌ │
│ 2008 │ 217056 │ █████████████████▎ │
│ 2009 │ 213419 │ █████████████████ │
│ 2010 │ 236109 │ ██████████████████▊ │
│ 2011 │ 232805 │ ██████████████████▌ │
│ 2012 │ 238367 │ ███████████████████ │
│ 2013 │ 256931 │ ████████████████████▌ │
│ 2014 │ 279915 │ ██████████████████████▍ │
│ 2015 │ 297266 │ ███████████████████████▋ │
│ 2016 │ 313201 │ █████████████████████████ │
│ 2017 │ 346097 │ ███████████████████████████▋ │
│ 2018 │ 350116 │ ████████████████████████████ │
│ 2019 │ 351013 │ ████████████████████████████ │
│ 2020 │ 369420 │ █████████████████████████████▌ │
│ 2021 │ 386903 │ ██████████████████████████████▊ │
└──────┴────────┴────────────────────────────────────────┘
The result is the same, but the performance is better!
```response
No projection: 28 rows in set. Elapsed: 1.775 sec. Processed 27.45 million rows, 164.70 MB (15.47 million rows/s., 92.79 MB/s.)
With projection: 28 rows in set. Elapsed: 0.665 sec. Processed 87.51 thousand rows, 3.21 MB (131.51 thousand rows/s., 4.82 MB/s.)
```
### Query 2. Average Price Per Year in London {#average-price-london-projections}
Query:
```sql
SELECT
toYear(date) AS year,
@ -449,48 +407,19 @@ SELECT
FROM uk_price_paid
WHERE town = 'LONDON'
GROUP BY year
ORDER BY year ASC;
ORDER BY year ASC
```
Result:
Same result, but notice the improvement in query performance:
```text
┌─year─┬───price─┬─bar(round(avg(price)), 0, 2000000, 100)───────────────┐
│ 1995 │ 109116 │ █████▍ │
│ 1996 │ 118667 │ █████▊ │
│ 1997 │ 136518 │ ██████▋ │
│ 1998 │ 152983 │ ███████▋ │
│ 1999 │ 180637 │ █████████ │
│ 2000 │ 215838 │ ██████████▋ │
│ 2001 │ 232994 │ ███████████▋ │
│ 2002 │ 263670 │ █████████████▏ │
│ 2003 │ 278394 │ █████████████▊ │
│ 2004 │ 304666 │ ███████████████▏ │
│ 2005 │ 322875 │ ████████████████▏ │
│ 2006 │ 356191 │ █████████████████▋ │
│ 2007 │ 404054 │ ████████████████████▏ │
│ 2008 │ 420741 │ █████████████████████ │
│ 2009 │ 427753 │ █████████████████████▍ │
│ 2010 │ 480306 │ ████████████████████████ │
│ 2011 │ 496274 │ ████████████████████████▋ │
│ 2012 │ 519442 │ █████████████████████████▊ │
│ 2013 │ 616212 │ ██████████████████████████████▋ │
│ 2014 │ 724154 │ ████████████████████████████████████▏ │
│ 2015 │ 792129 │ ███████████████████████████████████████▌ │
│ 2016 │ 843655 │ ██████████████████████████████████████████▏ │
│ 2017 │ 982642 │ █████████████████████████████████████████████████▏ │
│ 2018 │ 1016835 │ ██████████████████████████████████████████████████▋ │
│ 2019 │ 1042849 │ ████████████████████████████████████████████████████▏ │
│ 2020 │ 1011889 │ ██████████████████████████████████████████████████▌ │
│ 2021 │ 960343 │ ████████████████████████████████████████████████ │
└──────┴─────────┴───────────────────────────────────────────────────────┘
```response
No projection: 28 rows in set. Elapsed: 0.720 sec. Processed 27.45 million rows, 46.61 MB (38.13 million rows/s., 64.74 MB/s.)
With projection: 28 rows in set. Elapsed: 0.015 sec. Processed 87.51 thousand rows, 3.51 MB (5.74 million rows/s., 230.24 MB/s.)
```
### Query 3. The Most Expensive Neighborhoods {#most-expensive-neighborhoods-projections}
The condition (date >= '2020-01-01') needs to be modified to match projection dimension (toYear(date) >= 2020).
Query:
The condition (`date >= '2020-01-01'`) needs to be modified so that it matches the projection dimension (`toYear(date) >= 2020`):
```sql
SELECT
@ -506,138 +435,16 @@ GROUP BY
district
HAVING c >= 100
ORDER BY price DESC
LIMIT 100;
LIMIT 100
```
Result:
Again, the result is the same but notice the improvement in query performance:
```text
┌─town─────────────────┬─district───────────────┬────c─┬───price─┬─bar(round(avg(price)), 0, 5000000, 100)────────────────────────────┐
│ LONDON │ CITY OF WESTMINSTER │ 3606 │ 3280239 │ █████████████████████████████████████████████████████████████████▌ │
│ LONDON │ CITY OF LONDON │ 274 │ 3160502 │ ███████████████████████████████████████████████████████████████▏ │
│ LONDON │ KENSINGTON AND CHELSEA │ 2550 │ 2308478 │ ██████████████████████████████████████████████▏ │
│ LEATHERHEAD │ ELMBRIDGE │ 114 │ 1897407 │ █████████████████████████████████████▊ │
│ LONDON │ CAMDEN │ 3033 │ 1805404 │ ████████████████████████████████████ │
│ VIRGINIA WATER │ RUNNYMEDE │ 156 │ 1753247 │ ███████████████████████████████████ │
│ WINDLESHAM │ SURREY HEATH │ 108 │ 1677613 │ █████████████████████████████████▌ │
│ THORNTON HEATH │ CROYDON │ 546 │ 1671721 │ █████████████████████████████████▍ │
│ BARNET │ ENFIELD │ 124 │ 1505840 │ ██████████████████████████████ │
│ COBHAM │ ELMBRIDGE │ 387 │ 1237250 │ ████████████████████████▋ │
│ LONDON │ ISLINGTON │ 2668 │ 1236980 │ ████████████████████████▋ │
│ OXFORD │ SOUTH OXFORDSHIRE │ 321 │ 1220907 │ ████████████████████████▍ │
│ LONDON │ RICHMOND UPON THAMES │ 704 │ 1215551 │ ████████████████████████▎ │
│ LONDON │ HOUNSLOW │ 671 │ 1207493 │ ████████████████████████▏ │
│ ASCOT │ WINDSOR AND MAIDENHEAD │ 407 │ 1183299 │ ███████████████████████▋ │
│ BEACONSFIELD │ BUCKINGHAMSHIRE │ 330 │ 1175615 │ ███████████████████████▌ │
│ RICHMOND │ RICHMOND UPON THAMES │ 874 │ 1110444 │ ██████████████████████▏ │
│ LONDON │ HAMMERSMITH AND FULHAM │ 3086 │ 1053983 │ █████████████████████ │
│ SURBITON │ ELMBRIDGE │ 100 │ 1011800 │ ████████████████████▏ │
│ RADLETT │ HERTSMERE │ 283 │ 1011712 │ ████████████████████▏ │
│ SALCOMBE │ SOUTH HAMS │ 127 │ 1011624 │ ████████████████████▏ │
│ WEYBRIDGE │ ELMBRIDGE │ 655 │ 1007265 │ ████████████████████▏ │
│ ESHER │ ELMBRIDGE │ 485 │ 986581 │ ███████████████████▋ │
│ LEATHERHEAD │ GUILDFORD │ 202 │ 977320 │ ███████████████████▌ │
│ BURFORD │ WEST OXFORDSHIRE │ 111 │ 966893 │ ███████████████████▎ │
│ BROCKENHURST │ NEW FOREST │ 129 │ 956675 │ ███████████████████▏ │
│ HINDHEAD │ WAVERLEY │ 137 │ 953753 │ ███████████████████ │
│ GERRARDS CROSS │ BUCKINGHAMSHIRE │ 419 │ 951121 │ ███████████████████ │
│ EAST MOLESEY │ ELMBRIDGE │ 192 │ 936769 │ ██████████████████▋ │
│ CHALFONT ST GILES │ BUCKINGHAMSHIRE │ 146 │ 925515 │ ██████████████████▌ │
│ LONDON │ TOWER HAMLETS │ 4388 │ 918304 │ ██████████████████▎ │
│ OLNEY │ MILTON KEYNES │ 235 │ 910646 │ ██████████████████▏ │
│ HENLEY-ON-THAMES │ SOUTH OXFORDSHIRE │ 540 │ 902418 │ ██████████████████ │
│ LONDON │ SOUTHWARK │ 3885 │ 892997 │ █████████████████▋ │
│ KINGSTON UPON THAMES │ KINGSTON UPON THAMES │ 960 │ 885969 │ █████████████████▋ │
│ LONDON │ EALING │ 2658 │ 871755 │ █████████████████▍ │
│ CRANBROOK │ TUNBRIDGE WELLS │ 431 │ 862348 │ █████████████████▏ │
│ LONDON │ MERTON │ 2099 │ 859118 │ █████████████████▏ │
│ BELVEDERE │ BEXLEY │ 346 │ 842423 │ ████████████████▋ │
│ GUILDFORD │ WAVERLEY │ 143 │ 841277 │ ████████████████▋ │
│ HARPENDEN │ ST ALBANS │ 657 │ 841216 │ ████████████████▋ │
│ LONDON │ HACKNEY │ 3307 │ 837090 │ ████████████████▋ │
│ LONDON │ WANDSWORTH │ 6566 │ 832663 │ ████████████████▋ │
│ MAIDENHEAD │ BUCKINGHAMSHIRE │ 123 │ 824299 │ ████████████████▍ │
│ KINGS LANGLEY │ DACORUM │ 145 │ 821331 │ ████████████████▍ │
│ BERKHAMSTED │ DACORUM │ 543 │ 818415 │ ████████████████▎ │
│ GREAT MISSENDEN │ BUCKINGHAMSHIRE │ 226 │ 802807 │ ████████████████ │
│ BILLINGSHURST │ CHICHESTER │ 144 │ 797829 │ ███████████████▊ │
│ WOKING │ GUILDFORD │ 176 │ 793494 │ ███████████████▋ │
│ STOCKBRIDGE │ TEST VALLEY │ 178 │ 793269 │ ███████████████▋ │
│ EPSOM │ REIGATE AND BANSTEAD │ 172 │ 791862 │ ███████████████▋ │
│ TONBRIDGE │ TUNBRIDGE WELLS │ 360 │ 787876 │ ███████████████▋ │
│ TEDDINGTON │ RICHMOND UPON THAMES │ 595 │ 786492 │ ███████████████▋ │
│ TWICKENHAM │ RICHMOND UPON THAMES │ 1155 │ 786193 │ ███████████████▋ │
│ LYNDHURST │ NEW FOREST │ 102 │ 785593 │ ███████████████▋ │
│ LONDON │ LAMBETH │ 5228 │ 774574 │ ███████████████▍ │
│ LONDON │ BARNET │ 3955 │ 773259 │ ███████████████▍ │
│ OXFORD │ VALE OF WHITE HORSE │ 353 │ 772088 │ ███████████████▍ │
│ TONBRIDGE │ MAIDSTONE │ 305 │ 770740 │ ███████████████▍ │
│ LUTTERWORTH │ HARBOROUGH │ 538 │ 768634 │ ███████████████▎ │
│ WOODSTOCK │ WEST OXFORDSHIRE │ 140 │ 766037 │ ███████████████▎ │
│ MIDHURST │ CHICHESTER │ 257 │ 764815 │ ███████████████▎ │
│ MARLOW │ BUCKINGHAMSHIRE │ 327 │ 761876 │ ███████████████▏ │
│ LONDON │ NEWHAM │ 3237 │ 761784 │ ███████████████▏ │
│ ALDERLEY EDGE │ CHESHIRE EAST │ 178 │ 757318 │ ███████████████▏ │
│ LUTON │ CENTRAL BEDFORDSHIRE │ 212 │ 754283 │ ███████████████ │
│ PETWORTH │ CHICHESTER │ 154 │ 754220 │ ███████████████ │
│ ALRESFORD │ WINCHESTER │ 219 │ 752718 │ ███████████████ │
│ POTTERS BAR │ WELWYN HATFIELD │ 174 │ 748465 │ ██████████████▊ │
│ HASLEMERE │ CHICHESTER │ 128 │ 746907 │ ██████████████▊ │
│ TADWORTH │ REIGATE AND BANSTEAD │ 502 │ 743252 │ ██████████████▋ │
│ THAMES DITTON │ ELMBRIDGE │ 244 │ 741913 │ ██████████████▋ │
│ REIGATE │ REIGATE AND BANSTEAD │ 581 │ 738198 │ ██████████████▋ │
│ BOURNE END │ BUCKINGHAMSHIRE │ 138 │ 735190 │ ██████████████▋ │
│ SEVENOAKS │ SEVENOAKS │ 1156 │ 730018 │ ██████████████▌ │
│ OXTED │ TANDRIDGE │ 336 │ 729123 │ ██████████████▌ │
│ INGATESTONE │ BRENTWOOD │ 166 │ 728103 │ ██████████████▌ │
│ LONDON │ BRENT │ 2079 │ 720605 │ ██████████████▍ │
│ LONDON │ HARINGEY │ 3216 │ 717780 │ ██████████████▎ │
│ PURLEY │ CROYDON │ 575 │ 716108 │ ██████████████▎ │
│ WELWYN │ WELWYN HATFIELD │ 222 │ 710603 │ ██████████████▏ │
│ RICKMANSWORTH │ THREE RIVERS │ 798 │ 704571 │ ██████████████ │
│ BANSTEAD │ REIGATE AND BANSTEAD │ 401 │ 701293 │ ██████████████ │
│ CHIGWELL │ EPPING FOREST │ 261 │ 701203 │ ██████████████ │
│ PINNER │ HARROW │ 528 │ 698885 │ █████████████▊ │
│ HASLEMERE │ WAVERLEY │ 280 │ 696659 │ █████████████▊ │
│ SLOUGH │ BUCKINGHAMSHIRE │ 396 │ 694917 │ █████████████▊ │
│ WALTON-ON-THAMES │ ELMBRIDGE │ 946 │ 692395 │ █████████████▋ │
│ READING │ SOUTH OXFORDSHIRE │ 318 │ 691988 │ █████████████▋ │
│ NORTHWOOD │ HILLINGDON │ 271 │ 690643 │ █████████████▋ │
│ FELTHAM │ HOUNSLOW │ 763 │ 688595 │ █████████████▋ │
│ ASHTEAD │ MOLE VALLEY │ 303 │ 687923 │ █████████████▋ │
│ BARNET │ BARNET │ 975 │ 686980 │ █████████████▋ │
│ WOKING │ SURREY HEATH │ 283 │ 686669 │ █████████████▋ │
│ MALMESBURY │ WILTSHIRE │ 323 │ 683324 │ █████████████▋ │
│ AMERSHAM │ BUCKINGHAMSHIRE │ 496 │ 680962 │ █████████████▌ │
│ CHISLEHURST │ BROMLEY │ 430 │ 680209 │ █████████████▌ │
│ HYTHE │ FOLKESTONE AND HYTHE │ 490 │ 676908 │ █████████████▌ │
│ MAYFIELD │ WEALDEN │ 101 │ 676210 │ █████████████▌ │
│ ASCOT │ BRACKNELL FOREST │ 168 │ 676004 │ █████████████▌ │
└──────────────────────┴────────────────────────┴──────┴─────────┴────────────────────────────────────────────────────────────────────┘
```response
No projection: 100 rows in set. Elapsed: 0.928 sec. Processed 27.45 million rows, 103.80 MB (29.56 million rows/s., 111.80 MB/s.)
With projection: 100 rows in set. Elapsed: 0.336 sec. Processed 17.32 thousand rows, 1.23 MB (51.61 thousand rows/s., 3.65 MB/s.)
```
### Summary {#summary}
All 3 queries work much faster and read fewer rows.
```text
Query 1
no projection: 27 rows in set. Elapsed: 0.158 sec. Processed 26.32 million rows, 157.93 MB (166.57 million rows/s., 999.39 MB/s.)
projection: 27 rows in set. Elapsed: 0.007 sec. Processed 105.96 thousand rows, 3.33 MB (14.58 million rows/s., 458.13 MB/s.)
Query 2
no projection: 27 rows in set. Elapsed: 0.163 sec. Processed 26.32 million rows, 80.01 MB (161.75 million rows/s., 491.64 MB/s.)
projection: 27 rows in set. Elapsed: 0.008 sec. Processed 105.96 thousand rows, 3.67 MB (13.29 million rows/s., 459.89 MB/s.)
Query 3
no projection: 100 rows in set. Elapsed: 0.069 sec. Processed 26.32 million rows, 62.47 MB (382.13 million rows/s., 906.93 MB/s.)
projection: 100 rows in set. Elapsed: 0.029 sec. Processed 8.08 thousand rows, 511.08 KB (276.06 thousand rows/s., 17.47 MB/s.)
```
### Test It in Playground {#playground}
### Test it in the Playground {#playground}
The dataset is also available in the [Online Playground](https://play.clickhouse.com/play?user=play#U0VMRUNUIHRvd24sIGRpc3RyaWN0LCBjb3VudCgpIEFTIGMsIHJvdW5kKGF2ZyhwcmljZSkpIEFTIHByaWNlLCBiYXIocHJpY2UsIDAsIDUwMDAwMDAsIDEwMCkgRlJPTSB1a19wcmljZV9wYWlkIFdIRVJFIGRhdGUgPj0gJzIwMjAtMDEtMDEnIEdST1VQIEJZIHRvd24sIGRpc3RyaWN0IEhBVklORyBjID49IDEwMCBPUkRFUiBCWSBwcmljZSBERVNDIExJTUlUIDEwMA==).

View File

@ -0,0 +1,26 @@
---
slug: /en/getting-started/example-datasets/
sidebar_position: 0
sidebar_label: Overview
keywords: [clickhouse, install, tutorial, sample, datasets]
pagination_next: 'en/tutorial'
---
# Tutorials and Example Datasets
We have a lot of resources for helping you get started and learn how ClickHouse works:
- If you need to get ClickHouse up and running, check out our [Quick Start](../quick-start.mdx)
- The [ClickHouse Tutorial](../tutorial.md) analyzes a dataset of New York City taxi rides
In addition, the sample datasets provide a great experience of working with ClickHouse,
learning important techniques and tricks, and seeing how to take advantage of the many powerful
functions in ClickHouse. The sample datasets include:
- The [UK Property Price Paid dataset](../getting-started/example-datasets/uk-price-paid.md) is a good starting point with some interesting SQL queries
- The [New York Taxi Data](../getting-started/example-datasets/nyc-taxi.md) has an example of how to insert data from S3 into ClickHouse
- The [Cell Towers dataset](../getting-started/example-datasets/cell-towers.md) imports a CSV into ClickHouse
- The [NYPD Complaint Data](../getting-started/example-datasets/nypd_complaint_data.md) demonstrates how to use data inference to simplify creating tables
- The ["What's on the Menu?" dataset](../getting-started/example-datasets/menus.md) has an example of denormalizing data
View the **Tutorials and Datasets** menu for a complete list of sample datasets.

View File

@ -1,13 +1,34 @@
---
sidebar_label: Installation
sidebar_position: 1
keywords: [clickhouse, install, installation, docs]
description: ClickHouse can run on any Linux, FreeBSD, or Mac OS X with x86_64, AArch64, or PowerPC64LE CPU architecture.
slug: /en/getting-started/install
title: Installation
sidebar_label: Install
keywords: [clickhouse, install, getting started, quick start]
slug: /en/install
---
## System Requirements {#system-requirements}
# Installing ClickHouse
You have two options for getting up and running with ClickHouse:
- **[ClickHouse Cloud](https://clickhouse.cloud/):** the official ClickHouse as a service, built, maintained, and supported by the creators of ClickHouse
- **Self-managed ClickHouse:** ClickHouse can run on any Linux, FreeBSD, or Mac OS X with x86_64, AArch64, or PowerPC64LE CPU architecture
## ClickHouse Cloud
The quickest and easiest way to get up and running with ClickHouse is to create a new service in [ClickHouse Cloud](https://clickhouse.cloud/):
<div class="eighty-percent">
![Create a ClickHouse Cloud service](@site/docs/en/_snippets/images/createservice1.png)
</div>
Once your Cloud service is provisioned, you will be able to [connect to it](/docs/en/integrations/connect-a-client.md) and start [inserting data](/docs/en/integrations/data-ingestion.md).
:::note
The [Quick Start](/docs/en/quick-start.mdx) walks through the steps to get a ClickHouse Cloud service up and running, connecting to it, and inserting data.
:::
## Self-Managed Requirements
### CPU Architecture
ClickHouse can run on any Linux, FreeBSD, or Mac OS X with x86_64, AArch64, or PowerPC64LE CPU architecture.
@ -19,6 +40,55 @@ $ grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not
To run ClickHouse on processors that do not support SSE 4.2 or have AArch64 or PowerPC64LE architecture, you should [build ClickHouse from sources](#from-sources) with proper configuration adjustments.
ClickHouse implements parallel data processing and uses all the hardware resources available. When choosing a processor, take into account that ClickHouse works more efficiently at configurations with a large number of cores but a lower clock rate than at configurations with fewer cores and a higher clock rate. For example, 16 cores with 2600 MHz is preferable to 8 cores with 3600 MHz.
It is recommended to use **Turbo Boost** and **hyper-threading** technologies. It significantly improves performance with a typical workload.
### RAM {#ram}
We recommend using a minimum of 4GB of RAM to perform non-trivial queries. The ClickHouse server can run with a much smaller amount of RAM, but it requires memory for processing queries.
The required volume of RAM depends on:
- The complexity of queries.
- The amount of data that is processed in queries.
To calculate the required volume of RAM, you should estimate the size of temporary data for [GROUP BY](/docs/en/sql-reference/statements/select/group-by.md#select-group-by-clause), [DISTINCT](/docs/en/sql-reference/statements/select/distinct.md#select-distinct), [JOIN](/docs/en/sql-reference/statements/select/join.md#select-join) and other operations you use.
ClickHouse can use external memory for temporary data. See [GROUP BY in External Memory](/docs/en/sql-reference/statements/select/group-by.md#select-group-by-in-external-memory) for details.
### Swap File {#swap-file}
Disable the swap file for production environments.
### Storage Subsystem {#storage-subsystem}
You need to have 2GB of free disk space to install ClickHouse.
The volume of storage required for your data should be calculated separately. Assessment should include:
- Estimation of the data volume.
You can take a sample of the data and get the average size of a row from it. Then multiply the value by the number of rows you plan to store.
- The data compression coefficient.
To estimate the data compression coefficient, load a sample of your data into ClickHouse, and compare the actual size of the data with the size of the table stored. For example, clickstream data is usually compressed by 6-10 times.
To calculate the final volume of data to be stored, apply the compression coefficient to the estimated data volume. If you plan to store data in several replicas, then multiply the estimated volume by the number of replicas.
### Network {#network}
If possible, use networks of 10G or higher class.
The network bandwidth is critical for processing distributed queries with a large amount of intermediate data. Besides, network speed affects replication processes.
### Software {#software}
ClickHouse is developed primarily for the Linux family of operating systems. The recommended Linux distribution is Ubuntu. The `tzdata` package should be installed in the system.
## Self-Managed Install
## Available Installation Options {#available-installation-options}
### From DEB Packages {#install-from-deb-packages}
@ -58,9 +128,9 @@ clickhouse-client # or "clickhouse-client --password" if you set up a password.
</details>
You can replace `stable` with `lts` to use different [release kinds](../faq/operations/production.md) based on your needs.
You can replace `stable` with `lts` to use different [release kinds](/docs/en/faq/operations/production.md) based on your needs.
You can also download and install packages manually from [here](https://packages.clickhouse.com/deb/pool/stable).
You can also download and install packages manually from [here](https://packages.clickhouse.com/deb/pool/main/c/).
#### Packages {#packages}
@ -105,7 +175,7 @@ clickhouse-client # or "clickhouse-client --password" if you set up a password.
</details>
You can replace `stable` with `lts` to use different [release kinds](../faq/operations/production.md) based on your needs.
You can replace `stable` with `lts` to use different [release kinds](/docs/en/faq/operations/production.md) based on your needs.
Then run these commands to install packages:
@ -226,7 +296,7 @@ Use the `clickhouse client` to connect to the server, or `clickhouse local` to p
### From Sources {#from-sources}
To manually compile ClickHouse, follow the instructions for [Linux](../development/build.md) or [Mac OS X](../development/build-osx.md).
To manually compile ClickHouse, follow the instructions for [Linux](/docs/en/development/build.md) or [Mac OS X](/docs/en/development/build-osx.md).
You can compile packages and install them or use programs without installing packages. Also by building manually you can disable SSE 4.2 requirement or build for AArch64 CPUs.
@ -281,7 +351,7 @@ If the configuration file is in the current directory, you do not need to specif
ClickHouse supports access restriction settings. They are located in the `users.xml` file (next to `config.xml`).
By default, access is allowed from anywhere for the `default` user, without a password. See `user/default/networks`.
For more information, see the section [“Configuration Files”](../operations/configuration-files.md).
For more information, see the section [“Configuration Files”](/docs/en/operations/configuration-files.md).
After launching server, you can use the command-line client to connect to it:
@ -292,7 +362,7 @@ $ clickhouse-client
By default, it connects to `localhost:9000` on behalf of the user `default` without a password. It can also be used to connect to a remote server using `--host` argument.
The terminal must use UTF-8 encoding.
For more information, see the section [“Command-line client”](../interfaces/cli.md).
For more information, see the section [“Command-line client”](/docs/en/interfaces/cli.md).
Example:
@ -317,6 +387,5 @@ SELECT 1
**Congratulations, the system works!**
To continue experimenting, you can download one of the test data sets or go through [tutorial](./../tutorial.md).
To continue experimenting, you can download one of the test data sets or go through the [tutorial](/docs/en/tutorial.md).
[Original article](https://clickhouse.com/docs/en/getting_started/install/) <!--hide-->

View File

@ -3,6 +3,7 @@ slug: /en/interfaces/cli
sidebar_position: 17
sidebar_label: Command-Line Client
---
import ConnectionDetails from '@site/docs/en/_snippets/_gather_your_details_native.md';
# Command-line Client
@ -24,26 +25,76 @@ Connected to ClickHouse server version 20.13.1 revision 54442.
Different client and server versions are compatible with one another, but some features may not be available in older clients. We recommend using the same version of the client as the server app. When you use a client that is older than the server, `clickhouse-client` displays the message:
```response
ClickHouse client version is older than ClickHouse server. It may lack support for new features.
ClickHouse client version is older than ClickHouse server.
It may lack support for new features.
```
## Usage {#cli_usage}
The client can be used in interactive and non-interactive (batch) mode. To use batch mode, specify the query parameter, or send data to stdin (it verifies that stdin is not a terminal), or both. Similar to the HTTP interface, when using the query parameter and sending data to stdin, the request is a concatenation of the query parameter, a line feed, and the data in stdin. This is convenient for large INSERT queries.
The client can be used in interactive and non-interactive (batch) mode.
Example of using the client to insert data:
### Gather your connection details
<ConnectionDetails />
### Interactive
To connect to your ClickHouse Cloud service, or any ClickHouse server using TLS and passwords, interactively use `--secure`, port 9440, and provide your username and password:
```bash
clickhouse-client --host <HOSTNAME> \
--secure \
--port 9440 \
--user <USERNAME> \
--password <PASSWORD>
```
To connect to a self-managed ClickHouse server you will need the details for that server. Whether or not TLS is used, port numbers, and passwords are all configurable. Use the above example for ClickHouse Cloud as a starting point.
### Batch
To use batch mode, specify the query parameter, or send data to stdin (it verifies that stdin is not a terminal), or both. Similar to the HTTP interface, when using the query parameter and sending data to stdin, the request is a concatenation of the query parameter, a line feed, and the data in stdin. This is convenient for large INSERT queries.
Examples of using the client to insert data:
#### Inserting a CSV file into a remote ClickHouse service
This example is appropriate for ClickHouse Cloud, or any ClickHouse server using TLS and a password. In this example, a sample dataset CSV file, `cell_towers.csv`, is inserted into an existing table `cell_towers` in the `default` database:
```bash
clickhouse-client --host HOSTNAME.clickhouse.cloud \
--secure \
--port 9440 \
--user default \
--password PASSWORD \
--query "INSERT INTO cell_towers FORMAT CSVWithNames" \
< cell_towers.csv
```
:::note
To concentrate on the query syntax, the rest of the examples leave off the connection details (`--host`, `--port`, etc.). Add them in when you try the commands.
:::
#### Three different ways of inserting data
``` bash
$ echo -ne "1, 'some text', '2016-08-14 00:00:00'\n2, 'some more text', '2016-08-14 00:00:01'" | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV";
echo -ne "1, 'some text', '2016-08-14 00:00:00'\n2, 'some more text', '2016-08-14 00:00:01'" | \
clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV";
```
$ cat <<_EOF | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV";
```bash
cat <<_EOF | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV";
3, 'some text', '2016-08-14 00:00:00'
4, 'some more text', '2016-08-14 00:00:01'
_EOF
$ cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV";
```
```bash
cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV";
```
### Notes
In batch mode, the default data format is TabSeparated. You can set the format in the FORMAT clause of the query.
By default, you can only process a single query in batch mode. To make multiple queries from a “script,” use the `--multiquery` parameter. This works for all queries except INSERT. Query results are output consecutively without additional separators. Similarly, to process a large number of queries, you can run clickhouse-client for each query. Note that it may take tens of milliseconds to launch the clickhouse-client program.

View File

@ -6,16 +6,32 @@ sidebar_label: MySQL Interface
# MySQL Interface
ClickHouse supports MySQL wire protocol. It can be enabled by [mysql_port](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-mysql_port) setting in configuration file:
ClickHouse supports MySQL wire protocol. To enable the MySQL wire protocol, add the [mysql_port](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-mysql_port) setting to your server's configuration file. For example, you could define the port in a new XML file in your `config.d` folder:
``` xml
<mysql_port>9004</mysql_port>
<clickhouse>
<mysql_port>9004</mysql_port>
</clickhouse>
```
Example of connecting using command-line tool `mysql`:
Start up your ClickHouse server and look for a log message similar to the following that mentions "Listening for MySQL compatibility protocol":
```
{} <Information> Application: Listening for MySQL compatibility protocol: 127.0.0.1:9004
```
## Connect mysql to ClickHouse
The following command demonstrates how to connect the MySQL client `mysql` to ClickHouse:
```bash
mysql --protocol tcp -h [hostname] -u [username] -P [port_number] [database_name]
```
For example:
``` bash
$ mysql --protocol tcp -u default -P 9004
$ mysql --protocol tcp -h 127.0.0.1 -u default -P 9004 default
```
Output if a connection succeeded:

View File

@ -5,6 +5,9 @@ sidebar_label: ClickHouse Keeper
---
# ClickHouse Keeper
import SelfManaged from '@site/docs/en/_snippets/_self_managed_only_automated.md';
<SelfManaged />
ClickHouse Keeper provides the coordination system for data [replication](../engines/table-engines/mergetree-family/replication.md) and [distributed DDL](../sql-reference/distributed-ddl.md) queries execution. ClickHouse Keeper is compatible with ZooKeeper.

View File

@ -3,7 +3,11 @@ slug: /en/operations/external-authenticators/
sidebar_position: 48
sidebar_label: External User Authenticators and Directories
title: "External User Authenticators and Directories"
pagination_next: 'en/operations/external-authenticators/kerberos'
---
import SelfManaged from '@site/docs/en/_snippets/_self_managed_only_no_roadmap.md';
<SelfManaged />
ClickHouse supports authenticating and managing users using external services.

View File

@ -2,6 +2,9 @@
slug: /en/operations/external-authenticators/kerberos
---
# Kerberos
import SelfManaged from '@site/docs/en/_snippets/_self_managed_only_no_roadmap.md';
<SelfManaged />
Existing and properly configured ClickHouse users can be authenticated via Kerberos authentication protocol.

View File

@ -2,6 +2,9 @@
slug: /en/operations/external-authenticators/ldap
title: "LDAP"
---
import SelfManaged from '@site/docs/en/_snippets/_self_managed_only_no_roadmap.md';
<SelfManaged />
LDAP server can be used to authenticate ClickHouse users. There are two different approaches for doing this:

View File

@ -2,6 +2,9 @@
slug: /en/operations/external-authenticators/ssl-x509
title: "SSL X.509 certificate authentication"
---
import SelfManaged from '@site/docs/en/_snippets/_self_managed_only_no_roadmap.md';
<SelfManaged />
[SSL 'strict' option](../server-configuration-parameters/settings.md#server_configuration_parameters-openssl) enables mandatory certificate validation for the incoming connections. In this case, only connections with trusted certificates can be established. Connections with untrusted certificates will be rejected. Thus, certificate validation allows to uniquely authenticate an incoming connection. `Common Name` field of the certificate is used to identify connected user. This allows to associate multiple certificates with the same user. Additionally, reissuing and revoking of the certificates does not affect the ClickHouse configuration.

View File

@ -5,6 +5,9 @@ sidebar_label: Monitoring
---
# Monitoring
import SelfManaged from '@site/docs/en/_snippets/_self_managed_only_automated.md';
<SelfManaged />
You can monitor:

View File

@ -3,9 +3,12 @@ slug: /en/operations/optimizing-performance/sampling-query-profiler
sidebar_position: 54
sidebar_label: Query Profiling
---
import SelfManaged from '@site/docs/en/_snippets/_self_managed_only_no_roadmap.md';
# Sampling Query Profiler
<SelfManaged />
ClickHouse runs a sampling profiler that allows analyzing query execution. Using the profiler, you can find the source code routines that are used most frequently during query execution. You can trace CPU time and wall-clock time spent, including idle time.
To use profiler:

View File

@ -5,6 +5,10 @@ sidebar_label: Testing Hardware
title: "How to Test Your Hardware with ClickHouse"
---
import SelfManaged from '@site/docs/en/_snippets/_self_managed_only_no_roadmap.md';
<SelfManaged />
You can run a basic ClickHouse performance test on any server without installation of ClickHouse packages.

View File

@ -1,60 +0,0 @@
---
slug: /en/operations/requirements
sidebar_position: 44
sidebar_label: Requirements
---
# Requirements
## CPU
For installation from prebuilt deb packages, use a CPU with x86_64 architecture and support for SSE 4.2 instructions. To run ClickHouse with processors that do not support SSE 4.2 or have AArch64 or PowerPC64LE architecture, you should build ClickHouse from sources.
ClickHouse implements parallel data processing and uses all the hardware resources available. When choosing a processor, take into account that ClickHouse works more efficiently at configurations with a large number of cores but a lower clock rate than at configurations with fewer cores and a higher clock rate. For example, 16 cores with 2600 MHz is preferable to 8 cores with 3600 MHz.
It is recommended to use **Turbo Boost** and **hyper-threading** technologies. It significantly improves performance with a typical workload.
## RAM {#ram}
We recommend using a minimum of 4GB of RAM to perform non-trivial queries. The ClickHouse server can run with a much smaller amount of RAM, but it requires memory for processing queries.
The required volume of RAM depends on:
- The complexity of queries.
- The amount of data that is processed in queries.
To calculate the required volume of RAM, you should estimate the size of temporary data for [GROUP BY](../sql-reference/statements/select/group-by.md#select-group-by-clause), [DISTINCT](../sql-reference/statements/select/distinct.md#select-distinct), [JOIN](../sql-reference/statements/select/join.md#select-join) and other operations you use.
ClickHouse can use external memory for temporary data. See [GROUP BY in External Memory](../sql-reference/statements/select/group-by.md#select-group-by-in-external-memory) for details.
## Swap File {#swap-file}
Disable the swap file for production environments.
## Storage Subsystem {#storage-subsystem}
You need to have 2GB of free disk space to install ClickHouse.
The volume of storage required for your data should be calculated separately. Assessment should include:
- Estimation of the data volume.
You can take a sample of the data and get the average size of a row from it. Then multiply the value by the number of rows you plan to store.
- The data compression coefficient.
To estimate the data compression coefficient, load a sample of your data into ClickHouse, and compare the actual size of the data with the size of the table stored. For example, clickstream data is usually compressed by 6-10 times.
To calculate the final volume of data to be stored, apply the compression coefficient to the estimated data volume. If you plan to store data in several replicas, then multiply the estimated volume by the number of replicas.
## Network {#network}
If possible, use networks of 10G or higher class.
The network bandwidth is critical for processing distributed queries with a large amount of intermediate data. Besides, network speed affects replication processes.
## Software {#software}
ClickHouse is developed primarily for the Linux family of operating systems. The recommended Linux distribution is Ubuntu. The `tzdata` package should be installed in the system.
ClickHouse can also work in other operating system families. See details in the [install guide](../getting-started/install.md) section of the documentation.

View File

@ -2,6 +2,7 @@
slug: /en/operations/server-configuration-parameters/
sidebar_position: 54
sidebar_label: Server Configuration Parameters
pagination_next: en/operations/server-configuration-parameters/settings
---
# Server Configuration Parameters

View File

@ -666,6 +666,7 @@ Keys:
- `http_proxy` - Configure HTTP proxy for sending crash reports.
- `debug` - Sets the Sentry client into debug mode.
- `tmp_path` - Filesystem path for temporary crash report state.
- `environment` - An arbitrary name of an environment in which the ClickHouse server is running. It will be mentioned in each crash report. The default value is `test` or `prod` depending on the version of ClickHouse.
**Recommended way to use**

View File

@ -2,6 +2,7 @@
sidebar_label: Settings
sidebar_position: 51
slug: /en/operations/settings/
pagination_next: en/operations/settings/settings
---
# Settings Overview

View File

@ -35,7 +35,7 @@ Structure of the `users` section:
<database_name>
<table_name>
<filter>expression</filter>
<table_name>
</table_name>
</database_name>
</databases>
</user_name>

View File

@ -668,7 +668,7 @@ log_query_views=1
## log_formatted_queries {#settings-log-formatted-queries}
Allows to log formatted queries to the [system.query_log](../../operations/system-tables/query_log.md) system table.
Allows logging formatted queries to the [system.query_log](../../operations/system-tables/query_log.md) system table (populates the `formatted_query` column in [system.query_log](../../operations/system-tables/query_log.md)).
Possible values:

View File

@ -5,6 +5,9 @@ sidebar_label: Secured Communication with Zookeeper
---
# Optional secured communication between ClickHouse and Zookeeper
import SelfManaged from '@site/docs/en/_snippets/_self_managed_only_automated.md';
<SelfManaged />
You should specify `ssl.keyStore.location`, `ssl.keyStore.password` and `ssl.trustStore.location`, `ssl.trustStore.password` for communication with ClickHouse client over SSL. These options are available from Zookeeper version 3.5.2.

View File

@ -11,6 +11,7 @@ Columns:
- `path` ([String](../../sql-reference/data-types/string.md)) — Path to the mount point in the file system.
- `free_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Free space on disk in bytes.
- `total_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Disk volume in bytes.
- `unreserved_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Free space which is not taken by reservations (`free_space` minus the size of reservations taken by merges, inserts, and other disk write operations currently running).
- `keep_free_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Amount of disk space that should stay free on disk in bytes. Defined in the `keep_free_space_bytes` parameter of disk configuration.
**Example**

View File

@ -4,6 +4,9 @@ sidebar_position: 58
sidebar_label: Usage Recommendations
title: "Usage Recommendations"
---
import SelfManaged from '@site/docs/en/_snippets/_self_managed_only_automated.md';
<SelfManaged />
## CPU Scaling Governor

View File

@ -294,6 +294,53 @@ Result:
Notice how only a portion of the data was properly decrypted, and the rest is gibberish since either `mode`, `key`, or `iv` were different upon encryption.
## tryDecrypt
Similar to `decrypt`, but returns NULL if decryption fails because of using the wrong key.
**Examples**
Let's create a table where `user_id` is the unique user id, `encrypted` is an encrypted string field, and `iv` is the initialization vector used for encryption/decryption. Assume that users know their id and the key to decrypt the encrypted field:
```sql
CREATE TABLE decrypt_null (
dt DateTime,
user_id UInt32,
encrypted String,
iv String
) ENGINE = Memory;
```
Insert some data:
```sql
INSERT INTO decrypt_null VALUES
('2022-08-02 00:00:00', 1, encrypt('aes-256-gcm', 'value1', 'keykeykeykeykeykeykeykeykeykey01', 'iv1'), 'iv1'),
('2022-09-02 00:00:00', 2, encrypt('aes-256-gcm', 'value2', 'keykeykeykeykeykeykeykeykeykey02', 'iv2'), 'iv2'),
('2022-09-02 00:00:01', 3, encrypt('aes-256-gcm', 'value3', 'keykeykeykeykeykeykeykeykeykey03', 'iv3'), 'iv3');
```
Query:
```sql
SELECT
dt,
user_id,
tryDecrypt('aes-256-gcm', encrypted, 'keykeykeykeykeykeykeykeykeykey02', iv) AS value
FROM decrypt_null
ORDER BY user_id ASC
```
Result:
```
┌──────────────────dt─┬─user_id─┬─value──┐
│ 2022-08-02 00:00:00 │ 1 │ ᴺᵁᴸᴸ │
│ 2022-09-02 00:00:00 │ 2 │ value2 │
│ 2022-09-02 00:00:01 │ 3 │ ᴺᵁᴸᴸ │
└─────────────────────┴─────────┴────────┘
```
## aes_decrypt_mysql
Compatible with mysql encryption and decrypts data encrypted with [AES_ENCRYPT](https://dev.mysql.com/doc/refman/8.0/en/encryption-functions.html#function_aes-encrypt) function.

View File

@ -10,7 +10,7 @@ Creates new [roles](../../../operations/access-rights.md#role-management). Role
Syntax:
``` sql
CREATE ROLE [IF NOT EXISTS | OR REPLACE] name1 [, name2 ...]
CREATE ROLE [IF NOT EXISTS | OR REPLACE] name1 [ON CLUSTER cluster_name1] [, name2 [ON CLUSTER cluster_name2] ...]
[SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [CONST|READONLY|WRITABLE|CHANGEABLE_IN_READONLY] | PROFILE 'profile_name'] [,...]
```

View File

@ -13,7 +13,7 @@ Creates a new view. Views can be [normal](#normal-view), [materialized](#materia
Syntax:
``` sql
CREATE [OR REPLACE] VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] AS SELECT ...
CREATE [OR REPLACE] VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster_name] AS SELECT ...
```
Normal views do not store any data. They just perform a read from another table on each access. In other words, a normal view is nothing more than a saved query. When reading from a view, this saved query is used as a subquery in the [FROM](../../../sql-reference/statements/select/from.md) clause.

View File

@ -430,9 +430,9 @@ FROM
### Cumulative sum.
```sql
CREATE TABLE events
CREATE TABLE warehouse
(
`metric` String,
`item` String,
`ts` DateTime,
`value` Float
)

View File

@ -624,6 +624,7 @@ ClickHouse поддерживает динамическое изменение
- `http_proxy` - Настройка HTTP proxy для отсылки отчетов о сбоях.
- `debug` - Настроить клиентскую библиотеку Sentry в debug режим.
- `tmp_path` - Путь в файловой системе для временного хранения состояния отчетов о сбоях перед отправкой на сервер Sentry.
- `environment` - Произвольное название среды, в которой запущен сервер ClickHouse, которое будет упомянуто в каждом отчете о сбое. По умолчанию имеет значение `test` или `prod` в зависимости от версии ClickHouse.
**Рекомендованные настройки**

View File

@ -11,5 +11,6 @@ Cодержит информацию о дисках, заданных в [ко
- `path` ([String](../../sql-reference/data-types/string.md)) — путь к точке монтирования в файловой системе.
- `free_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — свободное место на диске в байтах.
- `total_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — объём диска в байтах.
- `unreserved_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — не зарезервированное cвободное место в байтах (`free_space` минус размер места, зарезервированного на выполняемые в данный момент фоновые слияния, вставки и другие операции записи на диск).
- `keep_free_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — место, которое должно остаться свободным на диске в байтах. Задаётся значением параметра `keep_free_space_bytes` конфигурации дисков.

View File

@ -1053,6 +1053,7 @@ formatDateTime(Time, Format[, Timezone])
| %w | номер дня недели, начиная с воскресенья (0-6) | 2 |
| %y | год, последние 2 цифры (00-99) | 18 |
| %Y | год, 4 цифры | 2018 |
| %z | Смещение времени от UTC +HHMM или -HHMM | -0500 |
| %% | символ % | % |
**Пример**

View File

@ -11,7 +11,7 @@ sidebar_label: "Роль"
Синтаксис:
```sql
CREATE ROLE [IF NOT EXISTS | OR REPLACE] name1 [, name2 ...]
CREATE ROLE [IF NOT EXISTS | OR REPLACE] name1 [ON CLUSTER cluster_name1] [, name2 [ON CLUSTER cluster_name2] ...]
[SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [CONST|READONLY|WRITABLE|CHANGEABLE_IN_READONLY] | PROFILE 'profile_name'] [,...]
```
@ -47,4 +47,4 @@ SET ROLE accountant;
SELECT * FROM db.*;
```
<!--hide-->
<!--hide-->

View File

@ -11,7 +11,7 @@ sidebar_label: "Представление"
## Обычные представления {#normal}
``` sql
CREATE [OR REPLACE] VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] AS SELECT ...
CREATE [OR REPLACE] VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster_name] AS SELECT ...
```
Обычные представления не хранят никаких данных, они выполняют чтение данных из другой таблицы при каждом доступе. Другими словами, обычное представление — это не что иное, как сохраненный запрос. При чтении данных из представления этот сохраненный запрос используется как подзапрос в секции [FROM](../../../sql-reference/statements/select/from.md).

View File

@ -13,7 +13,7 @@ sidebar_label: VIEW
语法:
``` sql
CREATE [OR REPLACE] VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] AS SELECT ...
CREATE [OR REPLACE] VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster_name] AS SELECT ...
```
普通视图不存储任何数据。 他们只是在每次访问时从另一个表执行读取。换句话说,普通视图只不过是一个保存的查询。 从视图中读取时,此保存的查询用作[FROM](../../../sql-reference/statements/select/from.md)子句中的子查询.

View File

@ -189,7 +189,7 @@ else()
message(STATUS "ClickHouse su: OFF")
endif()
configure_file (config_tools.h.in ${ConfigIncludePath}/config_tools.h)
configure_file (config_tools.h.in ${CONFIG_INCLUDE_PATH}/config_tools.h)
macro(clickhouse_target_link_split_lib target name)
if(NOT CLICKHOUSE_ONE_SHARED)

View File

@ -12,10 +12,11 @@
#include <string>
#include "Client.h"
#include "Core/Protocol.h"
#include "Parsers/formatAST.h"
#include <base/find_symbols.h>
#include <Common/config_version.h>
#include "config_version.h"
#include <Common/Exception.h>
#include <Common/formatReadable.h>
#include <Common/TerminalSize.h>
@ -514,6 +515,66 @@ static bool queryHasWithClause(const IAST & ast)
return false;
}
std::optional<bool> Client::processFuzzingStep(const String & query_to_execute, const ASTPtr & parsed_query)
{
processParsedSingleQuery(query_to_execute, query_to_execute, parsed_query);
const auto * exception = server_exception ? server_exception.get() : client_exception.get();
// Sometimes you may get TOO_DEEP_RECURSION from the server,
// and TOO_DEEP_RECURSION should not fail the fuzzer check.
if (have_error && exception->code() == ErrorCodes::TOO_DEEP_RECURSION)
{
have_error = false;
server_exception.reset();
client_exception.reset();
return true;
}
if (have_error)
{
fmt::print(stderr, "Error on processing query '{}': {}\n", parsed_query->formatForErrorMessage(), exception->message());
// Try to reconnect after errors, for two reasons:
// 1. We might not have realized that the server died, e.g. if
// it sent us a <Fatal> trace and closed connection properly.
// 2. The connection might have gotten into a wrong state and
// the next query will get false positive about
// "Unknown packet from server".
try
{
connection->forceConnected(connection_parameters.timeouts);
}
catch (...)
{
// Just report it, we'll terminate below.
fmt::print(stderr,
"Error while reconnecting to the server: {}\n",
getCurrentExceptionMessage(true));
// The reconnection might fail, but we'll still be connected
// in the sense of `connection->isConnected() = true`,
// in case when the requested database doesn't exist.
// Disconnect manually now, so that the following code doesn't
// have any doubts, and the connection state is predictable.
connection->disconnect();
}
}
if (!connection->isConnected())
{
// Probably the server is dead because we found an assertion
// failure. Fail fast.
fmt::print(stderr, "Lost connection to the server.\n");
// Print the changed settings because they might be needed to
// reproduce the error.
printChangedSettings();
return false;
}
return std::nullopt;
}
/// Returns false when server is not available.
bool Client::processWithFuzzing(const String & full_query)
@ -558,18 +619,33 @@ bool Client::processWithFuzzing(const String & full_query)
// - SET -- The time to fuzz the settings has not yet come
// (see comments in Client/QueryFuzzer.cpp)
size_t this_query_runs = query_fuzzer_runs;
if (orig_ast->as<ASTInsertQuery>() ||
orig_ast->as<ASTCreateQuery>() ||
orig_ast->as<ASTDropQuery>() ||
orig_ast->as<ASTSetQuery>())
ASTs queries_for_fuzzed_tables;
if (orig_ast->as<ASTSetQuery>())
{
this_query_runs = 1;
}
else if (const auto * create = orig_ast->as<ASTCreateQuery>())
{
if (QueryFuzzer::isSuitableForFuzzing(*create))
this_query_runs = create_query_fuzzer_runs;
else
this_query_runs = 1;
}
else if (const auto * insert = orig_ast->as<ASTInsertQuery>())
{
this_query_runs = 1;
queries_for_fuzzed_tables = fuzzer.getInsertQueriesForFuzzedTables(full_query);
}
else if (const auto * drop = orig_ast->as<ASTDropQuery>())
{
this_query_runs = 1;
queries_for_fuzzed_tables = fuzzer.getDropQueriesForFuzzedTables(*drop);
}
String query_to_execute;
ASTPtr parsed_query;
ASTPtr fuzz_base = orig_ast;
for (size_t fuzz_step = 0; fuzz_step < this_query_runs; ++fuzz_step)
{
fmt::print(stderr, "Fuzzing step {} out of {}\n", fuzz_step, this_query_runs);
@ -630,9 +706,9 @@ bool Client::processWithFuzzing(const String & full_query)
continue;
}
parsed_query = ast_to_process;
query_to_execute = parsed_query->formatForErrorMessage();
processParsedSingleQuery(full_query, query_to_execute, parsed_query);
query_to_execute = ast_to_process->formatForErrorMessage();
if (auto res = processFuzzingStep(query_to_execute, ast_to_process))
return *res;
}
catch (...)
{
@ -645,60 +721,6 @@ bool Client::processWithFuzzing(const String & full_query)
have_error = true;
}
const auto * exception = server_exception ? server_exception.get() : client_exception.get();
// Sometimes you may get TOO_DEEP_RECURSION from the server,
// and TOO_DEEP_RECURSION should not fail the fuzzer check.
if (have_error && exception->code() == ErrorCodes::TOO_DEEP_RECURSION)
{
have_error = false;
server_exception.reset();
client_exception.reset();
return true;
}
if (have_error)
{
fmt::print(stderr, "Error on processing query '{}': {}\n", ast_to_process->formatForErrorMessage(), exception->message());
// Try to reconnect after errors, for two reasons:
// 1. We might not have realized that the server died, e.g. if
// it sent us a <Fatal> trace and closed connection properly.
// 2. The connection might have gotten into a wrong state and
// the next query will get false positive about
// "Unknown packet from server".
try
{
connection->forceConnected(connection_parameters.timeouts);
}
catch (...)
{
// Just report it, we'll terminate below.
fmt::print(stderr,
"Error while reconnecting to the server: {}\n",
getCurrentExceptionMessage(true));
// The reconnection might fail, but we'll still be connected
// in the sense of `connection->isConnected() = true`,
// in case when the requested database doesn't exist.
// Disconnect manually now, so that the following code doesn't
// have any doubts, and the connection state is predictable.
connection->disconnect();
}
}
if (!connection->isConnected())
{
// Probably the server is dead because we found an assertion
// failure. Fail fast.
fmt::print(stderr, "Lost connection to the server.\n");
// Print the changed settings because they might be needed to
// reproduce the error.
printChangedSettings();
return false;
}
// Check that after the query is formatted, we can parse it back,
// format again and get the same result. Unfortunately, we can't
// compare the ASTs, which would be more sensitive to errors. This
@ -729,13 +751,12 @@ bool Client::processWithFuzzing(const String & full_query)
// query, but second and third.
// If you have to add any more workarounds to this check, just remove
// it altogether, it's not so useful.
if (parsed_query && !have_error && !queryHasWithClause(*parsed_query))
if (ast_to_process && !have_error && !queryHasWithClause(*ast_to_process))
{
ASTPtr ast_2;
try
{
const auto * tmp_pos = query_to_execute.c_str();
ast_2 = parseQuery(tmp_pos, tmp_pos + query_to_execute.size(), false /* allow_multi_statements */);
}
catch (Exception & e)
@ -762,7 +783,7 @@ bool Client::processWithFuzzing(const String & full_query)
"Got the following (different) text after formatting the fuzzed query and parsing it back:\n'{}'\n, expected:\n'{}'\n",
text_3, text_2);
fmt::print(stderr, "In more detail:\n");
fmt::print(stderr, "AST-1 (generated by fuzzer):\n'{}'\n", parsed_query->dumpTree());
fmt::print(stderr, "AST-1 (generated by fuzzer):\n'{}'\n", ast_to_process->dumpTree());
fmt::print(stderr, "Text-1 (AST-1 formatted):\n'{}'\n", query_to_execute);
fmt::print(stderr, "AST-2 (Text-1 parsed):\n'{}'\n", ast_2->dumpTree());
fmt::print(stderr, "Text-2 (AST-2 formatted):\n'{}'\n", text_2);
@ -784,6 +805,7 @@ bool Client::processWithFuzzing(const String & full_query)
// so that it doesn't influence the exit code.
server_exception.reset();
client_exception.reset();
fuzzer.notifyQueryFailed(ast_to_process);
have_error = false;
}
else if (ast_to_process->formatForErrorMessage().size() > 500)
@ -800,6 +822,35 @@ bool Client::processWithFuzzing(const String & full_query)
}
}
for (const auto & query : queries_for_fuzzed_tables)
{
std::cout << std::endl;
WriteBufferFromOStream ast_buf(std::cout, 4096);
formatAST(*query, ast_buf, false /*highlight*/);
ast_buf.next();
std::cout << std::endl << std::endl;
try
{
query_to_execute = query->formatForErrorMessage();
if (auto res = processFuzzingStep(query_to_execute, query))
return *res;
}
catch (...)
{
client_exception = std::make_unique<Exception>(getCurrentExceptionMessage(print_stack_trace), getCurrentExceptionCode());
have_error = true;
}
if (have_error)
{
server_exception.reset();
client_exception.reset();
fuzzer.notifyQueryFailed(query);
have_error = false;
}
}
return true;
}
@ -834,6 +885,7 @@ void Client::addOptions(OptionsDescription & options_description)
("compression", po::value<bool>(), "enable or disable compression (enabled by default for remote communication and disabled for localhost communication).")
("query-fuzzer-runs", po::value<int>()->default_value(0), "After executing every SELECT query, do random mutations in it and run again specified number of times. This is used for testing to discover unexpected corner cases.")
("create-query-fuzzer-runs", po::value<int>()->default_value(0), "")
("interleave-queries-file", po::value<std::vector<std::string>>()->multitoken(),
"file path with queries to execute before every file from 'queries-file'; multiple files can be specified (--queries-file file1 file2...); this is needed to enable more aggressive fuzzing of newly added tests (see 'query-fuzzer-runs' option)")
@ -994,6 +1046,17 @@ void Client::processOptions(const OptionsDescription & options_description,
ignore_error = true;
}
if ((create_query_fuzzer_runs = options["create-query-fuzzer-runs"].as<int>()))
{
// Fuzzer implies multiquery.
config().setBool("multiquery", true);
// Ignore errors in parsing queries.
config().setBool("ignore-error", true);
global_context->setSetting("allow_suspicious_low_cardinality_types", true);
ignore_error = true;
}
if (options.count("opentelemetry-traceparent"))
{
String traceparent = options["opentelemetry-traceparent"].as<std::string>();

View File

@ -17,6 +17,7 @@ public:
protected:
bool processWithFuzzing(const String & full_query) override;
std::optional<bool> processFuzzingStep(const String & query_to_execute, const ASTPtr & parsed_query);
void connect() override;

View File

@ -1,6 +1,6 @@
#pragma once
/// This file was autogenerated by CMake
// .h autogenerated by cmake !
#pragma once
#cmakedefine01 ENABLE_CLICKHOUSE_SERVER
#cmakedefine01 ENABLE_CLICKHOUSE_CLIENT

View File

@ -927,7 +927,11 @@ namespace
executable.string(), config.string(), pid_file.string());
if (!user.empty())
command = fmt::format("clickhouse su '{}' {}", user, command);
{
/// sudo respects limits in /etc/security/limits.conf e.g. open files,
/// that's why we are using it instead of the 'clickhouse su' tool.
command = fmt::format("sudo -u '{}' {}", user, command);
}
fmt::print("Will run {}\n", command);
executeScript(command, true);

View File

@ -24,8 +24,8 @@
#include <pwd.h>
#include <Coordination/FourLetterCommand.h>
#include "config_core.h"
#include "Common/config_version.h"
#include "config.h"
#include "config_version.h"
#if USE_SSL
# include <Poco/Net/Context.h>

View File

@ -1,6 +1,6 @@
#pragma once
#include <Common/config.h>
#include "config.h"
#if USE_ODBC

View File

@ -2,7 +2,7 @@
#include <Interpreters/Context.h>
#include <Server/HTTP/HTTPRequestHandler.h>
#include <Common/config.h>
#include "config.h"
#include <Poco/Logger.h>
#if USE_ODBC

View File

@ -20,7 +20,7 @@
#include <Common/BridgeProtocolVersion.h>
#include <Common/logger_useful.h>
#include <Server/HTTP/HTMLForm.h>
#include <Common/config.h>
#include "config.h"
#include <mutex>
#include <memory>

View File

@ -1,7 +1,7 @@
#include "ODBCHandlerFactory.h"
#include "PingHandler.h"
#include "ColumnInfoHandler.h"
#include <Common/config.h>
#include "config.h"
#include <Poco/URI.h>
#include <Poco/Net/HTTPServerRequest.h>
#include <Common/logger_useful.h>

View File

@ -2,7 +2,7 @@
#include <Interpreters/Context.h>
#include <Server/HTTP/HTTPRequestHandler.h>
#include <Common/config.h>
#include "config.h"
#include <Poco/Logger.h>
#if USE_ODBC

View File

@ -1,6 +1,6 @@
#pragma once
#include <Common/config.h>
#include "config.h"
#if USE_ODBC

View File

@ -79,7 +79,9 @@
#include <Common/ThreadFuzzer.h>
#include <Common/getHashOfLoadedBinary.h>
#include <Common/filesystemHelpers.h>
#if USE_BORINGSSL
#include <Compression/CompressionCodecEncrypted.h>
#endif
#include <Server/MySQLHandlerFactory.h>
#include <Server/PostgreSQLHandlerFactory.h>
#include <Server/CertificateReloader.h>
@ -88,8 +90,8 @@
#include <Interpreters/AsynchronousInsertQueue.h>
#include <filesystem>
#include "config_core.h"
#include "Common/config_version.h"
#include "config.h"
#include "config_version.h"
#if defined(OS_LINUX)
# include <sys/mman.h>
@ -1264,8 +1266,9 @@ int Server::main(const std::vector<std::string> & /*args*/)
global_context->updateStorageConfiguration(*config);
global_context->updateInterserverCredentials(*config);
#if USE_BORINGSSL
CompressionCodecEncrypted::Configuration::instance().tryLoad(*config, "encryption_codecs");
#endif
#if USE_SSL
CertificateReloader::instance().tryLoad(*config);
#endif
@ -1418,8 +1421,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
global_context->setAsynchronousInsertQueue(std::make_shared<AsynchronousInsertQueue>(
global_context,
settings.async_insert_threads,
settings.async_insert_max_data_size,
AsynchronousInsertQueue::Timeout{.busy = settings.async_insert_busy_timeout_ms, .stale = settings.async_insert_stale_timeout_ms}));
settings.async_insert_cleanup_timeout_ms));
/// Size of cache for marks (index of MergeTree family of tables).
size_t mark_cache_size = config().getUInt64("mark_cache_size", 5368709120);
@ -1471,9 +1473,10 @@ int Server::main(const std::vector<std::string> & /*args*/)
global_context->getMergeTreeSettings().sanityCheck(background_pool_tasks);
global_context->getReplicatedMergeTreeSettings().sanityCheck(background_pool_tasks);
}
#if USE_BORINGSSL
/// try set up encryption. There are some errors in config, error will be printed and server wouldn't start.
CompressionCodecEncrypted::Configuration::instance().load(config(), "encryption_codecs");
#endif
SCOPE_EXIT({
/// Stop reloading of the main config. This must be done before `global_context->shutdown()` because

View File

@ -1173,6 +1173,18 @@
<flush_interval_milliseconds>7500</flush_interval_milliseconds>
</processors_profile_log>
<!-- Log of asynchronous inserts. It allows checking the status
of an insert query in fire-and-forget mode.
-->
<asynchronous_insert_log>
<database>system</database>
<table>asynchronous_insert_log</table>
<flush_interval_milliseconds>7500</flush_interval_milliseconds>
<partition_by>event_date</partition_by>
<ttl>event_date + INTERVAL 3 DAY</ttl>
</asynchronous_insert_log>
<!-- <top_level_domains_path>/var/lib/clickhouse/top_level_domains/</top_level_domains_path> -->
<!-- Custom TLD lists.
Format: <name>/path/to/file</name>

4
rust/BLAKE3/CMakeLists.txt Executable file
View File

@ -0,0 +1,4 @@
# Import the Rust crate described by the Cargo.toml in this directory.
# The crate is named `_ch_rust_blake3` in that manifest, which is the target
# name used below.
corrosion_import_crate(
    MANIFEST_PATH Cargo.toml
    NO_STD
)

# Consumers of the crate get the `include` directory on their include path.
target_include_directories(
    _ch_rust_blake3
    INTERFACE
        include
)

# Namespaced alias for linking from the rest of the build.
add_library(ch_rust::blake3 ALIAS _ch_rust_blake3)

92
rust/BLAKE3/Cargo.lock generated Normal file
View File

@ -0,0 +1,92 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "_ch_rust_blake3"
version = "0.1.0"
dependencies = [
"blake3",
"libc",
]
[[package]]
name = "arrayref"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544"
[[package]]
name = "arrayvec"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6"
[[package]]
name = "blake3"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "526c210b4520e416420759af363083471656e819a75e831b8d2c9d5a584f2413"
dependencies = [
"arrayref",
"arrayvec",
"cc",
"cfg-if",
"constant_time_eq",
"digest",
]
[[package]]
name = "cc"
version = "1.0.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "constant_time_eq"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc"
[[package]]
name = "digest"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066"
dependencies = [
"generic-array",
]
[[package]]
name = "generic-array"
version = "0.14.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9"
dependencies = [
"typenum",
"version_check",
]
[[package]]
name = "libc"
version = "0.2.132"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8371e4e5341c3a96db127eb2465ac681ced4c433e01dd0e938adbef26ba93ba5"
[[package]]
name = "typenum"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987"
[[package]]
name = "version_check"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"

13
rust/BLAKE3/Cargo.toml Normal file
View File

@ -0,0 +1,13 @@
# Manifest of the Rust crate that wraps the `blake3` crate behind
# `extern "C"` functions (see src/lib.rs).
[package]
name = "_ch_rust_blake3"
version = "0.1.0"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
# Hash implementation used by the exported functions.
blake3 = "1.2.0"
# Used by src/lib.rs for `memset`.
libc = "0.2.132"
[lib]
# Produce a static library.
crate-type = ["staticlib"]

View File

@ -0,0 +1,17 @@
// C declarations of the functions exported by the Rust crate in rust/BLAKE3/src/lib.rs.
#ifndef BLAKE3_H
#define BLAKE3_H
#include <cstdint>
extern "C" {
// Hashes the input starting at `begin` and writes the digest to `out_char_data`.
// Returns a null pointer on success; when `begin` is null it returns a heap-allocated,
// NUL-terminated error message that must be released with blake3_free_char_pointer().
// NOTE(review): the required size of `out_char_data` is blake3::OUT_LEN bytes on the Rust side
// (presumably 32) - confirm against the caller.
char *blake3_apply_shim(const char *begin, uint32_t _size, uint8_t *out_char_data);
// Variant intended for MemorySanitizer builds: hashes exactly `size` bytes starting at `begin`.
// Same return convention as blake3_apply_shim().
char *blake3_apply_shim_msan_compat(const char *begin, uint32_t size, uint8_t *out_char_data);
// Releases an error message previously returned by one of the functions above.
void blake3_free_char_pointer(char *ptr_to_free);
} // extern "C"
#endif /* BLAKE3_H */

55
rust/BLAKE3/src/lib.rs Normal file
View File

@ -0,0 +1,55 @@
extern crate blake3;
extern crate libc;
use std::ffi::{CStr, CString};
use std::os::raw::c_char;
use std::mem;
#[no_mangle]
pub unsafe extern "C" fn blake3_apply_shim(
begin: *const c_char,
_size: u32,
out_char_data: *mut u8,
) -> *mut c_char {
if begin.is_null() {
let err_str = CString::new("input was a null pointer").unwrap();
return err_str.into_raw();
}
let mut hasher = blake3::Hasher::new();
let input_bytes = CStr::from_ptr(begin);
let input_res = input_bytes.to_bytes();
hasher.update(input_res);
let mut reader = hasher.finalize_xof();
reader.fill(std::slice::from_raw_parts_mut(out_char_data, blake3::OUT_LEN));
std::ptr::null_mut()
}
/// Variant of `blake3_apply_shim` for MemorySanitizer builds.
///
/// Zeroes the whole output buffer through `libc::memset`, copies the `size`
/// input bytes into a Rust-owned buffer, hashes that copy and writes
/// `blake3::OUT_LEN` bytes of output to `out_char_data`.
///
/// Returns a null pointer on success. On failure (a null input or output
/// pointer) returns a heap-allocated C string with the error message; the
/// caller must release it with `blake3_free_char_pointer`.
///
/// # Safety
/// `begin` must be valid for reads of `size` bytes and `out_char_data` must
/// be valid for writes of `blake3::OUT_LEN` bytes.
#[no_mangle]
pub unsafe extern "C" fn blake3_apply_shim_msan_compat(
    mut begin: *const c_char,
    size: u32,
    out_char_data: *mut u8,
) -> *mut c_char {
    if begin.is_null() {
        let err_str = CString::new("input was a null pointer").unwrap();
        return err_str.into_raw();
    }
    if out_char_data.is_null() {
        let err_str = CString::new("output was a null pointer").unwrap();
        return err_str.into_raw();
    }

    // Clear the complete digest buffer, not just its first byte.
    libc::memset(
        out_char_data as *mut libc::c_void,
        0,
        blake3::OUT_LEN * mem::size_of::<u8>(),
    );

    let mut hasher = blake3::Hasher::new();

    // Copy the input byte by byte into an owned buffer; the capacity is
    // reserved up front so the vector does not reallocate while growing.
    let mut vec = Vec::<u8>::with_capacity(size as usize);
    for _ in 0..size {
        vec.push(*begin as u8);
        begin = begin.add(1);
    }

    hasher.update(vec.as_slice());
    let mut reader = hasher.finalize_xof();
    reader.fill(std::slice::from_raw_parts_mut(out_char_data, blake3::OUT_LEN));
    std::ptr::null_mut()
}
// Freeing memory according to docs: https://doc.rust-lang.org/std/ffi/struct.CString.html#method.into_raw
/// Releases an error message previously returned by one of the shim
/// functions (i.e. a pointer produced by `CString::into_raw`).
///
/// Passing a null pointer is a no-op: the shims return null on success, and
/// `CString::from_raw` must never be called with a pointer that did not come
/// from `CString::into_raw`.
///
/// # Safety
/// `ptr_to_free` must be null or a pointer obtained from `CString::into_raw`
/// that has not been freed yet.
#[no_mangle]
pub unsafe extern "C" fn blake3_free_char_pointer(ptr_to_free: *mut c_char) {
    if ptr_to_free.is_null() {
        return;
    }
    std::mem::drop(CString::from_raw(ptr_to_free));
}

1
rust/CMakeLists.txt Normal file
View File

@ -0,0 +1 @@
add_subdirectory (BLAKE3)

View File

@ -1,6 +1,6 @@
#pragma once
#include "config_core.h"
#include "config.h"
#include <Access/Credentials.h>
#include <base/types.h>

View File

@ -1,6 +1,6 @@
#pragma once
#include "config_core.h"
#include "config.h"
#include <base/types.h>

View File

@ -1,6 +1,6 @@
#pragma once
#include "config_core.h"
#include "config.h"
#include <base/types.h>

View File

@ -11,7 +11,7 @@
#include <AggregateFunctions/AggregateFunctionSum.h>
#include <Core/DecimalFunctions.h>
#include <Common/config.h>
#include "config.h"
#if USE_EMBEDDED_COMPILER
# include <llvm/IR/IRBuilder.h>

View File

@ -9,7 +9,7 @@
#include <AggregateFunctions/IAggregateFunction.h>
#include <Common/config.h>
#include "config.h"
#if USE_EMBEDDED_COMPILER
# include <llvm/IR/IRBuilder.h>

View File

@ -12,7 +12,7 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <Common/assert_cast.h>
#include <Common/config.h>
#include "config.h"
#if USE_EMBEDDED_COMPILER
# include <llvm/IR/IRBuilder.h>

View File

@ -5,7 +5,7 @@
#include <Common/assert_cast.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <Common/config.h>
#include "config.h"
#if USE_EMBEDDED_COMPILER
# include <llvm/IR/IRBuilder.h>

View File

@ -14,7 +14,7 @@
#include <DataTypes/DataTypeNullable.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <Common/config.h>
#include "config.h"
#if USE_EMBEDDED_COMPILER
# include <llvm/IR/IRBuilder.h>

View File

@ -10,7 +10,7 @@
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Common/config.h>
#include "config.h"
#if USE_EMBEDDED_COMPILER
# include <llvm/IR/IRBuilder.h>
@ -236,15 +236,8 @@ public:
if constexpr (result_is_nullable)
{
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
auto * aggregate_data_is_null_dst_value = b.CreateLoad(aggregate_data_dst_ptr);
auto * aggregate_data_is_null_src_value = b.CreateLoad(aggregate_data_src_ptr);
#ifdef __clang__
#pragma clang diagnostic pop
#endif
auto * aggregate_data_is_null_dst_value = b.CreateLoad(aggregate_data_dst_ptr->getType()->getPointerElementType(), aggregate_data_dst_ptr);
auto * aggregate_data_is_null_src_value = b.CreateLoad(aggregate_data_src_ptr->getType()->getPointerElementType(), aggregate_data_src_ptr);
auto * is_src_null = nativeBoolCast(b, std::make_shared<DataTypeUInt8>(), aggregate_data_is_null_src_value);
auto * is_null_result_value = b.CreateSelect(is_src_null, llvm::ConstantInt::get(b.getInt8Ty(), 1), aggregate_data_is_null_dst_value);

View File

@ -32,7 +32,7 @@ struct RankCorrelationData : public StatisticalSample<Float64, Float64>
std::tie(ranks_y, std::ignore) = computeRanksAndTieCorrection(this->y);
/// Sizes can be non-equal due to skipped NaNs.
const auto size = std::min(this->size_x, this->size_y);
const Float64 size = static_cast<Float64>(std::min(this->size_x, this->size_y));
/// Count d^2 sum
Float64 answer = 0;

View File

@ -13,7 +13,7 @@
#include <AggregateFunctions/IAggregateFunction.h>
#include <Common/config.h>
#include "config.h"
#include <Common/TargetSpecific.h>
#if USE_EMBEDDED_COMPILER

View File

@ -10,7 +10,7 @@
#include <Common/Exception.h>
#include <base/types.h>
#include "config_core.h"
#include "config.h"
#include <cstddef>
#include <memory>

View File

@ -1,6 +1,6 @@
#pragma once
#include <Common/config.h>
#include "config.h"
#if USE_DATASKETCHES

View File

@ -8,7 +8,7 @@
#include <Common/StringUtils/StringUtils.h>
#include <Common/SensitiveDataMasker.h>
#include <Common/config.h>
#include "config.h"
#include <Common/logger_useful.h>
#include <base/errnoToString.h>
#include <IO/ReadHelpers.h>

View File

@ -16,7 +16,7 @@
#include <base/range.h>
#include <BridgeHelper/IBridgeHelper.h>
#include <Common/config.h>
#include "config.h"
namespace DB

View File

@ -18,15 +18,11 @@ else()
endif()
include(../cmake/limit_jobs.cmake)
set (CONFIG_VERSION "${CMAKE_CURRENT_BINARY_DIR}/Common/config_version.h")
set (CONFIG_COMMON "${CMAKE_CURRENT_BINARY_DIR}/Common/config.h")
include (../cmake/version.cmake)
message (STATUS "Will build ${VERSION_FULL} revision ${VERSION_REVISION} ${VERSION_OFFICIAL}")
include (configure_config.cmake)
configure_file (Common/config.h.in ${CONFIG_COMMON})
configure_file (Common/config_version.h.in ${CONFIG_VERSION})
configure_file (Core/config_core.h.in "${CMAKE_CURRENT_BINARY_DIR}/Core/include/config_core.h")
configure_file (Common/config.h.in ${CONFIG_INCLUDE_PATH}/config.h)
configure_file (Common/config_version.h.in ${CONFIG_INCLUDE_PATH}/config_version.h)
if (USE_DEBUG_HELPERS)
get_target_property(MAGIC_ENUM_INCLUDE_DIR ch_contrib::magic_enum INTERFACE_INCLUDE_DIRECTORIES)
@ -75,7 +71,6 @@ add_subdirectory (AggregateFunctions)
add_subdirectory (Client)
add_subdirectory (TableFunctions)
add_subdirectory (Processors)
add_subdirectory (Formats)
add_subdirectory (Compression)
add_subdirectory (Server)
add_subdirectory (Coordination)
@ -153,7 +148,6 @@ else()
endif ()
list (APPEND clickhouse_common_io_sources ${CONFIG_BUILD})
list (APPEND clickhouse_common_io_headers ${CONFIG_VERSION} ${CONFIG_COMMON})
list (APPEND dbms_sources Functions/IFunction.cpp Functions/FunctionFactory.cpp Functions/FunctionHelpers.cpp Functions/extractTimeZoneFromFunctionArguments.cpp Functions/FunctionsLogical.cpp Functions/indexHint.cpp)
list (APPEND dbms_headers Functions/IFunction.h Functions/FunctionFactory.h Functions/FunctionHelpers.h Functions/extractTimeZoneFromFunctionArguments.h Functions/FunctionsLogical.h Functions/indexHint.h)
@ -247,7 +241,13 @@ add_object_library(clickhouse_access Access)
add_object_library(clickhouse_backups Backups)
add_object_library(clickhouse_core Core)
add_object_library(clickhouse_core_mysql Core/MySQL)
add_object_library(clickhouse_compression Compression)
if (NOT ENABLE_EXTERNAL_OPENSSL)
add_object_library(clickhouse_compression Compression)
else ()
add_headers_and_sources(dbms Compression)
list(REMOVE_ITEM dbms_headers Compression/CompressionCodecEncrypted.h)
list(REMOVE_ITEM dbms_sources Compression/CompressionCodecEncrypted.cpp)
endif ()
add_object_library(clickhouse_querypipeline QueryPipeline)
add_object_library(clickhouse_datatypes DataTypes)
add_object_library(clickhouse_datatypes_serializations DataTypes/Serializations)
@ -368,8 +368,6 @@ target_link_libraries(clickhouse_common_io PUBLIC ch_contrib::re2_st)
target_link_libraries(clickhouse_common_io PUBLIC ch_contrib::re2)
target_link_libraries(clickhouse_common_io
PRIVATE
${EXECINFO_LIBRARIES}
PUBLIC
boost::program_options
boost::system

View File

@ -6,6 +6,8 @@
#include <map>
#include <unordered_map>
#include "config.h"
#include <Common/DateLUT.h>
#include <Common/LocalDate.h>
#include <Common/MemoryTracker.h>
@ -17,14 +19,14 @@
#include <Common/getNumberOfPhysicalCPUCores.h>
#include <Common/tests/gtest_global_context.h>
#include <Common/typeid_cast.h>
#include <Common/config.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <Core/Block.h>
#include <Core/Protocol.h>
#include <Formats/FormatFactory.h>
#include <Common/config_version.h>
#include "config_version.h"
#include <Common/UTF8Helpers.h>
#include <Common/TerminalSize.h>
#include <Common/clearPasswordFromCommandLine.h>

View File

@ -251,6 +251,7 @@ protected:
QueryFuzzer fuzzer;
int query_fuzzer_runs = 0;
int create_query_fuzzer_runs = 0;
struct
{

View File

@ -1,7 +1,7 @@
#pragma once
#include <Core/Types.h>
#include <Common/config.h>
#include "config.h"
#if USE_REPLXX
# include <base/ReplxxLineReader.h>

View File

@ -33,8 +33,8 @@
#include <pcg_random.hpp>
#include <base/scope_guard.h>
#include <Common/config_version.h>
#include <Common/config.h>
#include "config_version.h"
#include "config.h"
#if USE_SSL
# include <Poco/Net/SecureStreamSocket.h>

View File

@ -4,7 +4,7 @@
#include <Poco/Net/StreamSocket.h>
#include <Common/config.h>
#include "config.h"
#include <Client/IServerConnection.h>
#include <Core/Defines.h>

View File

@ -1,4 +1,22 @@
#include "QueryFuzzer.h"
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeFixedString.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/IDataType.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteHelpers.h>
#include <Parsers/ASTColumnDeclaration.h>
#include <Parsers/ASTCreateQuery.h>
#include <Parsers/IAST_fwd.h>
#include <Parsers/ParserDataType.h>
#include <Parsers/ParserInsertQuery.h>
#include <Parsers/ASTDropQuery.h>
#include <unordered_set>
@ -430,6 +448,303 @@ void QueryFuzzer::fuzzWindowFrame(ASTWindowDefinition & def)
}
}
/// A CREATE query is fuzzed only when it has an explicit column list:
/// both the columns container and the list of column declarations must be set.
bool QueryFuzzer::isSuitableForFuzzing(const ASTCreateQuery & create)
{
    if (!create.columns_list)
        return false;

    return create.columns_list->columns != nullptr;
}
/// Mutates a CREATE query in place:
///  1. fuzzes the type of every declared column;
///  2. turns a Replicated* engine into its non-replicated counterpart;
///  3. renames the table to `<original>__fuzz_<N>` and remembers it
///     (only if its definition has not been seen before).
void QueryFuzzer::fuzzCreateQuery(ASTCreateQuery & create)
{
    const bool has_columns = create.columns_list && create.columns_list->columns;
    if (has_columns)
    {
        for (auto & child : create.columns_list->columns->children)
            if (auto * declaration = child->as<ASTColumnDeclaration>())
                fuzzColumnDeclaration(*declaration);
    }

    if (create.storage && create.storage->engine)
    {
        /// Replace ReplicatedMergeTree to ordinary MergeTree
        /// to avoid inconsistency of metadata in zookeeper.
        auto & engine = *create.storage->engine;
        if (startsWith(engine.name, "Replicated"))
        {
            engine.name = engine.name.substr(strlen("Replicated"));

            if (auto & engine_args = engine.arguments)
            {
                /// The first two arguments are dropped; if nothing else is left,
                /// the whole argument list goes away.
                if (engine_args->children.size() > 2)
                    engine_args->children.erase(engine_args->children.begin(), engine_args->children.begin() + 2);
                else
                    engine_args.reset();
            }
        }
    }

    /// Strip a previous "__fuzz_<N>" suffix (if any) and append a fresh one.
    const auto current_name = create.getTable();
    const auto base_name = current_name.substr(0, current_name.find("__fuzz_"));
    const size_t ordinal = index_of_fuzzed_table[base_name]++;
    const auto fuzzed_name = base_name + "__fuzz_" + toString(ordinal);
    create.setTable(fuzzed_name);

    /// Hash of (original name, columns, storage) identifies the table definition.
    SipHash definition_hasher;
    definition_hasher.update(base_name);
    if (create.columns_list)
        create.columns_list->updateTreeHash(definition_hasher);
    if (create.storage)
        create.storage->updateTreeHash(definition_hasher);

    IAST::Hash definition_hash;
    definition_hasher.get128(definition_hash);

    /// Save only tables with unique definition.
    const bool is_new_definition = created_tables_hashes.insert(definition_hash).second;
    if (is_new_definition)
        original_table_name_to_fuzzed[base_name].insert(fuzzed_name);
}
/// Replaces the declared type of a column with a fuzzed one.
/// Columns without an explicit type are left untouched.
void QueryFuzzer::fuzzColumnDeclaration(ASTColumnDeclaration & column)
{
    if (!column.type)
        return;

    const auto declared_type = DataTypeFactory::instance().get(column.type);
    const auto fuzzed_type = fuzzDataType(declared_type);

    /// Convert the fuzzed type back to an AST by parsing its textual name.
    ParserDataType type_parser;
    column.type = parseQuery(type_parser, fuzzed_type->getName(), DBMS_DEFAULT_MAX_QUERY_SIZE, DBMS_DEFAULT_MAX_PARSER_DEPTH);
}
/// Returns a randomly mutated variant of `type`.
/// Composite types (Array, Tuple, Map) usually keep their outer shape and have
/// their nested types fuzzed recursively; Nullable and LowCardinality wrappers
/// may be removed or re-applied; finally any type may be wrapped, replaced by a
/// random type, or returned unchanged.
/// The order of `fuzz_rand()` calls below determines the random sequence,
/// so statements must not be reordered.
DataTypePtr QueryFuzzer::fuzzDataType(DataTypePtr type)
{
/// Do not replace Array/Tuple/etc. with not Array/Tuple too often.
/// (In 3 cases out of 4 an Array stays an Array with a fuzzed element type.)
const auto * type_array = typeid_cast<const DataTypeArray *>(type.get());
if (type_array && fuzz_rand() % 4 != 0)
return std::make_shared<DataTypeArray>(fuzzDataType(type_array->getNestedType()));
/// Same for Tuple: fuzz every element, keep explicit element names if present.
const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type.get());
if (type_tuple && fuzz_rand() % 4 != 0)
{
DataTypes elements;
for (const auto & element : type_tuple->getElements())
elements.push_back(fuzzDataType(element));
return type_tuple->haveExplicitNames()
? std::make_shared<DataTypeTuple>(elements, type_tuple->getElementNames())
: std::make_shared<DataTypeTuple>(elements);
}
/// Same for Map: fuzz key and value types, but fall back to the original
/// key type when the fuzzed one is rejected by DataTypeMap::checkKeyType.
const auto * type_map = typeid_cast<const DataTypeMap *>(type.get());
if (type_map && fuzz_rand() % 4 != 0)
{
auto key_type = fuzzDataType(type_map->getKeyType());
auto value_type = fuzzDataType(type_map->getValueType());
if (!DataTypeMap::checkKeyType(key_type))
key_type = type_map->getKeyType();
return std::make_shared<DataTypeMap>(key_type, value_type);
}
/// Nullable: with tmp == 0 drop the wrapper, with tmp == 1 keep it around a fuzzed
/// nested type (if that type may be Nullable); otherwise fall through to the
/// generic mutations at the end.
const auto * type_nullable = typeid_cast<const DataTypeNullable *>(type.get());
if (type_nullable)
{
size_t tmp = fuzz_rand() % 3;
if (tmp == 0)
return fuzzDataType(type_nullable->getNestedType());
if (tmp == 1)
{
auto nested_type = fuzzDataType(type_nullable->getNestedType());
if (nested_type->canBeInsideNullable())
return std::make_shared<DataTypeNullable>(nested_type);
}
}
/// LowCardinality: handled the same way as Nullable above.
const auto * type_low_cardinality = typeid_cast<const DataTypeLowCardinality *>(type.get());
if (type_low_cardinality)
{
size_t tmp = fuzz_rand() % 3;
if (tmp == 0)
return fuzzDataType(type_low_cardinality->getDictionaryType());
if (tmp == 1)
{
auto nested_type = fuzzDataType(type_low_cardinality->getDictionaryType());
if (nested_type->canBeInsideLowCardinality())
return std::make_shared<DataTypeLowCardinality>(nested_type);
}
}
/// Generic mutations, tmp is in [0, 7]:
///   0      -> wrap into Array;
///   1      -> wrap into Nullable if allowed;
///   1..2   -> otherwise wrap into LowCardinality if allowed;
///   1..3   -> otherwise replace with a completely random type;
///   4..7   -> keep the type as is.
size_t tmp = fuzz_rand() % 8;
if (tmp == 0)
return std::make_shared<DataTypeArray>(type);
if (tmp <= 1 && type->canBeInsideNullable())
return std::make_shared<DataTypeNullable>(type);
if (tmp <= 2 && type->canBeInsideLowCardinality())
return std::make_shared<DataTypeLowCardinality>(type);
if (tmp <= 3)
return getRandomType();
return type;
}
/// Generates a random data type.
/// A random TypeIndex with a numeric value in [1, TypeIndex::Tuple] is picked;
/// Tuple and Array are built recursively from random nested types, Decimals get
/// the maximum precision and a random scale, FixedString gets a random length in
/// [1, 20], Enum8/Enum16 are replaced by UInt8/UInt16, and every other type is
/// created by its enum name through DataTypeFactory.
DataTypePtr QueryFuzzer::getRandomType()
{
    auto type_id = static_cast<TypeIndex>(fuzz_rand() % static_cast<size_t>(TypeIndex::Tuple) + 1);

    if (type_id == TypeIndex::Tuple)
    {
        size_t tuple_size = fuzz_rand() % 6 + 1;
        DataTypes elements;
        for (size_t i = 0; i < tuple_size; ++i)
            elements.push_back(getRandomType());
        return std::make_shared<DataTypeTuple>(elements);
    }

    if (type_id == TypeIndex::Array)
        return std::make_shared<DataTypeArray>(getRandomType());

/// NOLINTBEGIN(bugprone-macro-parentheses)
#define DISPATCH(DECIMAL) \
    if (type_id == TypeIndex::DECIMAL) \
        return std::make_shared<DataTypeDecimal<DECIMAL>>( \
            DataTypeDecimal<DECIMAL>::maxPrecision(), \
            (fuzz_rand() % DataTypeDecimal<DECIMAL>::maxPrecision()) + 1);

    DISPATCH(Decimal32)
    DISPATCH(Decimal64)
    DISPATCH(Decimal128)
    DISPATCH(Decimal256)
#undef DISPATCH
/// NOLINTEND(bugprone-macro-parentheses)

    /// FixedString(0) is not a valid type, so the length is shifted to [1, 20]
    /// instead of [0, 19].
    if (type_id == TypeIndex::FixedString)
        return std::make_shared<DataTypeFixedString>(fuzz_rand() % 20 + 1);

    /// Enum types need explicit values; use the integer type of the same width instead.
    if (type_id == TypeIndex::Enum8)
        return std::make_shared<DataTypeUInt8>();

    if (type_id == TypeIndex::Enum16)
        return std::make_shared<DataTypeUInt16>();

    return DataTypeFactory::instance().get(String(magic_enum::enum_name(type_id)));
}
/// Replaces a table referenced in a table expression with one of its fuzzed
/// copies (tables named `<original>__fuzz_<N>` remembered by fuzzCreateQuery).
/// The reference is left as is when it is not a plain table identifier, when no
/// fuzzed copy is known, and in one random case out of three.
void QueryFuzzer::fuzzTableName(ASTTableExpression & table)
{
    if (!table.database_and_table_name || fuzz_rand() % 3 == 0)
        return;

    const auto * identifier = table.database_and_table_name->as<ASTTableIdentifier>();
    if (!identifier)
        return;

    auto table_id = identifier->getTableId();
    if (table_id.empty())
        return;

    auto found = original_table_name_to_fuzzed.find(table_id.getTableName());
    if (found == original_table_name_to_fuzzed.end() || found->second.empty())
        return;

    /// Pick a uniformly random fuzzed copy, keeping the original database name.
    const auto & candidates = found->second;
    auto chosen = candidates.begin();
    std::advance(chosen, fuzz_rand() % candidates.size());

    StorageID new_table_id(table_id.database_name, *chosen);
    table.database_and_table_name = std::make_shared<ASTTableIdentifier>(new_table_id);
}
/// Parses `full_query` as an INSERT query.
/// Returns the value produced by tryParseQuery; callers in this file treat a
/// null result as "not an INSERT query". The error message is discarded.
static ASTPtr tryParseInsertQuery(const String & full_query)
{
    const char * const query_end = full_query.data() + full_query.size();
    const char * cursor = full_query.data();

    String ignored_error_message;
    ParserInsertQuery insert_parser(query_end, false);

    return tryParseQuery(
        insert_parser,
        cursor,
        query_end,
        ignored_error_message,
        false,
        "",
        false,
        DBMS_DEFAULT_MAX_QUERY_SIZE,
        DBMS_DEFAULT_MAX_PARSER_DEPTH);
}
/// For an INSERT into a table that has fuzzed copies, builds one INSERT query
/// per fuzzed copy. Returns an empty list if the text is not an INSERT query,
/// has no explicit table, or the table has no registered fuzzed copies.
ASTs QueryFuzzer::getInsertQueriesForFuzzedTables(const String & full_query)
{
    auto probe = tryParseInsertQuery(full_query);
    if (!probe)
        return {};

    const auto & insert = *probe->as<ASTInsertQuery>();
    if (!insert.table)
        return {};

    auto found = original_table_name_to_fuzzed.find(insert.getTable());
    if (found == original_table_name_to_fuzzed.end())
        return {};

    ASTs result;
    for (const auto & fuzzed_name : found->second)
    {
        /// Parse query from scratch for each table instead of clone,
        /// to store proper pointers to inlined data,
        /// which are not copied during clone.
        ASTPtr reparsed = tryParseInsertQuery(full_query);
        reparsed->as<ASTInsertQuery>()->setTable(fuzzed_name);
        result.push_back(std::move(reparsed));
    }

    return result;
}
/// For a DROP of a table that has fuzzed copies, produces a DROP ... IF EXISTS
/// query for every copy named `<table>__fuzz_<i>` with `i` below the counter
/// stored in `index_of_fuzzed_table`, then forgets the table in both
/// bookkeeping maps. Returns an empty list for any other kind of query or for
/// tables without a counter.
ASTs QueryFuzzer::getDropQueriesForFuzzedTables(const ASTDropQuery & drop_query)
{
    if (drop_query.kind != ASTDropQuery::Drop)
        return {};

    auto original_name = drop_query.getTable();
    auto counter = index_of_fuzzed_table.find(original_name);
    if (counter == index_of_fuzzed_table.end())
        return {};

    /// Every index that was ever handed out is dropped, not only the names
    /// still present in original_table_name_to_fuzzed.
    const size_t tables_created = counter->second;

    ASTs result;
    for (size_t index = 0; index != tables_created; ++index)
    {
        auto fuzzed_drop = drop_query.clone();
        auto * drop = fuzzed_drop->as<ASTDropQuery>();

        drop->setTable(original_name + "__fuzz_" + toString(index));
        /// IF EXISTS guards against exceptions for copies that are already gone.
        drop->if_exists = true;

        result.push_back(fuzzed_drop);
    }

    index_of_fuzzed_table.erase(counter);
    original_table_name_to_fuzzed.erase(original_name);

    return result;
}
/// Tells the fuzzer that executing `ast` failed. If the query is a CREATE or an
/// INSERT whose table name contains the "__fuzz_" marker, that name is removed
/// from the set of fuzzed copies kept for the original table (the part of the
/// name before the marker). Other queries and a null `ast` are ignored.
void QueryFuzzer::notifyQueryFailed(ASTPtr ast)
{
    if (!ast)
        return;

    auto remove_fuzzed_table = [this](const auto & table_name)
    {
        const auto pos = table_name.find("__fuzz_");
        if (pos == std::string::npos)
            return;

        /// Look the original table up with find() rather than operator[]:
        /// operator[] would insert an empty set for a name that was never
        /// tracked, so a failure notification would grow the map instead of
        /// only removing from it.
        auto it = original_table_name_to_fuzzed.find(table_name.substr(0, pos));
        if (it != original_table_name_to_fuzzed.end())
            it->second.erase(table_name);
    };

    if (const auto * create = ast->as<ASTCreateQuery>())
        remove_fuzzed_table(create->getTable());

    if (const auto * insert = ast->as<ASTInsertQuery>())
        remove_fuzzed_table(insert->getTable());
}
void QueryFuzzer::fuzz(ASTs & asts)
{
for (auto & ast : asts)
@ -497,6 +812,7 @@ void QueryFuzzer::fuzz(ASTPtr & ast)
}
else if (auto * table_expr = typeid_cast<ASTTableExpression *>(ast.get()))
{
fuzzTableName(*table_expr);
fuzz(table_expr->children);
}
else if (auto * expr_list = typeid_cast<ASTExpressionList *>(ast.get()))
@ -563,6 +879,10 @@ void QueryFuzzer::fuzz(ASTPtr & ast)
literal->value = fuzzField(literal->value);
}
}
else if (auto * create_query = typeid_cast<ASTCreateQuery *>(ast.get()))
{
fuzzCreateQuery(*create_query);
}
else
{
fuzz(ast->children);

View File

@ -1,5 +1,6 @@
#pragma once
#include <DataTypes/IDataType.h>
#include <unordered_set>
#include <unordered_map>
#include <vector>
@ -16,6 +17,11 @@ namespace DB
class ASTExpressionList;
class ASTOrderByElement;
class ASTCreateQuery;
class ASTInsertQuery;
class ASTColumnDeclaration;
class ASTDropQuery;
struct ASTTableExpression;
struct ASTWindowDefinition;
/*
@ -54,6 +60,9 @@ struct QueryFuzzer
std::unordered_set<const IAST *> debug_visited_nodes;
ASTPtr * debug_top_ast = nullptr;
std::unordered_map<std::string, std::unordered_set<std::string>> original_table_name_to_fuzzed;
std::unordered_map<std::string, size_t> index_of_fuzzed_table;
std::set<IAST::Hash> created_tables_hashes;
// This is the only function you have to call -- it will modify the passed
// ASTPtr to point to new AST with some random changes.
@ -63,18 +72,28 @@ struct QueryFuzzer
Field getRandomField(int type);
Field fuzzField(Field field);
ASTPtr getRandomColumnLike();
DataTypePtr fuzzDataType(DataTypePtr type);
DataTypePtr getRandomType();
ASTs getInsertQueriesForFuzzedTables(const String & full_query);
ASTs getDropQueriesForFuzzedTables(const ASTDropQuery & drop_query);
void notifyQueryFailed(ASTPtr ast);
void replaceWithColumnLike(ASTPtr & ast);
void replaceWithTableLike(ASTPtr & ast);
void fuzzOrderByElement(ASTOrderByElement * elem);
void fuzzOrderByList(IAST * ast);
void fuzzColumnLikeExpressionList(IAST * ast);
void fuzzWindowFrame(ASTWindowDefinition & def);
void fuzzCreateQuery(ASTCreateQuery & create);
void fuzzColumnDeclaration(ASTColumnDeclaration & column);
void fuzzTableName(ASTTableExpression & table);
void fuzz(ASTs & asts);
void fuzz(ASTPtr & ast);
void collectFuzzInfoMain(ASTPtr ast);
void addTableLike(ASTPtr ast);
void addColumnLike(ASTPtr ast);
void collectFuzzInfoRecurse(ASTPtr ast);
static bool isSuitableForFuzzing(const ASTCreateQuery & create);
};
}

Some files were not shown because too many files have changed in this diff Show More