Merge branch 'master' into contrib_sparse_checkout

Alexander Tokmakov 2023-04-07 23:44:38 +02:00
commit 3c509214e4
545 changed files with 20419 additions and 5279 deletions


@ -110,6 +110,7 @@ Checks: '*,
-misc-const-correctness,
-misc-no-recursion,
-misc-non-private-member-variables-in-classes,
-misc-confusable-identifiers, # useful but slooow
-modernize-avoid-c-arrays,
-modernize-concat-nested-namespaces,
@ -148,19 +149,6 @@ Checks: '*,
-readability-use-anyofallof,
-zirkon-*,
-misc-*, # temporarily disabled due to being too slow
# also disable checks in other categories which are aliases of checks in misc-*:
# https://releases.llvm.org/15.0.0/tools/clang/tools/extra/docs/clang-tidy/checks/list.html
-cert-dcl54-cpp, # alias of misc-new-delete-overloads
-hicpp-new-delete-operators, # alias of misc-new-delete-overloads
-cert-fio38-c, # alias of misc-non-copyable-objects
-cert-dcl03-c, # alias of misc-static-assert
-hicpp-static-assert, # alias of misc-static-assert
-cert-err09-cpp, # alias of misc-throw-by-value-catch-by-reference
-cert-err61-cpp, # alias of misc-throw-by-value-catch-by-reference
-cppcoreguidelines-c-copy-assignment-signature, # alias of misc-unconventional-assign-operator
-cppcoreguidelines-non-private-member-variables-in-classes, # alias of misc-non-private-member-variables-in-classes
'
WarningsAsErrors: '*'


@ -1,4 +1,4 @@
[![ClickHouse — open source distributed column-oriented DBMS](https://github.com/ClickHouse/clickhouse-presentations/raw/master/images/logo-400x240.png)](https://clickhouse.com)
[<img alt="ClickHouse — open source distributed column-oriented DBMS" width="400px" src="https://clickhouse.com/images/ch_gh_logo_rounded.png" />](https://clickhouse.com?utm_source=github)
ClickHouse® is an open-source column-oriented database management system that allows generating analytical data reports in real-time.


@ -36,7 +36,7 @@
namespace detail
{
template <char ...chars> constexpr bool is_in(char x) { return ((x == chars) || ...); }
template <char ...chars> constexpr bool is_in(char x) { return ((x == chars) || ...); } // NOLINT(misc-redundant-expression)
#if defined(__SSE2__)
template <char s0>


@ -35,7 +35,7 @@ public:
Self & operator=(T && rhs) { t = std::move(rhs); return *this;}
// NOLINTBEGIN(google-explicit-constructor)
operator const T & () const { return t; }
constexpr operator const T & () const { return t; }
operator T & () { return t; }
// NOLINTEND(google-explicit-constructor)


@ -0,0 +1,81 @@
/* origin: FreeBSD /usr/src/lib/msun/src/e_expf.c */
/*
* Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
*/
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
#include "libm.h"
static const float
half[2] = {0.5,-0.5},
ln2hi = 6.9314575195e-1f, /* 0x3f317200 */
ln2lo = 1.4286067653e-6f, /* 0x35bfbe8e */
invln2 = 1.4426950216e+0f, /* 0x3fb8aa3b */
/*
* Domain [-0.34568, 0.34568], range ~[-4.278e-9, 4.447e-9]:
* |x*(exp(x)+1)/(exp(x)-1) - p(x)| < 2**-27.74
*/
P1 = 1.6666625440e-1f, /* 0xaaaa8f.0p-26 */
P2 = -2.7667332906e-3f; /* -0xb55215.0p-32 */
float expf(float x)
{
float_t hi, lo, c, xx, y;
int k, sign;
uint32_t hx;
GET_FLOAT_WORD(hx, x);
sign = hx >> 31; /* sign bit of x */
hx &= 0x7fffffff; /* high word of |x| */
/* special cases */
if (hx >= 0x42aeac50) { /* if |x| >= -87.33655f or NaN */
if (hx >= 0x42b17218 && !sign) { /* x >= 88.722839f */
/* overflow */
x *= 0x1p127f;
return x;
}
if (sign) {
/* underflow */
FORCE_EVAL(-0x1p-149f/x);
if (hx >= 0x42cff1b5) /* x <= -103.972084f */
return 0;
}
}
/* argument reduction */
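/* The reduction writes x = k*ln2 + r with |r| <= 0.5*ln2, so expf(x) = 2^k * exp(r);
   hi and lo below represent r = hi - lo in extended precision. */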
if (hx > 0x3eb17218) { /* if |x| > 0.5 ln2 */
if (hx > 0x3f851592) /* if |x| > 1.5 ln2 */
k = invln2*x + half[sign];
else
k = 1 - sign - sign;
hi = x - k*ln2hi; /* k*ln2hi is exact here */
lo = k*ln2lo;
x = hi - lo;
} else if (hx > 0x39000000) { /* |x| > 2**-14 */
k = 0;
hi = x;
lo = 0;
} else {
/* raise inexact */
FORCE_EVAL(0x1p127f + x);
return 1 + x;
}
/* x is now in primary range */
xx = x*x;
c = x - xx*(P1+xx*P2);
y = 1 + (x*c/(2-c) - lo + hi);
if (k == 0)
return y;
return scalbnf(y, k);
}


@ -0,0 +1,31 @@
#include <math.h>
#include <stdint.h>
float scalbnf(float x, int n)
{
union {float f; uint32_t i;} u;
float_t y = x;
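/* Clamp n in steps of 127 (or -126), pre-scaling y each time, so that the
   exponent bits (0x7f+n)<<23 constructed below always form a valid normal float. */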
if (n > 127) {
y *= 0x1p127f;
n -= 127;
if (n > 127) {
y *= 0x1p127f;
n -= 127;
if (n > 127)
n = 127;
}
} else if (n < -126) {
y *= 0x1p-126f;
n += 126;
if (n < -126) {
y *= 0x1p-126f;
n += 126;
if (n < -126)
n = -126;
}
}
u.i = (uint32_t)(0x7f+n)<<23;
x = y * u.f;
return x;
}

contrib/arrow vendored

@ -1 +1 @@
Subproject commit d03245f801f798c63ee9a7d2b8914a9e5c5cd666
Subproject commit 1f1b3d35fb6eb73e6492d3afd8a85cde848d174f


@ -202,6 +202,7 @@ set(ARROW_SRCS
"${LIBRARY_DIR}/builder.cc"
"${LIBRARY_DIR}/buffer.cc"
"${LIBRARY_DIR}/chunked_array.cc"
"${LIBRARY_DIR}/chunk_resolver.cc"
"${LIBRARY_DIR}/compare.cc"
"${LIBRARY_DIR}/config.cc"
"${LIBRARY_DIR}/datum.cc"
@ -268,6 +269,10 @@ set(ARROW_SRCS
"${LIBRARY_DIR}/util/uri.cc"
"${LIBRARY_DIR}/util/utf8.cc"
"${LIBRARY_DIR}/util/value_parsing.cc"
"${LIBRARY_DIR}/util/byte_size.cc"
"${LIBRARY_DIR}/util/debug.cc"
"${LIBRARY_DIR}/util/tracing.cc"
"${LIBRARY_DIR}/util/atfork_internal.cc"
"${LIBRARY_DIR}/vendored/base64.cpp"
"${LIBRARY_DIR}/vendored/datetime/tz.cpp"
@ -301,9 +306,11 @@ set(ARROW_SRCS
"${LIBRARY_DIR}/compute/exec/source_node.cc"
"${LIBRARY_DIR}/compute/exec/sink_node.cc"
"${LIBRARY_DIR}/compute/exec/order_by_impl.cc"
"${LIBRARY_DIR}/compute/exec/partition_util.cc"
"${LIBRARY_DIR}/compute/function.cc"
"${LIBRARY_DIR}/compute/function_internal.cc"
"${LIBRARY_DIR}/compute/kernel.cc"
"${LIBRARY_DIR}/compute/light_array.cc"
"${LIBRARY_DIR}/compute/registry.cc"
"${LIBRARY_DIR}/compute/kernels/aggregate_basic.cc"
"${LIBRARY_DIR}/compute/kernels/aggregate_mode.cc"
@ -317,21 +324,28 @@ set(ARROW_SRCS
"${LIBRARY_DIR}/compute/kernels/scalar_cast_boolean.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_dictionary.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_internal.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_extension.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_nested.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_numeric.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_string.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_cast_temporal.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_compare.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_nested.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_random.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_round.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_set_lookup.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_string.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_temporal_binary.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_temporal_unary.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_validity.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_if_else.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_string_ascii.cc"
"${LIBRARY_DIR}/compute/kernels/scalar_string_utf8.cc"
"${LIBRARY_DIR}/compute/kernels/util_internal.cc"
"${LIBRARY_DIR}/compute/kernels/vector_array_sort.cc"
"${LIBRARY_DIR}/compute/kernels/vector_cumulative_ops.cc"
"${LIBRARY_DIR}/compute/kernels/vector_hash.cc"
"${LIBRARY_DIR}/compute/kernels/vector_rank.cc"
"${LIBRARY_DIR}/compute/kernels/vector_select_k.cc"
"${LIBRARY_DIR}/compute/kernels/vector_nested.cc"
"${LIBRARY_DIR}/compute/kernels/vector_replace.cc"
"${LIBRARY_DIR}/compute/kernels/vector_selection.cc"
@ -340,13 +354,15 @@ set(ARROW_SRCS
"${LIBRARY_DIR}/compute/exec/union_node.cc"
"${LIBRARY_DIR}/compute/exec/key_hash.cc"
"${LIBRARY_DIR}/compute/exec/key_map.cc"
"${LIBRARY_DIR}/compute/exec/key_compare.cc"
"${LIBRARY_DIR}/compute/exec/key_encode.cc"
"${LIBRARY_DIR}/compute/exec/util.cc"
"${LIBRARY_DIR}/compute/exec/hash_join_dict.cc"
"${LIBRARY_DIR}/compute/exec/hash_join.cc"
"${LIBRARY_DIR}/compute/exec/hash_join_node.cc"
"${LIBRARY_DIR}/compute/exec/task_util.cc"
"${LIBRARY_DIR}/compute/row/encode_internal.cc"
"${LIBRARY_DIR}/compute/row/grouper.cc"
"${LIBRARY_DIR}/compute/row/compare_internal.cc"
"${LIBRARY_DIR}/compute/row/row_internal.cc"
"${LIBRARY_DIR}/ipc/dictionary.cc"
"${LIBRARY_DIR}/ipc/feather.cc"
@ -357,7 +373,8 @@ set(ARROW_SRCS
"${LIBRARY_DIR}/ipc/writer.cc"
"${ARROW_SRC_DIR}/arrow/adapters/orc/adapter.cc"
"${ARROW_SRC_DIR}/arrow/adapters/orc/adapter_util.cc"
"${ARROW_SRC_DIR}/arrow/adapters/orc/util.cc"
"${ARROW_SRC_DIR}/arrow/adapters/orc/options.cc"
)
add_definitions(-DARROW_WITH_LZ4)


@ -15,10 +15,6 @@ if(NOT AWK_PROGRAM)
message(FATAL_ERROR "You need the awk program to build ClickHouse with krb5 enabled.")
endif()
if (NOT (ENABLE_OPENSSL OR ENABLE_OPENSSL_DYNAMIC))
add_compile_definitions(USE_BORINGSSL=1)
endif ()
set(KRB5_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/krb5/src")
set(KRB5_ET_BIN_DIR "${CMAKE_CURRENT_BINARY_DIR}/include_private")
@ -162,6 +158,11 @@ set(ALL_SRCS
"${KRB5_SOURCE_DIR}/lib/crypto/builtin/kdf.c"
"${KRB5_SOURCE_DIR}/lib/crypto/builtin/cmac.c"
"${KRB5_SOURCE_DIR}/lib/crypto/builtin/des/des_keys.c"
"${KRB5_SOURCE_DIR}/lib/crypto/builtin/des/f_parity.c"
"${KRB5_SOURCE_DIR}/lib/crypto/builtin/enc_provider/rc4.c"
"${KRB5_SOURCE_DIR}/lib/crypto/builtin/hash_provider/hash_md4.c"
"${KRB5_SOURCE_DIR}/lib/crypto/builtin/md4/md4.c"
"${KRB5_SOURCE_DIR}/lib/crypto/krb/prng.c"
"${KRB5_SOURCE_DIR}/lib/crypto/krb/enc_dk_cmac.c"
# "${KRB5_SOURCE_DIR}/lib/crypto/krb/crc32.c"
@ -226,7 +227,6 @@ set(ALL_SRCS
# "${KRB5_SOURCE_DIR}/lib/crypto/openssl/enc_provider/des.c"
"${KRB5_SOURCE_DIR}/lib/crypto/openssl/enc_provider/rc4.c"
"${KRB5_SOURCE_DIR}/lib/crypto/openssl/enc_provider/des3.c"
#"${KRB5_SOURCE_DIR}/lib/crypto/openssl/enc_provider/camellia.c"
"${KRB5_SOURCE_DIR}/lib/crypto/openssl/cmac.c"
"${KRB5_SOURCE_DIR}/lib/crypto/openssl/sha256.c"
"${KRB5_SOURCE_DIR}/lib/crypto/openssl/hmac.c"
@ -474,6 +474,14 @@ set(ALL_SRCS
"${KRB5_SOURCE_DIR}/lib/krb5/krb5_libinit.c"
)
if (NOT (ENABLE_OPENSSL OR ENABLE_OPENSSL_DYNAMIC))
add_compile_definitions(USE_BORINGSSL=1)
else()
set(ALL_SRCS ${ALL_SRCS}
"${KRB5_SOURCE_DIR}/lib/crypto/openssl/enc_provider/camellia.c"
)
endif()
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/compile_et"
COMMAND /bin/sh
@ -673,6 +681,7 @@ target_include_directories(_krb5 PRIVATE
"${KRB5_SOURCE_DIR}/lib/gssapi/krb5"
"${KRB5_SOURCE_DIR}/lib/gssapi/spnego"
"${KRB5_SOURCE_DIR}/util/et"
"${KRB5_SOURCE_DIR}/lib/crypto/builtin/md4"
"${KRB5_SOURCE_DIR}/lib/crypto/openssl"
"${KRB5_SOURCE_DIR}/lib/crypto/krb"
"${KRB5_SOURCE_DIR}/util/profile"


@ -0,0 +1,26 @@
---
sidebar_position: 1
sidebar_label: 2023
---
# 2023 Changelog
### ClickHouse release v22.12.6.22-stable (10d87f90261) FIXME as compared to v22.12.5.34-stable (b82d6401ca1)
#### Bug Fix (user-visible misbehavior in an official stable release)
* Fix changing an expired role [#46772](https://github.com/ClickHouse/ClickHouse/pull/46772) ([Vitaly Baranov](https://github.com/vitlibar)).
* Fix bug in zero-copy replication disk choice during fetch [#47010](https://github.com/ClickHouse/ClickHouse/pull/47010) ([alesapin](https://github.com/alesapin)).
* Fix NOT_IMPLEMENTED error with CROSS JOIN and algorithm = auto [#47068](https://github.com/ClickHouse/ClickHouse/pull/47068) ([Vladimir C](https://github.com/vdimir)).
* Fix query parameters [#47488](https://github.com/ClickHouse/ClickHouse/pull/47488) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Hotfix for too verbose warnings in HTTP [#47903](https://github.com/ClickHouse/ClickHouse/pull/47903) ([Alexander Tokmakov](https://github.com/tavplubix)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Better error messages in ReplicatedMergeTreeAttachThread [#47454](https://github.com/ClickHouse/ClickHouse/pull/47454) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Add a fuse for backport branches w/o a created PR [#47760](https://github.com/ClickHouse/ClickHouse/pull/47760) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Only valid Reviews.STATES overwrite existing reviews [#47789](https://github.com/ClickHouse/ClickHouse/pull/47789) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Place short return before big block, improve logging [#47822](https://github.com/ClickHouse/ClickHouse/pull/47822) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Artifacts s3 prefix [#47945](https://github.com/ClickHouse/ClickHouse/pull/47945) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Fix tsan error lock-order-inversion [#47953](https://github.com/ClickHouse/ClickHouse/pull/47953) ([Kruglov Pavel](https://github.com/Avogar)).


@ -0,0 +1,29 @@
---
sidebar_position: 1
sidebar_label: 2023
---
# 2023 Changelog
### ClickHouse release v22.3.20.29-lts (297b4dd5e55) FIXME as compared to v22.3.19.6-lts (467e0a7bd77)
#### Improvement
* Backported in [#46979](https://github.com/ClickHouse/ClickHouse/issues/46979): Apply `ALTER TABLE table_name ON CLUSTER cluster MOVE PARTITION|PART partition_expr TO DISK|VOLUME 'disk_name'` to all replicas. Because `ALTER TABLE t MOVE` is not replicated. [#46402](https://github.com/ClickHouse/ClickHouse/pull/46402) ([lizhuoyu5](https://github.com/lzydmxy)).
#### Bug Fix (user-visible misbehavior in an official stable release)
* Fix incorrect alias recursion in QueryNormalizer [#46609](https://github.com/ClickHouse/ClickHouse/pull/46609) ([Raúl Marín](https://github.com/Algunenano)).
* Fix arithmetic operations in aggregate optimization [#46705](https://github.com/ClickHouse/ClickHouse/pull/46705) ([Duc Canh Le](https://github.com/canhld94)).
* Fix MSan report in `maxIntersections` function [#46847](https://github.com/ClickHouse/ClickHouse/pull/46847) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix wrong results of some LIKE searches when the LIKE pattern contains quoted non-quotable characters [#46875](https://github.com/ClickHouse/ClickHouse/pull/46875) ([Robert Schulze](https://github.com/rschu1ze)).
* Fix possible deadlock in QueryStatus [#47161](https://github.com/ClickHouse/ClickHouse/pull/47161) ([Kruglov Pavel](https://github.com/Avogar)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Update typing for a new PyGithub version [#47123](https://github.com/ClickHouse/ClickHouse/pull/47123) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Add a manual trigger for release workflow [#47302](https://github.com/ClickHouse/ClickHouse/pull/47302) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Add a fuse for backport branches w/o a created PR [#47760](https://github.com/ClickHouse/ClickHouse/pull/47760) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Only valid Reviews.STATES overwrite existing reviews [#47789](https://github.com/ClickHouse/ClickHouse/pull/47789) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Place short return before big block, improve logging [#47822](https://github.com/ClickHouse/ClickHouse/pull/47822) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Fix tsan error lock-order-inversion [#47953](https://github.com/ClickHouse/ClickHouse/pull/47953) ([Kruglov Pavel](https://github.com/Avogar)).


@ -0,0 +1,29 @@
---
sidebar_position: 1
sidebar_label: 2023
---
# 2023 Changelog
### ClickHouse release v22.8.16.32-lts (7c4be737bd0) FIXME as compared to v22.8.15.23-lts (d36fa168bbf)
#### Build/Testing/Packaging Improvement
* Backported in [#48344](https://github.com/ClickHouse/ClickHouse/issues/48344): Use sccache as a replacement for ccache and using S3 as cache backend. [#46240](https://github.com/ClickHouse/ClickHouse/pull/46240) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Backported in [#48250](https://github.com/ClickHouse/ClickHouse/issues/48250): The `clickhouse/clickhouse-keeper` image used to be pushed only with tags `-alpine`, e.g. `latest-alpine`. As it was suggested in https://github.com/ClickHouse/examples/pull/2, now it will be pushed as suffixless too. [#48236](https://github.com/ClickHouse/ClickHouse/pull/48236) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
#### Bug Fix (user-visible misbehavior in an official stable release)
* Fix bug in zero-copy replication disk choice during fetch [#47010](https://github.com/ClickHouse/ClickHouse/pull/47010) ([alesapin](https://github.com/alesapin)).
* Fix query parameters [#47488](https://github.com/ClickHouse/ClickHouse/pull/47488) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix wait for zero copy lock during move [#47631](https://github.com/ClickHouse/ClickHouse/pull/47631) ([alesapin](https://github.com/alesapin)).
* Fix crash in polygonsSymDifferenceCartesian [#47702](https://github.com/ClickHouse/ClickHouse/pull/47702) ([pufit](https://github.com/pufit)).
* Backport to 22.8: Fix moving broken parts to the detached for the object storage disk on startup [#48273](https://github.com/ClickHouse/ClickHouse/pull/48273) ([Aleksei Filatov](https://github.com/aalexfvk)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Add a fuse for backport branches w/o a created PR [#47760](https://github.com/ClickHouse/ClickHouse/pull/47760) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Only valid Reviews.STATES overwrite existing reviews [#47789](https://github.com/ClickHouse/ClickHouse/pull/47789) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Place short return before big block, improve logging [#47822](https://github.com/ClickHouse/ClickHouse/pull/47822) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Artifacts s3 prefix [#47945](https://github.com/ClickHouse/ClickHouse/pull/47945) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Fix tsan error lock-order-inversion [#47953](https://github.com/ClickHouse/ClickHouse/pull/47953) ([Kruglov Pavel](https://github.com/Avogar)).


@ -0,0 +1,34 @@
---
sidebar_position: 1
sidebar_label: 2023
---
# 2023 Changelog
### ClickHouse release v23.1.6.42-stable (783ddf67991) FIXME as compared to v23.1.5.24-stable (0e51b53ba99)
#### Build/Testing/Packaging Improvement
* Backported in [#48215](https://github.com/ClickHouse/ClickHouse/issues/48215): Use sccache as a replacement for ccache and using S3 as cache backend. [#46240](https://github.com/ClickHouse/ClickHouse/pull/46240) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Backported in [#48254](https://github.com/ClickHouse/ClickHouse/issues/48254): The `clickhouse/clickhouse-keeper` image used to be pushed only with tags `-alpine`, e.g. `latest-alpine`. As it was suggested in https://github.com/ClickHouse/examples/pull/2, now it will be pushed as suffixless too. [#48236](https://github.com/ClickHouse/ClickHouse/pull/48236) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
#### Bug Fix (user-visible misbehavior in an official stable release)
* Fix changing an expired role [#46772](https://github.com/ClickHouse/ClickHouse/pull/46772) ([Vitaly Baranov](https://github.com/vitlibar)).
* Fix bug in zero-copy replication disk choice during fetch [#47010](https://github.com/ClickHouse/ClickHouse/pull/47010) ([alesapin](https://github.com/alesapin)).
* Fix NOT_IMPLEMENTED error with CROSS JOIN and algorithm = auto [#47068](https://github.com/ClickHouse/ClickHouse/pull/47068) ([Vladimir C](https://github.com/vdimir)).
* Disable logical expression optimizer for expression with aliases. [#47451](https://github.com/ClickHouse/ClickHouse/pull/47451) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Fix query parameters [#47488](https://github.com/ClickHouse/ClickHouse/pull/47488) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Parameterized view bug fix 47287 47247 [#47495](https://github.com/ClickHouse/ClickHouse/pull/47495) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
* Fix wait for zero copy lock during move [#47631](https://github.com/ClickHouse/ClickHouse/pull/47631) ([alesapin](https://github.com/alesapin)).
* Hotfix for too verbose warnings in HTTP [#47903](https://github.com/ClickHouse/ClickHouse/pull/47903) ([Alexander Tokmakov](https://github.com/tavplubix)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Better error messages in ReplicatedMergeTreeAttachThread [#47454](https://github.com/ClickHouse/ClickHouse/pull/47454) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix `00933_test_fix_extra_seek_on_compressed_cache` in releases. [#47490](https://github.com/ClickHouse/ClickHouse/pull/47490) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Add a fuse for backport branches w/o a created PR [#47760](https://github.com/ClickHouse/ClickHouse/pull/47760) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Only valid Reviews.STATES overwrite existing reviews [#47789](https://github.com/ClickHouse/ClickHouse/pull/47789) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Place short return before big block, improve logging [#47822](https://github.com/ClickHouse/ClickHouse/pull/47822) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Artifacts s3 prefix [#47945](https://github.com/ClickHouse/ClickHouse/pull/47945) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Fix tsan error lock-order-inversion [#47953](https://github.com/ClickHouse/ClickHouse/pull/47953) ([Kruglov Pavel](https://github.com/Avogar)).


@ -0,0 +1,40 @@
---
sidebar_position: 1
sidebar_label: 2023
---
# 2023 Changelog
### ClickHouse release v23.2.5.46-stable (b50faecbb12) FIXME as compared to v23.2.4.12-stable (8fe866cb035)
#### Improvement
* Backported in [#48164](https://github.com/ClickHouse/ClickHouse/issues/48164): Fixed `UNKNOWN_TABLE` exception when attaching to a materialized view that has dependent tables that are not available. This might be useful when trying to restore state from a backup. [#47975](https://github.com/ClickHouse/ClickHouse/pull/47975) ([MikhailBurdukov](https://github.com/MikhailBurdukov)).
#### Build/Testing/Packaging Improvement
* Backported in [#48216](https://github.com/ClickHouse/ClickHouse/issues/48216): Use sccache as a replacement for ccache and using S3 as cache backend. [#46240](https://github.com/ClickHouse/ClickHouse/pull/46240) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Backported in [#48256](https://github.com/ClickHouse/ClickHouse/issues/48256): The `clickhouse/clickhouse-keeper` image used to be pushed only with tags `-alpine`, e.g. `latest-alpine`. As it was suggested in https://github.com/ClickHouse/examples/pull/2, now it will be pushed as suffixless too. [#48236](https://github.com/ClickHouse/ClickHouse/pull/48236) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
#### Bug Fix (user-visible misbehavior in an official stable release)
* Fix changing an expired role [#46772](https://github.com/ClickHouse/ClickHouse/pull/46772) ([Vitaly Baranov](https://github.com/vitlibar)).
* Fix bug in zero-copy replication disk choice during fetch [#47010](https://github.com/ClickHouse/ClickHouse/pull/47010) ([alesapin](https://github.com/alesapin)).
* Fix NOT_IMPLEMENTED error with CROSS JOIN and algorithm = auto [#47068](https://github.com/ClickHouse/ClickHouse/pull/47068) ([Vladimir C](https://github.com/vdimir)).
* Disable logical expression optimizer for expression with aliases. [#47451](https://github.com/ClickHouse/ClickHouse/pull/47451) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Fix query parameters [#47488](https://github.com/ClickHouse/ClickHouse/pull/47488) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Parameterized view bug fix 47287 47247 [#47495](https://github.com/ClickHouse/ClickHouse/pull/47495) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
* Proper fix for bug in parquet, revert reverted [#45878](https://github.com/ClickHouse/ClickHouse/issues/45878) [#47538](https://github.com/ClickHouse/ClickHouse/pull/47538) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix wait for zero copy lock during move [#47631](https://github.com/ClickHouse/ClickHouse/pull/47631) ([alesapin](https://github.com/alesapin)).
* Hotfix for too verbose warnings in HTTP [#47903](https://github.com/ClickHouse/ClickHouse/pull/47903) ([Alexander Tokmakov](https://github.com/tavplubix)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* fix: keeper systemd service file include invalid inline comment [#47105](https://github.com/ClickHouse/ClickHouse/pull/47105) ([SuperDJY](https://github.com/cmsxbc)).
* Better error messages in ReplicatedMergeTreeAttachThread [#47454](https://github.com/ClickHouse/ClickHouse/pull/47454) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix `00933_test_fix_extra_seek_on_compressed_cache` in releases. [#47490](https://github.com/ClickHouse/ClickHouse/pull/47490) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix startup on older systemd versions [#47689](https://github.com/ClickHouse/ClickHouse/pull/47689) ([Thomas Casteleyn](https://github.com/Hipska)).
* Add a fuse for backport branches w/o a created PR [#47760](https://github.com/ClickHouse/ClickHouse/pull/47760) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Only valid Reviews.STATES overwrite existing reviews [#47789](https://github.com/ClickHouse/ClickHouse/pull/47789) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Place short return before big block, improve logging [#47822](https://github.com/ClickHouse/ClickHouse/pull/47822) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Artifacts s3 prefix [#47945](https://github.com/ClickHouse/ClickHouse/pull/47945) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Fix tsan error lock-order-inversion [#47953](https://github.com/ClickHouse/ClickHouse/pull/47953) ([Kruglov Pavel](https://github.com/Avogar)).


@ -8,11 +8,18 @@ sidebar_label: Data Replication
:::note
In ClickHouse Cloud, replication is managed for you. Please create your tables without adding arguments. For example, in the text below you would replace:
```sql
ENGINE = ReplicatedReplacingMergeTree(
'/clickhouse/tables/{shard}/table_name',
'{replica}',
ver
)
```
ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{shard}/table_name', '{replica}', ver)
```
with:
```
```sql
ENGINE = ReplicatedReplacingMergeTree
```
:::
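For instance, a minimal sketch of a Cloud-style definition following this convention (the table and column names here are hypothetical):
```sql
CREATE TABLE replicated_example
(
    key UInt64,
    value String
)
ENGINE = ReplicatedReplacingMergeTree
ORDER BY key;
```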


@ -78,7 +78,8 @@ Of course, it's possible to manually run `CREATE TABLE` with same path on nonrel
### Inserts
When new rows are inserted into `KeeperMap`, if the key already exists, the value will be updated, otherwise new key is created.
When new rows are inserted into `KeeperMap`, if the key does not exist, a new entry for the key is created.
If the key exists and the setting `keeper_map_strict_mode` is set to `true`, an exception is thrown; otherwise, the value for the key is overwritten.
Example:
@ -89,6 +90,7 @@ INSERT INTO keeper_map_table VALUES ('some key', 1, 'value', 3.2);
### Deletes
Rows can be deleted using `DELETE` query or `TRUNCATE`.
If the key exists and the setting `keeper_map_strict_mode` is set to `true`, fetching and deleting data will succeed only if it can be executed atomically.
```sql
DELETE FROM keeper_map_table WHERE key LIKE 'some%' AND v1 > 1;
@ -105,6 +107,7 @@ TRUNCATE TABLE keeper_map_table;
### Updates
Values can be updated using `ALTER TABLE` query. Primary key cannot be updated.
If the setting `keeper_map_strict_mode` is set to `true`, fetching and updating data will succeed only if it's executed atomically.
```sql
ALTER TABLE keeper_map_table UPDATE v1 = v1 * 10 + 2 WHERE key LIKE 'some%' AND v3 > 3.1;
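-- A hedged sketch of the strict mode described above: with the setting
-- enabled, the update succeeds only if it can be applied atomically.
SET keeper_map_strict_mode = 1;
ALTER TABLE keeper_map_table UPDATE v1 = v1 + 1 WHERE key LIKE 'some%';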


@ -67,7 +67,8 @@ CREATE TABLE youtube
(
`id` String,
`fetch_date` DateTime,
`upload_date` String,
`upload_date_str` String,
`upload_date` Date,
`title` String,
`uploader_id` String,
`uploader` String,
@ -87,7 +88,7 @@ CREATE TABLE youtube
`video_badges` String
)
ENGINE = MergeTree
ORDER BY (upload_date, uploader);
ORDER BY (uploader, upload_date);
```
3. The following command streams the records from the S3 files into the `youtube` table.
@ -101,8 +102,9 @@ INSERT INTO youtube
SETTINGS input_format_null_as_default = 1
SELECT
id,
parseDateTimeBestEffortUS(toString(fetch_date)) AS fetch_date,
upload_date,
parseDateTimeBestEffortUSOrZero(toString(fetch_date)) AS fetch_date,
upload_date AS upload_date_str,
toDate(parseDateTimeBestEffortUSOrZero(upload_date::String)) AS upload_date,
ifNull(title, '') AS title,
uploader_id,
ifNull(uploader, '') AS uploader,
@ -121,13 +123,26 @@ SELECT
ifNull(uploader_badges, '') AS uploader_badges,
ifNull(video_badges, '') AS video_badges
FROM s3Cluster(
'default',
'https://clickhouse-public-datasets.s3.amazonaws.com/youtube/original/files/*.zst',
'JSONLines'
);
'default',
'https://clickhouse-public-datasets.s3.amazonaws.com/youtube/original/files/*.zst',
'JSONLines'
)
SETTINGS
max_download_threads = 24,
max_insert_threads = 64,
max_insert_block_size = 100000000,
min_insert_block_size_rows = 100000000,
min_insert_block_size_bytes = 500000000;
```
4. Open a new tab in the SQL Console of ClickHouse Cloud (or a new `clickhouse-client` window) and watch the count increase. It will take a while to insert 4.56B rows, depending on your server resources. (Withtout any tweaking of settings, it takes about 4.5 hours.)
Some comments about our `INSERT` command:
- The `parseDateTimeBestEffortUSOrZero` function is handy when the incoming date fields may not be in the proper format. If `fetch_date` does not get parsed properly, it will be set to `0`.
- The `upload_date` column contains valid dates, but it also contains strings like "4 hours ago", which is certainly not a valid date. We decided to store the original value in `upload_date_str` and attempt to parse it with `toDate(parseDateTimeBestEffortUSOrZero(upload_date::String))`. If the parsing fails we just get `0` (see the sketch after this list).
- We used `ifNull` to avoid getting `NULL` values in our table. If an incoming value is `NULL`, the `ifNull` function sets the value to an empty string.
- It takes a long time to download the data, so we added a `SETTINGS` clause to spread out the work over more threads while making sure the block sizes stayed fairly large.
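A hedged illustration of that parsing behavior (the literal values are made up for this example):
```sql
SELECT
    parseDateTimeBestEffortUSOrZero('4 hours ago') AS unparsable, -- zero date: 1970-01-01 00:00:00
    toDate(parseDateTimeBestEffortUSOrZero('2006-09-15')) AS parsed; -- 2006-09-15
```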
4. Open a new tab in the SQL Console of ClickHouse Cloud (or a new `clickhouse-client` window) and watch the count increase. It will take a while to insert 4.56B rows, depending on your server resources. (Without any tweaking of settings, it takes about 4.5 hours.)
```sql
SELECT formatReadableQuantity(count())
@ -200,7 +215,7 @@ FROM youtube
WHERE (title ILIKE '%ClickHouse%') OR (description ILIKE '%ClickHouse%')
ORDER BY
like_count DESC,
view_count DESC
view_count DESC;
```
This query has to process every row, and also parse through two columns of strings. Even then, we get decent performance at 4.15M rows/second:
@ -224,7 +239,6 @@ The results look like:
When commenting is disabled, are people more likely to like or dislike to express their feelings about a video?
```sql
SELECT
concat('< ', formatReadableQuantity(view_range)) AS views,
@ -276,6 +290,127 @@ ORDER BY
Enabling comments seems to be correlated with a higher rate of engagement.
### How does the number of videos change over time - notable events?
```sql
SELECT
toStartOfMonth(toDateTime(upload_date)) AS month,
uniq(uploader_id) AS uploaders,
count() as num_videos,
sum(view_count) as view_count
FROM youtube
GROUP BY month
ORDER BY month ASC;
```
```response
┌──────month─┬─uploaders─┬─num_videos─┬───view_count─┐
│ 2005-04-01 │ 5 │ 6 │ 213597737 │
│ 2005-05-01 │ 6 │ 9 │ 2944005 │
│ 2005-06-01 │ 165 │ 351 │ 18624981 │
│ 2005-07-01 │ 395 │ 1168 │ 94164872 │
│ 2005-08-01 │ 1171 │ 3128 │ 124540774 │
│ 2005-09-01 │ 2418 │ 5206 │ 475536249 │
│ 2005-10-01 │ 6750 │ 13747 │ 737593613 │
│ 2005-11-01 │ 13706 │ 28078 │ 1896116976 │
│ 2005-12-01 │ 24756 │ 49885 │ 2478418930 │
│ 2006-01-01 │ 49992 │ 100447 │ 4532656581 │
│ 2006-02-01 │ 67882 │ 138485 │ 5677516317 │
│ 2006-03-01 │ 103358 │ 212237 │ 8430301366 │
│ 2006-04-01 │ 114615 │ 234174 │ 9980760440 │
│ 2006-05-01 │ 152682 │ 332076 │ 14129117212 │
│ 2006-06-01 │ 193962 │ 429538 │ 17014143263 │
│ 2006-07-01 │ 234401 │ 530311 │ 18721143410 │
│ 2006-08-01 │ 281280 │ 614128 │ 20473502342 │
│ 2006-09-01 │ 312434 │ 679906 │ 23158422265 │
│ 2006-10-01 │ 404873 │ 897590 │ 27357846117 │
```
A spike of uploaders [around covid is noticeable](https://www.theverge.com/2020/3/27/21197642/youtube-with-me-style-videos-views-coronavirus-cook-workout-study-home-beauty).
### More subtitles over time and when
With advances in speech recognition, it's easier than ever to create subtitles for videos, with YouTube adding auto-captioning in late 2009 - was there a jump then?
```sql
SELECT
toStartOfMonth(upload_date) AS month,
countIf(has_subtitles) / count() AS percent_subtitles,
percent_subtitles - any(percent_subtitles) OVER (
ORDER BY month ASC ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING
) AS previous
FROM youtube
GROUP BY month
ORDER BY month ASC;
```
```response
┌──────month─┬───percent_subtitles─┬────────────────previous─┐
│ 2015-01-01 │ 0.2652653881082824 │ 0.2652653881082824 │
│ 2015-02-01 │ 0.3147556050309162 │ 0.049490216922633834 │
│ 2015-03-01 │ 0.32460464492371877 │ 0.009849039892802558 │
│ 2015-04-01 │ 0.33471963051468445 │ 0.010114985590965686 │
│ 2015-05-01 │ 0.3168087575501062 │ -0.017910872964578273 │
│ 2015-06-01 │ 0.3162609788438222 │ -0.0005477787062839745 │
│ 2015-07-01 │ 0.31828767677518033 │ 0.0020266979313581235 │
│ 2015-08-01 │ 0.3045551564286859 │ -0.013732520346494415 │
│ 2015-09-01 │ 0.311221133995152 │ 0.006665977566466086 │
│ 2015-10-01 │ 0.30574870926812175 │ -0.005472424727030245 │
│ 2015-11-01 │ 0.31125409712077234 │ 0.0055053878526505895 │
│ 2015-12-01 │ 0.3190967954651779 │ 0.007842698344405541 │
│ 2016-01-01 │ 0.32636021432496176 │ 0.007263418859783877 │
```
The data shows a spike in 2009. Apparently at that time, YouTube was removing its community captions feature, which allowed you to upload captions for other people's videos.
This prompted a very successful campaign to have creators add captions to their videos for hard-of-hearing and deaf viewers.
### Top uploaders over time
```sql
WITH uploaders AS
(
SELECT uploader
FROM youtube
GROUP BY uploader
ORDER BY sum(view_count) DESC
LIMIT 10
)
SELECT
month,
uploader,
sum(view_count) AS total_views,
avg(dislike_count / like_count) AS like_to_dislike_ratio
FROM youtube
WHERE uploader IN (uploaders)
GROUP BY
toStartOfMonth(upload_date) AS month,
uploader
ORDER BY
month ASC,
total_views DESC;
```
```response
┌──────month─┬─uploader───────────────────┬─total_views─┬─like_to_dislike_ratio─┐
│ 1970-01-01 │ T-Series │ 10957099 │ 0.022784656361208206 │
│ 1970-01-01 │ Ryan's World │ 0 │ 0.003035559410234172 │
│ 1970-01-01 │ SET India │ 0 │ nan │
│ 2006-09-01 │ Cocomelon - Nursery Rhymes │ 256406497 │ 0.7005566715978622 │
│ 2007-06-01 │ Cocomelon - Nursery Rhymes │ 33641320 │ 0.7088650914344298 │
│ 2008-02-01 │ WWE │ 43733469 │ 0.07198856488734842 │
│ 2008-03-01 │ WWE │ 16514541 │ 0.1230603715431997 │
│ 2008-04-01 │ WWE │ 5907295 │ 0.2089399470159618 │
│ 2008-05-01 │ WWE │ 7779627 │ 0.09101676560436774 │
│ 2008-06-01 │ WWE │ 7018780 │ 0.0974184753155297 │
│ 2008-07-01 │ WWE │ 4686447 │ 0.1263845422065158 │
│ 2008-08-01 │ WWE │ 4514312 │ 0.08384574274791441 │
│ 2008-09-01 │ WWE │ 3717092 │ 0.07872802579349912 │
```
### How does the like ratio change as views go up?
```sql
@ -322,8 +457,6 @@ ORDER BY
│ < 10.00 billion   │ false               │       1.77 │
│ < 10.00 billion   │ true                │       19.5 │
└───────────────────┴─────────────────────┴────────────┘
20 rows in set. Elapsed: 63.664 sec. Processed 4.56 billion rows, 113.93 GB (71.59 million rows/s., 1.79 GB/s.)
```
### How are views distributed?
@ -359,6 +492,4 @@ ARRAY JOIN
│ 20th │ 16 │
│ 10th │ 6 │
└────────────┴─────────┘
12 rows in set. Elapsed: 1.864 sec. Processed 4.56 billion rows, 36.46 GB (2.45 billion rows/s., 19.56 GB/s.)
```


@ -1235,8 +1235,8 @@ For output it uses the following correspondence between ClickHouse types and BSO
| ClickHouse type | BSON Type |
|-----------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|
| [Bool](/docs/en/sql-reference/data-types/boolean.md) | `\x08` boolean |
| [Int8/UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `\x10` int32 |
| [Int16UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `\x10` int32 |
| [Int8/UInt8](/docs/en/sql-reference/data-types/int-uint.md)/[Enum8](/docs/en/sql-reference/data-types/enum.md) | `\x10` int32 |
| [Int16/UInt16](/docs/en/sql-reference/data-types/int-uint.md)/[Enum16](/docs/en/sql-reference/data-types/enum.md) | `\x10` int32 |
| [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `\x10` int32 |
| [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `\x12` int64 |
| [Int64/UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `\x12` int64 |
@ -1255,30 +1255,30 @@ For output it uses the following correspondence between ClickHouse types and BSO
| [Array](/docs/en/sql-reference/data-types/array.md) | `\x04` array |
| [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `\x04` array |
| [Named Tuple](/docs/en/sql-reference/data-types/tuple.md) | `\x03` document |
| [Map](/docs/en/sql-reference/data-types/map.md) (with String keys) | `\x03` document |
| [Map](/docs/en/sql-reference/data-types/map.md) | `\x03` document |
| [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `\x10` int32 |
| [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `\x05` binary, `\x00` binary subtype |
For input it uses the following correspondence between BSON types and ClickHouse types:
| BSON Type | ClickHouse Type |
|------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `\x01` double | [Float32/Float64](/docs/en/sql-reference/data-types/float.md) |
| `\x02` string | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x03` document | [Map](/docs/en/sql-reference/data-types/map.md)/[Named Tuple](/docs/en/sql-reference/data-types/tuple.md) |
| `\x04` array | [Array](/docs/en/sql-reference/data-types/array.md)/[Tuple](/docs/en/sql-reference/data-types/tuple.md) |
| `\x05` binary, `\x00` binary subtype | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md)/[IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) |
| `\x05` binary, `\x02` old binary subtype | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x05` binary, `\x03` old uuid subtype | [UUID](/docs/en/sql-reference/data-types/uuid.md) |
| `\x05` binary, `\x04` uuid subtype | [UUID](/docs/en/sql-reference/data-types/uuid.md) |
| `\x07` ObjectId | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x08` boolean | [Bool](/docs/en/sql-reference/data-types/boolean.md) |
| `\x09` datetime | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) |
| `\x0A` null value | [NULL](/docs/en/sql-reference/data-types/nullable.md) |
| `\x0D` JavaScript code | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x0E` symbol | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x10` int32 | [Int32/UInt32](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal32](/docs/en/sql-reference/data-types/decimal.md)/[IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) |
| `\x12` int64 | [Int64/UInt64](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal64](/docs/en/sql-reference/data-types/decimal.md)/[DateTime64](/docs/en/sql-reference/data-types/datetime64.md) |
| BSON Type | ClickHouse Type |
|------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `\x01` double | [Float32/Float64](/docs/en/sql-reference/data-types/float.md) |
| `\x02` string | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x03` document | [Map](/docs/en/sql-reference/data-types/map.md)/[Named Tuple](/docs/en/sql-reference/data-types/tuple.md) |
| `\x04` array | [Array](/docs/en/sql-reference/data-types/array.md)/[Tuple](/docs/en/sql-reference/data-types/tuple.md) |
| `\x05` binary, `\x00` binary subtype | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md)/[IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) |
| `\x05` binary, `\x02` old binary subtype | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x05` binary, `\x03` old uuid subtype | [UUID](/docs/en/sql-reference/data-types/uuid.md) |
| `\x05` binary, `\x04` uuid subtype | [UUID](/docs/en/sql-reference/data-types/uuid.md) |
| `\x07` ObjectId | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x08` boolean | [Bool](/docs/en/sql-reference/data-types/boolean.md) |
| `\x09` datetime | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) |
| `\x0A` null value | [NULL](/docs/en/sql-reference/data-types/nullable.md) |
| `\x0D` JavaScript code | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x0E` symbol | [String](/docs/en/sql-reference/data-types/string.md)/[FixedString](/docs/en/sql-reference/data-types/fixedstring.md) |
| `\x10` int32 | [Int32/UInt32](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal32](/docs/en/sql-reference/data-types/decimal.md)/[IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md)/[Enum8/Enum16](/docs/en/sql-reference/data-types/enum.md) |
| `\x12` int64 | [Int64/UInt64](/docs/en/sql-reference/data-types/int-uint.md)/[Decimal64](/docs/en/sql-reference/data-types/decimal.md)/[DateTime64](/docs/en/sql-reference/data-types/datetime64.md) |
Other BSON types are not supported. Also, it performs conversion between different integer types (for example, you can insert a BSON int32 value into a ClickHouse UInt8 column).
Big integers and decimals (Int128/UInt128/Int256/UInt256/Decimal128/Decimal256) can be parsed from a BSON Binary value with the `\x00` binary subtype. In this case, the format validates that the size of the binary data equals the size of the expected value.
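For instance, a hedged sketch of reading such a value back (the file name and column are hypothetical):
```sql
-- An Int128 arrives as a 16-byte BSON binary value (subtype \x00); the format
-- checks that the binary length matches the size of the target type.
SELECT x FROM file('data.bson', 'BSONEachRow', 'x Int128');
```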
@ -1610,29 +1610,34 @@ See also [Format Schema](#formatschema).
The table below shows supported data types and how they match ClickHouse [data types](/docs/en/sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries.
| CapnProto data type (`INSERT`) | ClickHouse data type | CapnProto data type (`SELECT`) |
|----------------------------------|------------------------------------------------------------------------------------------------------------------------|------------------------------|
| `UINT8`, `BOOL` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `UINT8` |
| `INT8` | [Int8](/docs/en/sql-reference/data-types/int-uint.md) | `INT8` |
| `UINT16` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md), [Date](/docs/en/sql-reference/data-types/date.md) | `UINT16` |
| `INT16` | [Int16](/docs/en/sql-reference/data-types/int-uint.md) | `INT16` |
| `UINT32` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md), [DateTime](/docs/en/sql-reference/data-types/datetime.md) | `UINT32` |
| `INT32` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `INT32` |
| `UINT64` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `UINT64` |
| `INT64` | [Int64](/docs/en/sql-reference/data-types/int-uint.md), [DateTime64](/docs/en/sql-reference/data-types/datetime.md) | `INT64` |
| `FLOAT32` | [Float32](/docs/en/sql-reference/data-types/float.md) | `FLOAT32` |
| `FLOAT64` | [Float64](/docs/en/sql-reference/data-types/float.md) | `FLOAT64` |
| `TEXT, DATA` | [String](/docs/en/sql-reference/data-types/string.md), [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | `TEXT, DATA` |
| `union(T, Void), union(Void, T)` | [Nullable(T)](/docs/en/sql-reference/data-types/date.md) | `union(T, Void), union(Void, T)` |
| `ENUM` | [Enum(8\ |16)](/docs/en/sql-reference/data-types/enum.md) | `ENUM` |
| `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` |
| `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` |
| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` |
| `DATA` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `DATA` |
| CapnProto data type (`INSERT`) | ClickHouse data type | CapnProto data type (`SELECT`) |
|------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------|
| `UINT8`, `BOOL` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `UINT8` |
| `INT8` | [Int8](/docs/en/sql-reference/data-types/int-uint.md) | `INT8` |
| `UINT16` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md), [Date](/docs/en/sql-reference/data-types/date.md) | `UINT16` |
| `INT16` | [Int16](/docs/en/sql-reference/data-types/int-uint.md) | `INT16` |
| `UINT32` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md), [DateTime](/docs/en/sql-reference/data-types/datetime.md) | `UINT32` |
| `INT32` | [Int32](/docs/en/sql-reference/data-types/int-uint.md), [Decimal32](/docs/en/sql-reference/data-types/decimal.md) | `INT32` |
| `UINT64` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `UINT64` |
| `INT64` | [Int64](/docs/en/sql-reference/data-types/int-uint.md), [DateTime64](/docs/en/sql-reference/data-types/datetime.md), [Decimal64](/docs/en/sql-reference/data-types/decimal.md) | `INT64` |
| `FLOAT32` | [Float32](/docs/en/sql-reference/data-types/float.md) | `FLOAT32` |
| `FLOAT64` | [Float64](/docs/en/sql-reference/data-types/float.md) | `FLOAT64` |
| `TEXT, DATA` | [String](/docs/en/sql-reference/data-types/string.md), [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | `TEXT, DATA` |
| `union(T, Void), union(Void, T)` | [Nullable(T)](/docs/en/sql-reference/data-types/date.md) | `union(T, Void), union(Void, T)` |
| `ENUM` | [Enum(8/16)](/docs/en/sql-reference/data-types/enum.md) | `ENUM` |
| `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` |
| `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` |
| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` |
| `DATA` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `DATA` |
| `DATA` | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md) | `DATA` |
| `DATA` | [Decimal128/Decimal256](/docs/en/sql-reference/data-types/decimal.md) | `DATA` |
| `STRUCT(entries LIST(STRUCT(key Key, value Value)))` | [Map](/docs/en/sql-reference/data-types/map.md) | `STRUCT(entries LIST(STRUCT(key Key, value Value)))` |
Integer types can be converted into each other during input/output.
For working with `Enum` in CapnProto format use the [format_capn_proto_enum_comparising_mode](/docs/en/operations/settings/settings-formats.md/#format_capn_proto_enum_comparising_mode) setting.
Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` type also can be nested.
Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` and `Map` types also can be nested.
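As a hedged sketch, the setting can be supplied per query (the schema file and struct name here are hypothetical):
```sql
SELECT *
FROM file('data.capnp', 'CapnProto')
SETTINGS format_schema = 'schema:Message',
         format_capn_proto_enum_comparising_mode = 'by_names';
```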
### Inserting and Selecting Data {#inserting-and-selecting-data-capnproto}
@ -1931,30 +1936,31 @@ Setting `format_avro_schema_registry_url` needs to be configured in `users.xml`
The table below shows supported data types and how they match ClickHouse [data types](/docs/en/sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries.
| Parquet data type (`INSERT`) | ClickHouse data type | Parquet data type (`SELECT`) |
|----------------------------------------------------|-----------------------------------------------------------------|------------------------------|
| `BOOL` | [Bool](/docs/en/sql-reference/data-types/boolean.md) | `BOOL` |
| `UINT8`, `BOOL` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `UINT8` |
| `INT8` | [Int8](/docs/en/sql-reference/data-types/int-uint.md) | `INT8` |
| `UINT16` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `UINT16` |
| `INT16` | [Int16](/docs/en/sql-reference/data-types/int-uint.md) | `INT16` |
| `UINT32` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `UINT32` |
| `INT32` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `INT32` |
| `UINT64` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `UINT64` |
| `INT64` | [Int64](/docs/en/sql-reference/data-types/int-uint.md) | `INT64` |
| `FLOAT` | [Float32](/docs/en/sql-reference/data-types/float.md) | `FLOAT` |
| `DOUBLE` | [Float64](/docs/en/sql-reference/data-types/float.md) | `DOUBLE` |
| `DATE` | [Date32](/docs/en/sql-reference/data-types/date.md) | `DATE` |
| `TIME (ms)` | [DateTime](/docs/en/sql-reference/data-types/datetime.md) | `UINT32` |
| `TIMESTAMP`, `TIME (us, ns)` | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | `TIMESTAMP` |
| `STRING`, `BINARY` | [String](/docs/en/sql-reference/data-types/string.md) | `BINARY` |
| `STRING`, `BINARY`, `FIXED_LENGTH_BYTE_ARRAY` | [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | `FIXED_LENGTH_BYTE_ARRAY` |
| `DECIMAL` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `DECIMAL` |
| `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` |
| `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` |
| `MAP` | [Map](/docs/en/sql-reference/data-types/map.md) | `MAP` |
| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` |
| `FIXED_LENGTH_BYTE_ARRAY` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `FIXED_LENGTH_BYTE_ARRAY` |
| Parquet data type (`INSERT`) | ClickHouse data type | Parquet data type (`SELECT`) |
|-----------------------------------------------|------------------------------------------------------------------------------------------------------------|-------------------------------|
| `BOOL` | [Bool](/docs/en/sql-reference/data-types/boolean.md) | `BOOL` |
| `UINT8`, `BOOL` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `UINT8` |
| `INT8` | [Int8](/docs/en/sql-reference/data-types/int-uint.md)/[Enum8](/docs/en/sql-reference/data-types/enum.md) | `INT8` |
| `UINT16` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `UINT16` |
| `INT16` | [Int16](/docs/en/sql-reference/data-types/int-uint.md)/[Enum16](/docs/en/sql-reference/data-types/enum.md) | `INT16` |
| `UINT32` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `UINT32` |
| `INT32` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `INT32` |
| `UINT64` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `UINT64` |
| `INT64` | [Int64](/docs/en/sql-reference/data-types/int-uint.md) | `INT64` |
| `FLOAT` | [Float32](/docs/en/sql-reference/data-types/float.md) | `FLOAT` |
| `DOUBLE` | [Float64](/docs/en/sql-reference/data-types/float.md) | `DOUBLE` |
| `DATE` | [Date32](/docs/en/sql-reference/data-types/date.md) | `DATE` |
| `TIME (ms)` | [DateTime](/docs/en/sql-reference/data-types/datetime.md) | `UINT32` |
| `TIMESTAMP`, `TIME (us, ns)` | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | `TIMESTAMP` |
| `STRING`, `BINARY` | [String](/docs/en/sql-reference/data-types/string.md) | `BINARY` |
| `STRING`, `BINARY`, `FIXED_LENGTH_BYTE_ARRAY` | [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | `FIXED_LENGTH_BYTE_ARRAY` |
| `DECIMAL` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `DECIMAL` |
| `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` |
| `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` |
| `MAP` | [Map](/docs/en/sql-reference/data-types/map.md) | `MAP` |
| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` |
| `FIXED_LENGTH_BYTE_ARRAY`, `BINARY` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `FIXED_LENGTH_BYTE_ARRAY` |
| `FIXED_LENGTH_BYTE_ARRAY`, `BINARY` | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md) | `FIXED_LENGTH_BYTE_ARRAY` |
Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` and `Map` types also can be nested.
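A minimal round-trip sketch using the `file` table function (the file name and schema are made up for this example):
```sql
INSERT INTO TABLE FUNCTION file('test.parquet', 'Parquet', 'x Int64, s String')
VALUES (1, 'one');
SELECT * FROM file('test.parquet', 'Parquet');
```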
@ -2000,31 +2006,32 @@ To exchange data with Hadoop, you can use [HDFS table engine](/docs/en/engines/t
The table below shows supported data types and how they match ClickHouse [data types](/docs/en/sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries.
| Arrow data type (`INSERT`) | ClickHouse data type | Arrow data type (`SELECT`) |
|-----------------------------------------|-----------------------------------------------------------------|----------------------------|
| `BOOL` | [Bool](/docs/en/sql-reference/data-types/boolean.md) | `BOOL` |
| `UINT8`, `BOOL` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `UINT8` |
| `INT8` | [Int8](/docs/en/sql-reference/data-types/int-uint.md) | `INT8` |
| `UINT16` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `UINT16` |
| `INT16` | [Int16](/docs/en/sql-reference/data-types/int-uint.md) | `INT16` |
| `UINT32` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `UINT32` |
| `INT32` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `INT32` |
| `UINT64` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `UINT64` |
| `INT64` | [Int64](/docs/en/sql-reference/data-types/int-uint.md) | `INT64` |
| `FLOAT`, `HALF_FLOAT` | [Float32](/docs/en/sql-reference/data-types/float.md) | `FLOAT32` |
| `DOUBLE` | [Float64](/docs/en/sql-reference/data-types/float.md) | `FLOAT64` |
| `DATE32` | [Date32](/docs/en/sql-reference/data-types/date32.md) | `UINT16` |
| `DATE64` | [DateTime](/docs/en/sql-reference/data-types/datetime.md) | `UINT32` |
| `TIMESTAMP`, `TIME32`, `TIME64` | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | `UINT32` |
| `STRING`, `BINARY` | [String](/docs/en/sql-reference/data-types/string.md) | `BINARY` |
| `STRING`, `BINARY`, `FIXED_SIZE_BINARY` | [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | `FIXED_SIZE_BINARY` |
| `DECIMAL` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `DECIMAL` |
| `DECIMAL256` | [Decimal256](/docs/en/sql-reference/data-types/decimal.md) | `DECIMAL256` |
| `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` |
| `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` |
| `MAP` | [Map](/docs/en/sql-reference/data-types/map.md) | `MAP` |
| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` |
| `FIXED_SIZE_BINARY`, `BINARY` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `FIXED_SIZE_BINARY` |
| Arrow data type (`INSERT`) | ClickHouse data type | Arrow data type (`SELECT`) |
|-----------------------------------------|------------------------------------------------------------------------------------------------------------|----------------------------|
| `BOOL` | [Bool](/docs/en/sql-reference/data-types/boolean.md) | `BOOL` |
| `UINT8`, `BOOL` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `UINT8` |
| `INT8` | [Int8](/docs/en/sql-reference/data-types/int-uint.md)/[Enum8](/docs/en/sql-reference/data-types/enum.md) | `INT8` |
| `UINT16` | [UInt16](/docs/en/sql-reference/data-types/int-uint.md) | `UINT16` |
| `INT16` | [Int16](/docs/en/sql-reference/data-types/int-uint.md)/[Enum16](/docs/en/sql-reference/data-types/enum.md) | `INT16` |
| `UINT32` | [UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `UINT32` |
| `INT32` | [Int32](/docs/en/sql-reference/data-types/int-uint.md) | `INT32` |
| `UINT64` | [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `UINT64` |
| `INT64` | [Int64](/docs/en/sql-reference/data-types/int-uint.md) | `INT64` |
| `FLOAT`, `HALF_FLOAT` | [Float32](/docs/en/sql-reference/data-types/float.md) | `FLOAT32` |
| `DOUBLE` | [Float64](/docs/en/sql-reference/data-types/float.md) | `FLOAT64` |
| `DATE32` | [Date32](/docs/en/sql-reference/data-types/date32.md) | `UINT16` |
| `DATE64` | [DateTime](/docs/en/sql-reference/data-types/datetime.md) | `UINT32` |
| `TIMESTAMP`, `TIME32`, `TIME64` | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | `UINT32` |
| `STRING`, `BINARY` | [String](/docs/en/sql-reference/data-types/string.md) | `BINARY` |
| `STRING`, `BINARY`, `FIXED_SIZE_BINARY` | [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) | `FIXED_SIZE_BINARY` |
| `DECIMAL` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `DECIMAL` |
| `DECIMAL256` | [Decimal256](/docs/en/sql-reference/data-types/decimal.md) | `DECIMAL256` |
| `LIST` | [Array](/docs/en/sql-reference/data-types/array.md) | `LIST` |
| `STRUCT` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `STRUCT` |
| `MAP` | [Map](/docs/en/sql-reference/data-types/map.md) | `MAP` |
| `UINT32` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `UINT32` |
| `FIXED_SIZE_BINARY`, `BINARY` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `FIXED_SIZE_BINARY` |
| `FIXED_SIZE_BINARY`, `BINARY` | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md) | `FIXED_SIZE_BINARY` |
Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` and `Map` types can also be nested.
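As a minimal sketch (the table and column names are hypothetical), such nested types can be declared and then written out with the Arrow format:

```sql
CREATE TABLE arrow_nested_example
(
    `a` Array(Array(Nullable(Int64))),           -- nested Array with Nullable values
    `t` Tuple(s String, m Map(String, UInt64))   -- Tuple containing a nested Map
)
ENGINE = Memory;
```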
@ -2073,23 +2080,26 @@ $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Arrow" > {filenam
The table below shows supported data types and how they match ClickHouse [data types](/docs/en/sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries.
| ORC data type (`INSERT`) | ClickHouse data type | ORC data type (`SELECT`) |
|---------------------------------------|-------------------------------------------------------------------------------------------------------------------|--------------------------|
| `Boolean` | [UInt8](/docs/en/sql-reference/data-types/int-uint.md) | `Boolean` |
| `Tinyint` | [Int8/UInt8](/docs/en/sql-reference/data-types/int-uint.md)/[Enum8](/docs/en/sql-reference/data-types/enum.md) | `Tinyint` |
| `Smallint` | [Int16/UInt16](/docs/en/sql-reference/data-types/int-uint.md)/[Enum16](/docs/en/sql-reference/data-types/enum.md) | `Smallint` |
| `Int` | [Int32/UInt32](/docs/en/sql-reference/data-types/int-uint.md) | `Int` |
| `Bigint`                              | [Int64/UInt64](/docs/en/sql-reference/data-types/int-uint.md)                                                       | `Bigint`                 |
| `Float` | [Float32](/docs/en/sql-reference/data-types/float.md) | `Float` |
| `Double` | [Float64](/docs/en/sql-reference/data-types/float.md) | `Double` |
| `Decimal` | [Decimal](/docs/en/sql-reference/data-types/decimal.md) | `Decimal` |
| `Date` | [Date32](/docs/en/sql-reference/data-types/date32.md) | `Date` |
| `Timestamp` | [DateTime64](/docs/en/sql-reference/data-types/datetime64.md) | `Timestamp` |
| `String`, `Char`, `Varchar`, `Binary` | [String](/docs/en/sql-reference/data-types/string.md) | `Binary` |
| `List` | [Array](/docs/en/sql-reference/data-types/array.md) | `List` |
| `Struct` | [Tuple](/docs/en/sql-reference/data-types/tuple.md) | `Struct` |
| `Map` | [Map](/docs/en/sql-reference/data-types/map.md) | `Map` |
| `Int`                                 | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md)                                                           | `Int`                    |
| `Binary` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `Binary` |
| `Binary` | [Int128/UInt128/Int256/UInt256](/docs/en/sql-reference/data-types/int-uint.md) | `Binary` |
| `Binary` | [Decimal256](/docs/en/sql-reference/data-types/decimal.md) | `Binary` |
Other types are not supported.

View File

@ -244,10 +244,12 @@ Example of configuration:
<database>system</database>
<user>foo</user>
<password>secret</password>
<secure>1</secure>
</remote1>
</named_collections>
</clickhouse>
```
The `secure` parameter is not required when connecting via `remoteSecure`, but it can be used for dictionaries.
### Example of using named collections with the `remote`/`remoteSecure` functions

View File

@ -103,6 +103,20 @@ cached - for that use setting [query_cache_min_query_runs](settings/settings.md#
Entries in the query cache become stale after a certain time period (time-to-live). By default, this period is 60 seconds, but a different
value can be specified at session, profile or query level using setting [query_cache_ttl](settings/settings.md#query-cache-ttl).
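For example, a minimal sketch of overriding the TTL for a single query (the query itself is arbitrary):

```sql
SELECT count()
FROM numbers(1000000)
SETTINGS use_query_cache = true, query_cache_ttl = 300;
```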
Entries in the query cache are compressed by default. This reduces the overall memory consumption at the cost of slower writes into / reads
from the query cache. To disable compression, use setting [query_cache_compress_entries](settings/settings.md#query-cache-compress-entries).
ClickHouse reads table data in blocks of [max_block_size](settings/settings.md#settings-max_block_size) rows. Due to filtering, aggregation,
etc., result blocks are typically much smaller than 'max_block_size' but there are also cases where they are much bigger. Setting
[query_cache_squash_partial_results](settings/settings.md#query-cache-squash-partial-results) (enabled by default) controls if result blocks
are squashed (if they are tiny) or split (if they are large) into blocks of 'max_block_size' size before insertion into the query result
cache. This reduces performance of writes into the query cache but improves compression rate of cache entries and provides more natural
block granularity when query results are later served from the query cache.
As a result, the query cache stores multiple (partial) result blocks for each query. While this behavior is a good default, it can be suppressed using setting
[query_cache_squash_partial_results](settings/settings.md#query-cache-squash-partial-results).
Also, results of queries with non-deterministic functions such as `rand()` and `now()` are not cached. This can be overruled using
setting [query_cache_store_results_of_queries_with_nondeterministic_functions](settings/settings.md#query-cache-store-results-of-queries-with-nondeterministic-functions).
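A minimal example of overruling it for one (arbitrary) query:

```sql
SELECT now(), count()
FROM numbers(1000000)
SETTINGS use_query_cache = true,
         query_cache_store_results_of_queries_with_nondeterministic_functions = 1;
```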

View File

@ -1435,6 +1435,28 @@ Possible values:
Default value: `0`
## query_cache_compress_entries {#query-cache-compress-entries}
Compress entries in the [query cache](../query-cache.md). Lessens the memory consumption of the query cache at the cost of slower inserts into / reads from it.
Possible values:
- 0 - Disabled
- 1 - Enabled
Default value: `1`
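A minimal sketch of disabling compression for a single (arbitrary) query, trading higher memory consumption for faster cache writes and reads:

```sql
SELECT count()
FROM numbers(1000000)
SETTINGS use_query_cache = true, query_cache_compress_entries = 0;
```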
## query_cache_squash_partial_results {#query-cache-squash-partial-results}
Squash partial result blocks to blocks of size [max_block_size](#setting-max_block_size). Reduces the performance of inserts into the [query cache](../query-cache.md) but improves the compressibility of cache entries (see [query_cache_compress_entries](#query-cache-compress-entries)).
Possible values:
- 0 - Disabled
- 1 - Enabled
Default value: `1`
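Likewise, squashing can be disabled per query to keep the original result-block granularity (the query is arbitrary):

```sql
SELECT number % 5 AS k, count()
FROM numbers(10000000)
GROUP BY k
SETTINGS use_query_cache = true, query_cache_squash_partial_results = 0;
```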
## query_cache_ttl {#query-cache-ttl}
After this time (in seconds), entries in the [query cache](../query-cache.md) become stale.
@ -4049,3 +4071,44 @@ SELECT sum(number) FROM numbers(10000000000) SETTINGS partial_result_on_first_ca
Possible values: `true`, `false`
Default value: `false`
## function_json_value_return_type_allow_nullable
Controls whether the `JSON_VALUE` function is allowed to return `NULL` when the value does not exist.
```sql
SELECT JSON_VALUE('{"hello":"world"}', '$.b') settings function_json_value_return_type_allow_nullable=true;
┌─JSON_VALUE('{"hello":"world"}', '$.b')─┐
│ ᴺᵁᴸᴸ │
└────────────────────────────────────────┘
1 row in set. Elapsed: 0.001 sec.
```
Possible values:
- true — Allow.
- false — Disallow.
Default value: `false`.
## function_json_value_return_type_allow_complex
Controls whether the `JSON_VALUE` function is allowed to return complex types (such as struct, array, or map).
```sql
SELECT JSON_VALUE('{"hello":{"world":"!"}}', '$.hello') settings function_json_value_return_type_allow_complex=true
┌─JSON_VALUE('{"hello":{"world":"!"}}', '$.hello')─┐
│ {"world":"!"} │
└──────────────────────────────────────────────────┘
1 row in set. Elapsed: 0.001 sec.
```
Possible values:
- true — Allow.
- false — Disallow.
Default value: `false`.

View File

@ -51,10 +51,14 @@ But for storing archives with rare queries, shelves will work.
## RAID {#raid}
When using HDDs, you can combine them into RAID-10, RAID-5, RAID-6 or RAID-50.
For Linux, software RAID is better (with `mdadm`).
When creating RAID-10, select the `far` layout.
If your budget allows, choose RAID-10.
LVM by itself (without RAID or `mdadm`) is ok, but making RAID with it or combining it with `mdadm` is a less explored option, and there will be more chances for mistakes
(selecting the wrong chunk size; misalignment of chunks; choosing the wrong RAID type; forgetting to clean up disks). If you are confident
in using LVM, there is nothing against using it.
If you have more than 4 disks, use RAID-6 (preferred) or RAID-50 instead of RAID-5.
When using RAID-5, RAID-6 or RAID-50, always increase `stripe_cache_size`, since the default value is usually not the best choice.
@ -70,7 +74,7 @@ Never set the block size too small or too large.
You can use RAID-0 on SSD.
Regardless of RAID use, always use replication for data security.
Enable NCQ with a long queue. For HDD, choose the mq-deadline or CFQ scheduler, and for SSD, choose noop. Don't reduce the readahead setting.
For HDD, enable the write cache.
Make sure that [`fstrim`](https://en.wikipedia.org/wiki/Trim_(computing)) is enabled for NVME and SSD disks in your OS (usually it's implemented using a cronjob or systemd service).

View File

@ -6,12 +6,158 @@ sidebar_label: clickhouse-local
# clickhouse-local
The `clickhouse-local` program enables you to perform fast processing on local files, without having to deploy and configure the ClickHouse server. It accepts data that represent tables and queries them using [ClickHouse SQL dialect](../../sql-reference/index.md). `clickhouse-local` uses the same core as ClickHouse server, so it supports most of the features and the same set of formats and table engines.
## When to use clickhouse-local vs. ClickHouse
`clickhouse-local` is an easy-to-use version of ClickHouse that is ideal for developers who need to perform fast processing on local and remote files using SQL without having to install a full database server. With `clickhouse-local`, developers can use SQL commands (using the [ClickHouse SQL dialect](../../sql-reference/index.md)) directly from the command line, providing a simple and efficient way to access ClickHouse features without the need for a full ClickHouse installation. One of the main benefits of `clickhouse-local` is that it is already included when installing [clickhouse-client](https://clickhouse.com/docs/en/integrations/sql-clients/clickhouse-client-local). This means that developers can get started with `clickhouse-local` quickly, without the need for a complex installation process.
While `clickhouse-local` is a great tool for development and testing purposes, and for processing files, it is not suitable for serving end users or applications. In these scenarios, it is recommended to use the open-source [ClickHouse](https://clickhouse.com/docs/en/install). ClickHouse is a powerful OLAP database that is designed to handle large-scale analytical workloads. It provides fast and efficient processing of complex queries on large datasets, making it ideal for use in production environments where high performance is critical. Additionally, ClickHouse offers a wide range of features such as replication, sharding, and high availability, which are essential for scaling up to handle large datasets and serving applications. If you need to handle larger datasets or serve end users or applications, we recommend using open-source ClickHouse instead of `clickhouse-local`.
Please read the docs below that show example use cases for `clickhouse-local`, such as [querying local CSVs](#query-data-in-a-csv-file-using-sql) or [reading a parquet file in S3](#query-data-in-a-parquet-file-in-aws-s3).
## Download clickhouse-local
`clickhouse-local` is executed using the same `clickhouse` binary that runs the ClickHouse server and `clickhouse-client`. The easiest way to download the latest version is with the following command:
```bash
curl https://clickhouse.com/ | sh
```
:::note
The binary you just downloaded can run all sorts of ClickHouse tools and utilities. If you want to run ClickHouse as a database server, check out the [Quick Start](../../quick-start.mdx).
:::
## Query data in a CSV file using SQL
A common use of `clickhouse-local` is to run ad-hoc queries on files, without having to insert the data into a table. `clickhouse-local` can stream the data from a file into a temporary table and execute your SQL.
If the file is sitting on the same machine as `clickhouse-local`, use the `file` table function. The following `reviews.tsv` file contains a sampling of Amazon product reviews:
```bash
./clickhouse local -q "SELECT * FROM file('reviews.tsv')"
```
ClickHouse knows the file uses a tab-separated format from the filename extension. If you need to explicitly specify the format, simply add one of the [many ClickHouse input formats](../../interfaces/formats.md):
```bash
./clickhouse local -q "SELECT * FROM file('reviews.tsv', 'TabSeparated')"
```
The `file` table function creates a table, and you can use `DESCRIBE` to see the inferred schema:
```bash
./clickhouse local -q "DESCRIBE file('reviews.tsv')"
```
```response
marketplace Nullable(String)
customer_id Nullable(Int64)
review_id Nullable(String)
product_id Nullable(String)
product_parent Nullable(Int64)
product_title Nullable(String)
product_category Nullable(String)
star_rating Nullable(Int64)
helpful_votes Nullable(Int64)
total_votes Nullable(Int64)
vine Nullable(String)
verified_purchase Nullable(String)
review_headline Nullable(String)
review_body Nullable(String)
review_date Nullable(Date)
```
Let's find a product with the highest rating:
```bash
./clickhouse local -q "SELECT
argMax(product_title,star_rating),
max(star_rating)
FROM file('reviews.tsv')"
```
```response
Monopoly Junior Board Game 5
```
## Query data in a Parquet file in AWS S3
If you have a file in S3, use `clickhouse-local` and the `s3` table function to query the file in place (without inserting the data into a ClickHouse table). We have a file named `house_0.parquet` in a public bucket that contains prices of properties sold in the United Kingdom. Let's see how many rows it has:
```bash
./clickhouse local -q "
SELECT count()
FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/house_parquet/house_0.parquet')"
```
The file has 2.7M rows:
```response
2772030
```
It's always useful to see the schema that ClickHouse infers from the file:
```bash
./clickhouse local -q "DESCRIBE s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/house_parquet/house_0.parquet')"
```
```response
price Nullable(Int64)
date Nullable(UInt16)
postcode1 Nullable(String)
postcode2 Nullable(String)
type Nullable(String)
is_new Nullable(UInt8)
duration Nullable(String)
addr1 Nullable(String)
addr2 Nullable(String)
street Nullable(String)
locality Nullable(String)
town Nullable(String)
district Nullable(String)
county Nullable(String)
```
Let's see what the most expensive neighborhoods are:
```bash
./clickhouse local -q "
SELECT
town,
district,
count() AS c,
round(avg(price)) AS price,
bar(price, 0, 5000000, 100)
FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/house_parquet/house_0.parquet')
GROUP BY
town,
district
HAVING c >= 100
ORDER BY price DESC
LIMIT 10"
```
```response
LONDON CITY OF LONDON 886 2271305 █████████████████████████████████████████████▍
LEATHERHEAD ELMBRIDGE 206 1176680 ███████████████████████▌
LONDON CITY OF WESTMINSTER 12577 1108221 ██████████████████████▏
LONDON KENSINGTON AND CHELSEA 8728 1094496 █████████████████████▉
HYTHE FOLKESTONE AND HYTHE 130 1023980 ████████████████████▍
CHALFONT ST GILES CHILTERN 113 835754 ████████████████▋
AMERSHAM BUCKINGHAMSHIRE 113 799596 ███████████████▉
VIRGINIA WATER RUNNYMEDE 356 789301 ███████████████▊
BARNET ENFIELD 282 740514 ██████████████▊
NORTHWOOD THREE RIVERS 184 731609 ██████████████▋
```
:::tip
When you are ready to insert your files into ClickHouse, start up a ClickHouse server and insert the results of your `file` and `s3` table functions into a `MergeTree` table. View the [Quick Start](../../quick-start.mdx) for more details.
:::
## Usage {#usage}
By default `clickhouse-local` has access to data of a ClickHouse server on the same host, and it does not depend on the server's configuration. It also supports loading server configuration using the `--config-file` argument. For temporary data, a unique temporary data directory is created by default.
Basic usage (Linux):
``` bash
@ -24,7 +170,9 @@ Basic usage (Mac):
$ ./clickhouse local --structure "table_structure" --input-format "format_of_incoming_data" --query "query"
```
:::note
`clickhouse-local` is also supported on Windows through WSL2.
:::
Arguments:

View File

@ -0,0 +1,76 @@
---
slug: /en/sql-reference/aggregate-functions/reference/quantileApprox
sidebar_position: 204
---
# quantileApprox
Computes the [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence using the [Greenwald-Khanna](http://infolab.stanford.edu/~datar/courses/cs361a/papers/quantiles.pdf) algorithm. The Greenwald-Khanna algorithm is an algorithm used to compute quantiles on a stream of data in a highly efficient manner. It was introduced by Michael Greenwald and Sanjeev Khanna in 2001. It is widely used in databases and big data systems where computing accurate quantiles on a large stream of data in real-time is necessary. The algorithm is highly efficient, taking only O(log n) space and O(log log n) time per item (where n is the size of the input). It is also highly accurate, providing an approximate quantile value with high probability.
`quantileApprox` differs from other quantile functions in ClickHouse in that it enables the user to control the accuracy of the approximate quantile result.
**Syntax**
``` sql
quantileApprox(accuracy, level)(expr)
```
Alias: `medianApprox`.
**Arguments**
- `accuracy` — Accuracy of quantile. Constant positive integer. Larger accuracy value means less error. For example, if the accuracy argument is set to 100, the computed quantile will have an error no greater than 1% with high probability. There is a trade-off between the accuracy of the computed quantiles and the computational complexity of the algorithm. A larger accuracy requires more memory and computational resources to compute the quantile accurately, while a smaller accuracy argument allows for a faster and more memory-efficient computation but with a slightly lower accuracy.
- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md).
**Returned value**
- Quantile of the specified level and accuracy.
Type:
- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input.
- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type.
- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type.
**Example**
``` sql
SELECT quantileApprox(1, 0.25)(number + 1)
FROM numbers(1000)
┌─quantileApprox(1, 0.25)(plus(number, 1))─┐
│ 1 │
└──────────────────────────────────────────┘
SELECT quantileApprox(10, 0.25)(number + 1)
FROM numbers(1000)
┌─quantileApprox(10, 0.25)(plus(number, 1))─┐
│ 156 │
└───────────────────────────────────────────┘
SELECT quantileApprox(100, 0.25)(number + 1)
FROM numbers(1000)
┌─quantileApprox(100, 0.25)(plus(number, 1))─┐
│ 251 │
└────────────────────────────────────────────┘
SELECT quantileApprox(1000, 0.25)(number + 1)
FROM numbers(1000)
┌─quantileApprox(1000, 0.25)(plus(number, 1))─┐
│ 249 │
└─────────────────────────────────────────────┘
```
**See Also**
- [median](../../../sql-reference/aggregate-functions/reference/median.md#median)
- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles)

View File

@ -114,3 +114,59 @@ Result:
│ [249.75,499.5,749.25,899.1,949.05,989.01,998.001] │
└─────────────────────────────────────────────────────────────────────┘
```
## quantilesApprox
`quantilesApprox` works similarly to `quantileApprox` but allows calculating quantiles at different levels simultaneously and returns an array.
**Syntax**
``` sql
quantilesApprox(accuracy, level1, level2, ...)(expr)
```
**Returned value**
- [Array](../../../sql-reference/data-types/array.md) of quantiles of the specified levels.
Type of array values:
- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input.
- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type.
- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type.
**Example**
Query:
``` sql
SELECT quantilesApprox(1, 0.25, 0.5, 0.75)(number + 1)
FROM numbers(1000)
┌─quantilesApprox(1, 0.25, 0.5, 0.75)(plus(number, 1))─┐
│ [1,1,1] │
└──────────────────────────────────────────────────────┘
SELECT quantilesApprox(10, 0.25, 0.5, 0.75)(number + 1)
FROM numbers(1000)
┌─quantilesApprox(10, 0.25, 0.5, 0.75)(plus(number, 1))─┐
│ [156,413,659] │
└───────────────────────────────────────────────────────┘
SELECT quantilesApprox(100, 0.25, 0.5, 0.75)(number + 1)
FROM numbers(1000)
┌─quantilesApprox(100, 0.25, 0.5, 0.75)(plus(number, 1))─┐
│ [251,498,741] │
└────────────────────────────────────────────────────────┘
SELECT quantilesApprox(1000, 0.25, 0.5, 0.75)(number + 1)
FROM numbers(1000)
┌─quantilesApprox(1000, 0.25, 0.5, 0.75)(plus(number, 1))─┐
│ [249,499,749] │
└─────────────────────────────────────────────────────────┘
```

View File

@ -1276,16 +1276,16 @@ Using replacement fields, you can define a pattern for the resulting string. “
| %k | hour in 24h format (00-23) | 22 |
| %l | hour in 12h format (01-12) | 09 |
| %m | month as an integer number (01-12) | 01 |
| %M | full month name (January-December), see (*) below | January |
| %n | new-line character ('\n') | |
| %p | AM or PM designation | PM |
| %Q | Quarter (1-4) | 1 |
| %r | 12-hour HH:MM AM/PM time, equivalent to %H:%i %p | 10:30 PM |
| %R | 24-hour HH:MM time, equivalent to %H:%i | 22:33 |
| %s | second (00-59) | 44 |
| %S | second (00-59) | 44 |
| %t | horizontal-tab character ('\t') | |
| %T | ISO 8601 time format (HH:MM:SS), equivalent to %H:%i:%S | 22:33:44 |
| %u | ISO 8601 weekday as number with Monday as 1 (1-7) | 2 |
| %V | ISO 8601 week number (01-53) | 01 |
| %w | weekday as an integer number with Sunday as 0 (0-6) | 2 |
@ -1295,6 +1295,8 @@ Using replacement fields, you can define a pattern for the resulting string. “
| %z | Time offset from UTC as +HHMM or -HHMM | -0500 |
| %% | a % sign | % |
(*) In ClickHouse versions earlier than v23.4, `%M` prints the minute (00-59) instead of the full month name (January-December). The previous behavior can be restored using setting `formatdatetime_parsedatetime_m_is_month_name = 0`.
**Example**
Query:
@ -1463,28 +1465,28 @@ Result:
└───────────────────────┘
```
## fromUnixTimestamp
Converts a Unix timestamp to a calendar date and a time of day. When there is only a single argument of [Integer](../../sql-reference/data-types/int-uint.md) type, it acts in the same way as [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime) and returns the [DateTime](../../sql-reference/data-types/datetime.md) type.
fromUnixTimestamp uses the MySQL datetime format style; refer to https://dev.mysql.com/doc/refman/8.0/en/date-and-time-functions.html#function_date-format.
Alias: `FROM_UNIXTIME`.
**Example:**
Query:
```sql
SELECT fromUnixTimestamp(423543535);
```
Result:
```text
┌─fromUnixTimestamp(423543535)─┐
│          1983-06-04 10:58:55 │
└──────────────────────────────┘
```
When there are two or three arguments, the first an [Integer](../../sql-reference/data-types/int-uint.md), [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md), the second a constant format string and the third an optional constant time zone string — it acts in the same way as [formatDateTime](#formatdatetime) and returns [String](../../sql-reference/data-types/string.md#string) type.
@ -1492,7 +1494,7 @@ When there are two or three arguments, the first an [Integer](../../sql-referenc
For example:
```sql
SELECT fromUnixTimestamp(1234334543, '%Y-%m-%d %R:%S') AS DateTime;
```
```text
@ -1505,11 +1507,12 @@ SELECT FROM_UNIXTIME(1234334543, '%Y-%m-%d %R:%S') AS DateTime;
- [fromUnixTimestampInJodaSyntax](#fromunixtimestampinjodasyntax)
## fromUnixTimestampInJodaSyntax
Similar to fromUnixTimestamp, except that it formats time in Joda style instead of MySQL style. Refer to https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html.
**Example:**
Query:
``` sql
SELECT fromUnixTimestampInJodaSyntax(1669804872, 'yyyy-MM-dd HH:mm:ss', 'UTC');
@ -1517,12 +1520,11 @@ SELECT fromUnixTimestampInJodaSyntax(1669804872, 'yyyy-MM-dd HH:mm:ss', 'UTC');
Result:
```
┌─fromUnixTimestampInJodaSyntax(1669804872, 'yyyy-MM-dd HH:mm:ss', 'UTC')─┐
│ 2022-11-30 10:41:12                                                     │
└─────────────────────────────────────────────────────────────────────────┘
```
## toModifiedJulianDay
Converts a [Proleptic Gregorian calendar](https://en.wikipedia.org/wiki/Proleptic_Gregorian_calendar) date in text form `YYYY-MM-DD` to a [Modified Julian Day](https://en.wikipedia.org/wiki/Julian_day#Variants) number in Int32. This function supports dates from `0000-01-01` to `9999-12-31`. It raises an exception if the argument cannot be parsed as a date, or if the date is invalid.
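For example:

``` sql
SELECT toModifiedJulianDay('2020-01-01');
```

``` text
┌─toModifiedJulianDay('2020-01-01')─┐
│                             58849 │
└───────────────────────────────────┘
```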

View File

@ -401,7 +401,7 @@ Before version 21.11 the order of arguments was wrong, i.e. JSON_QUERY(path, jso
Parses a JSON string and extracts a value as a JSON scalar.
If the value does not exist, an empty string is returned by default. With SET `function_json_value_return_type_allow_nullable` = `true`, `NULL` is returned instead. If the value is a complex type (such as struct, array, or map), an empty string is returned by default; with SET `function_json_value_return_type_allow_complex` = `true`, the complex value is returned.
Example:
@ -410,6 +410,8 @@ SELECT JSON_VALUE('{"hello":"world"}', '$.hello');
SELECT JSON_VALUE('{"array":[[0, 1, 2, 3, 4, 5], [0, -1, -2, -3, -4, -5]]}', '$.array[*][0 to 2, 4]');
SELECT JSON_VALUE('{"hello":2}', '$.hello');
SELECT toTypeName(JSON_VALUE('{"hello":2}', '$.hello'));
select JSON_VALUE('{"hello":"world"}', '$.b') settings function_json_value_return_type_allow_nullable=true;
select JSON_VALUE('{"hello":{"world":"!"}}', '$.hello') settings function_json_value_return_type_allow_complex=true;
```
Result:

View File

@ -13,17 +13,18 @@ Functions for [searching](../../sql-reference/functions/string-search-functions.
## replaceOne(haystack, pattern, replacement)
Replaces the first occurrence of the substring pattern (if it exists) in haystack by the replacement string.
pattern and replacement must be constants.
## replaceAll(haystack, pattern, replacement), replace(haystack, pattern, replacement)
Replaces all occurrences of the substring pattern in haystack by the replacement string.
Alias: `replace`.
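A short illustration of the difference (the input strings are arbitrary):

```sql
SELECT
    replaceOne('Hello, Hello', 'Hello', 'Goodbye') AS first_only,  -- 'Goodbye, Hello'
    replaceAll('Hello, Hello', 'Hello', 'Goodbye') AS all_matches; -- 'Goodbye, Goodbye'
```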
## replaceRegexpOne(haystack, pattern, replacement)
Replaces the first occurrence of the substring matching the regular expression pattern in haystack by the replacement string.
pattern must be a [re2 regular expression](https://github.com/google/re2/wiki/Syntax).
replacement must be a plain string or a string containing substitutions `\0-\9`.
Substitutions `\1-\9` correspond to the 1st to 9th capturing group (submatch), substitution `\0` corresponds to the entire match.
To use a verbatim `\` character in the pattern or replacement string, escape it using `\`.
Also keep in mind that string literals require extra escaping.
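For example, a small sketch that reorders date components using capturing groups; note the doubled backslashes required by string-literal escaping:

```sql
-- Rearranges '2023-04-07' into '07/04/2023' via substitutions \3, \2 and \1.
SELECT replaceRegexpOne('2023-04-07', '(\\d{4})-(\\d{2})-(\\d{2})', '\\3/\\2/\\1') AS res;
```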
@ -88,6 +89,8 @@ SELECT replaceRegexpAll('Hello, World!', '^', 'here: ') AS res
└─────────────────────┘
```
Alias: `REGEXP_REPLACE`.
## regexpQuoteMeta(s)
The function adds a backslash before some predefined characters in the string.

View File

@ -208,7 +208,7 @@ Type: [Array](../../sql-reference/data-types/array.md)([Tuple](../../sql-referen
Query:
``` sql
CREATE TABLE tupletest (col Tuple(user_ID UInt64, session_ID UInt64)) ENGINE = Memory;
INSERT INTO tupletest VALUES (tuple( 100, 2502)), (tuple(1,100));
@ -227,11 +227,11 @@ Result:
It is possible to transform columns to rows using this function:
``` sql
CREATE TABLE tupletest (col Tuple(CPU Float64, Memory Float64, Disk Float64)) ENGINE = Memory;
INSERT INTO tupletest VALUES(tuple(3.3, 5.5, 6.6));
SELECT arrayJoin(tupleToNameValuePairs(col)) FROM tupletest;
```
Result:

View File

@ -68,9 +68,9 @@ Result:
## mapFromArrays
Merges an [Array](../../sql-reference/data-types/array.md) of keys and an [Array](../../sql-reference/data-types/array.md) of values into a [Map(key, value)](../../sql-reference/data-types/map.md). Note that the second argument can also be a [Map](../../sql-reference/data-types/map.md); in that case it is cast to an Array during execution.
The function is a more convenient alternative to `CAST((key_array, value_array_or_map), 'Map(key_type, value_type)')`. For example, instead of writing `CAST((['aa', 'bb'], [4, 5]), 'Map(String, UInt32)')`, you can write `mapFromArrays(['aa', 'bb'], [4, 5])`.
**Syntax**
@ -82,11 +82,11 @@ Alias: `MAP_FROM_ARRAYS(keys, values)`
**Arguments**
- `keys` — Given key array to create a map from. The nested type of array must be: [String](../../sql-reference/data-types/string.md), [Integer](../../sql-reference/data-types/int-uint.md), [LowCardinality](../../sql-reference/data-types/lowcardinality.md), [FixedString](../../sql-reference/data-types/fixedstring.md), [UUID](../../sql-reference/data-types/uuid.md), [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md), [Date32](../../sql-reference/data-types/date32.md), [Enum](../../sql-reference/data-types/enum.md)
- `values` - Given value array or map to create a map from.
**Returned value**
- A map whose keys and values are constructed from the key array and value array/map.
**Example**
@ -94,13 +94,17 @@ Query:
```sql
select mapFromArrays(['a', 'b', 'c'], [1, 2, 3])
```
```text
┌─mapFromArrays(['a', 'b', 'c'], [1, 2, 3])─┐
│ {'a':1,'b':2,'c':3} │
└───────────────────────────────────────────┘
```
```sql
SELECT mapFromArrays([1, 2, 3], map('a', 1, 'b', 2, 'c', 3))
```

```text
┌─mapFromArrays([1, 2, 3], map('a', 1, 'b', 2, 'c', 3))─┐
│ {1:('a',1),2:('b',2),3:('c',3)} │
└───────────────────────────────────────────────────────┘
```
## mapAdd

View File

@ -22,6 +22,10 @@ DROP DATABASE [IF EXISTS] db [ON CLUSTER cluster] [SYNC]
Deletes the table.
:::tip
Also see [UNDROP TABLE](/docs/en/sql-reference/statements/undrop.md)
:::
Syntax:
``` sql

View File

@ -114,11 +114,11 @@ This will also create system tables even if message queue is empty.
## RELOAD CONFIG
Reloads ClickHouse configuration. Used when the configuration is stored in ZooKeeper. Note that `SYSTEM RELOAD CONFIG` does not reload `USER` configuration stored in ZooKeeper; it only reloads `USER` configuration that is stored in `users.xml`. To reload all `USER` configuration, use `SYSTEM RELOAD USERS`.
## RELOAD USERS
Reloads all access storages, including: users.xml, local disk access storage, replicated (in ZooKeeper) access storage.
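For example:

``` sql
-- Reload the server configuration (does not pick up USER changes stored in ZooKeeper):
SYSTEM RELOAD CONFIG;
-- Reload all access storages, including the replicated (ZooKeeper) one:
SYSTEM RELOAD USERS;
```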
## SHUTDOWN
@ -224,6 +224,14 @@ Clears freezed backup with the specified name from all the disks. See more about
SYSTEM UNFREEZE WITH NAME <backup_name>
```
### WAIT LOADING PARTS
Waits until all asynchronously loading data parts of a table (outdated data parts) become loaded.
``` sql
SYSTEM WAIT LOADING PARTS [db.]merge_tree_family_table_name
```
## Managing ReplicatedMergeTree Tables
ClickHouse can manage background replication related processes in [ReplicatedMergeTree](../../engines/table-engines/mergetree-family/replication.md#table_engines-replication) tables.
@ -283,10 +291,14 @@ SYSTEM START REPLICATION QUEUES [[db.]replicated_merge_tree_family_table_name]
Waits until a `ReplicatedMergeTree` table is synced with the other replicas in a cluster, but no more than `receive_timeout` seconds.
``` sql
SYSTEM SYNC REPLICA [ON CLUSTER cluster_name] [db.]replicated_merge_tree_family_table_name [STRICT | LIGHTWEIGHT | PULL]
```
After running this statement, the `[db.]replicated_merge_tree_family_table_name` fetches commands from the common replicated log into its own replication queue, and then the query waits until the replica processes all of the fetched commands. The following modifiers are supported (see the example after this list):
- If a `STRICT` modifier was specified then the query waits for the replication queue to become empty. The `STRICT` version may never succeed if new entries constantly appear in the replication queue.
- If a `LIGHTWEIGHT` modifier was specified then the query waits only for `GET_PART`, `ATTACH_PART`, `DROP_RANGE`, `REPLACE_RANGE` and `DROP_PART` entries to be processed.
- If a `PULL` modifier was specified then the query pulls new replication queue entries from ZooKeeper, but does not wait for anything to be processed.
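For example, a lightweight sync of a hypothetical table `db.hits`:

``` sql
SYSTEM SYNC REPLICA db.hits LIGHTWEIGHT;
```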
### RESTART REPLICA

View File

@ -0,0 +1,99 @@
---
slug: /en/sql-reference/statements/undrop
sidebar_label: UNDROP
---
# UNDROP TABLE
Cancels the dropping of the table.
Beginning with ClickHouse version 23.3, it is possible to UNDROP a table in an Atomic database
within `database_atomic_delay_before_drop_table_sec` (8 minutes by default) of issuing the DROP TABLE statement. Dropped tables are listed in
a system table called `system.dropped_tables`.
If you have a materialized view without a `TO` clause associated with the dropped table, then you will also have to UNDROP the inner table of that view.
:::note
UNDROP TABLE is experimental. To use it, add this setting:
```sql
set allow_experimental_undrop_table_query = 1;
```
:::
:::tip
Also see [DROP TABLE](/docs/en/sql-reference/statements/drop.md)
:::
Syntax:
``` sql
UNDROP TABLE [db.]name [UUID '<uuid>'] [ON CLUSTER cluster]
```
**Example**
``` sql
set allow_experimental_undrop_table_query = 1;
```
```sql
CREATE TABLE undropMe
(
`id` UInt8
)
ENGINE = MergeTree
ORDER BY id
```
```sql
DROP TABLE undropMe
```
```sql
SELECT *
FROM system.dropped_tables
FORMAT Vertical
```
```response
Row 1:
──────
index: 0
database: default
table: undropMe
uuid: aa696a1a-1d70-4e60-a841-4c80827706cc
engine: MergeTree
metadata_dropped_path: /var/lib/clickhouse/metadata_dropped/default.undropMe.aa696a1a-1d70-4e60-a841-4c80827706cc.sql
table_dropped_time: 2023-04-05 14:12:12
1 row in set. Elapsed: 0.001 sec.
```
```sql
UNDROP TABLE undropMe
```
```response
Ok.
```
```sql
SELECT *
FROM system.dropped_tables
FORMAT Vertical
```
```response
Ok.
0 rows in set. Elapsed: 0.001 sec.
```
```sql
DESCRIBE TABLE undropMe
FORMAT Vertical
```
```response
Row 1:
──────
name: id
type: UInt8
default_type:
default_expression:
comment:
codec_expression:
ttl_expression:
```

View File

@ -272,10 +272,14 @@ SYSTEM START REPLICATION QUEUES [[db.]replicated_merge_tree_family_table_name]
Waits until a `ReplicatedMergeTree`-family table is synced with the other replicas in the cluster, but no more than `receive_timeout` seconds:
``` sql
SYSTEM SYNC REPLICA [db.]replicated_merge_tree_family_table_name [STRICT | LIGHTWEIGHT | PULL]
```
After running this statement, the `[db.]replicated_merge_tree_family_table_name` table loads commands from the common replicated log into its own replication queue, and then the query waits until the replica processes all of the loaded commands. The following modifiers are supported:
- If the `STRICT` modifier is specified, the query waits until the replication queue becomes empty. The strict version may never finish successfully if new entries constantly appear in the replication queue.
- If the `LIGHTWEIGHT` modifier is specified, the query waits only until the `GET_PART`, `ATTACH_PART`, `DROP_RANGE`, `REPLACE_RANGE` and `DROP_PART` entries are processed.
- If the `PULL` modifier is specified, the query only pulls replication queue entries from ZooKeeper and does not wait for anything to be processed.
### RESTART REPLICA {#query_language-system-restart-replica}

View File

@ -240,7 +240,7 @@ SYSTEM START REPLICATION QUEUES [[db.]replicated_merge_tree_family_table_name]
``` sql
SYSTEM SYNC REPLICA [db.]replicated_merge_tree_family_table_name [STRICT | LIGHTWEIGHT | PULL]
```
### RESTART REPLICA {#query_language-system-restart-replica}

View File

@ -277,11 +277,11 @@ void Client::initialize(Poco::Util::Application & self)
*/
const char * env_user = getenv("CLICKHOUSE_USER"); // NOLINT(concurrency-mt-unsafe)
if (env_user && !config().has("user"))
config().setString("user", env_user);
const char * env_password = getenv("CLICKHOUSE_PASSWORD"); // NOLINT(concurrency-mt-unsafe)
if (env_password && !config().has("password"))
config().setString("password", env_password);
parseConnectionsCredentials();

View File

@ -1,4 +1,5 @@
#include "ClusterCopierApp.h"
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Common/StatusFile.h>
#include <Common/TerminalSize.h>
#include <IO/ConnectionTimeouts.h>
@ -192,6 +193,8 @@ void ClusterCopierApp::mainImpl()
if (!task_file.empty())
copier->uploadTaskDescription(task_path, task_file, config().getBool("task-upload-force", false));
zkutil::validateZooKeeperConfig(config());
copier->init();
copier->process(ConnectionTimeouts::getTCPTimeoutsWithoutFailover(context->getSettingsRef()));

View File

@ -89,8 +89,12 @@ static std::vector<std::string> extractFromConfig(
if (has_zk_includes && process_zk_includes)
{
DB::ConfigurationPtr bootstrap_configuration(new Poco::Util::XMLConfiguration(config_xml));
zkutil::validateZooKeeperConfig(*bootstrap_configuration);
zkutil::ZooKeeperPtr zookeeper = std::make_shared<zkutil::ZooKeeper>(
*bootstrap_configuration, bootstrap_configuration->has("zookeeper") ? "zookeeper" : "keeper", nullptr);
zkutil::ZooKeeperNodeCache zk_node_cache([&] { return zookeeper; });
config_xml = processor.processConfig(&has_zk_includes, &zk_node_cache);
}

View File

@ -87,7 +87,7 @@ void MetricsTransmitter::transmit(std::vector<ProfileEvents::Count> & prev_count
if (send_events)
{
for (ProfileEvents::Event i = ProfileEvents::Event(0), end = ProfileEvents::end(); i < end; ++i)
{
const auto counter = ProfileEvents::global_counters[i].load(std::memory_order_relaxed);
const auto counter_increment = counter - prev_counters[i];
@ -100,7 +100,7 @@ void MetricsTransmitter::transmit(std::vector<ProfileEvents::Count> & prev_count
if (send_events_cumulative)
{
for (ProfileEvents::Event i = ProfileEvents::Event(0), end = ProfileEvents::end(); i < end; ++i)
{
const auto counter = ProfileEvents::global_counters[i].load(std::memory_order_relaxed);
std::string key{ProfileEvents::getName(static_cast<ProfileEvents::Event>(i))};
@ -110,7 +110,7 @@ void MetricsTransmitter::transmit(std::vector<ProfileEvents::Count> & prev_count
if (send_metrics)
{
for (CurrentMetrics::Metric i = CurrentMetrics::Metric(0), end = CurrentMetrics::end(); i < end; ++i)
{
const auto value = CurrentMetrics::values[i].load(std::memory_order_relaxed);

View File

@ -815,7 +815,8 @@ try
}
);
zkutil::validateZooKeeperConfig(config());
bool has_zookeeper = zkutil::hasZooKeeperConfig(config());
zkutil::ZooKeeperNodeCache main_config_zk_node_cache([&] { return global_context->getZooKeeper(); });
zkutil::EventPtr main_config_zk_changed_event = std::make_shared<Poco::Event>();
@ -1307,7 +1308,7 @@ try
{
/// We do not load ZooKeeper configuration on the first config loading
/// because TestKeeper server is not started yet.
if (zkutil::hasZooKeeperConfig(*config))
global_context->reloadZooKeeperIfChanged(config);
global_context->reloadAuxiliaryZooKeepersConfigIfChanged(config);

View File

@ -288,7 +288,8 @@ public:
readVarUInt(size, buf);
if (unlikely(size > AGGREGATE_FUNCTION_GROUP_ARRAY_MAX_ARRAY_SIZE))
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too large array size (maximum: {})", AGGREGATE_FUNCTION_GROUP_ARRAY_MAX_ARRAY_SIZE);
if (limit_num_elems && unlikely(size > max_elems))
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size, it should not exceed {}", max_elems);
@ -367,7 +368,8 @@ struct GroupArrayNodeBase
UInt64 size;
readVarUInt(size, buf);
if (unlikely(size > AGGREGATE_FUNCTION_GROUP_ARRAY_MAX_ARRAY_SIZE))
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too large array size (maximum: {})", AGGREGATE_FUNCTION_GROUP_ARRAY_MAX_ARRAY_SIZE);
Node * node = reinterpret_cast<Node *>(arena->alignedAlloc(sizeof(Node) + size, alignof(Node)));
node->size = size;
@ -621,7 +623,8 @@ public:
return;
if (unlikely(elems > AGGREGATE_FUNCTION_GROUP_ARRAY_MAX_ARRAY_SIZE))
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too large array size (maximum: {})", AGGREGATE_FUNCTION_GROUP_ARRAY_MAX_ARRAY_SIZE);
if (limit_num_elems && unlikely(elems > max_elems))
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size, it should not exceed {}", max_elems);

View File

@ -79,7 +79,8 @@ public:
{
length_to_resize = applyVisitor(FieldVisitorConvertToNumber<UInt64>(), params[1]);
if (length_to_resize > AGGREGATE_FUNCTION_GROUP_ARRAY_INSERT_AT_MAX_SIZE)
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too large array size (maximum: {})", AGGREGATE_FUNCTION_GROUP_ARRAY_INSERT_AT_MAX_SIZE);
}
}
@ -167,7 +168,8 @@ public:
readVarUInt(size, buf);
if (size > AGGREGATE_FUNCTION_GROUP_ARRAY_INSERT_AT_MAX_SIZE)
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too large array size (maximum: {})", AGGREGATE_FUNCTION_GROUP_ARRAY_INSERT_AT_MAX_SIZE);
Array & arr = data(place).value;

View File

@ -144,7 +144,8 @@ public:
readVarUInt(size, buf);
if (unlikely(size > AGGREGATE_FUNCTION_MOVING_MAX_ARRAY_SIZE))
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too large array size (maximum: {})", AGGREGATE_FUNCTION_MOVING_MAX_ARRAY_SIZE);
if (size > 0)
{

View File

@ -127,7 +127,8 @@ public:
if (size == 0)
throw Exception(ErrorCodes::INCORRECT_DATA, "Incorrect size (0) in groupBitmap.");
if (size > max_size)
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too large array size in groupBitmap (maximum: {})", max_size);
/// TODO: this is unnecessary copying - it will be better to read and deserialize in one pass.
std::unique_ptr<char[]> buf(new char[size]);

View File

@ -294,7 +294,8 @@ public:
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too many bins");
static constexpr size_t max_size = 1_GiB;
if (size > max_size)
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too large array size in histogram (maximum: {})", max_size);
buf.readStrict(reinterpret_cast<char *>(points), size * sizeof(WeightedValue));
}

View File

@ -117,7 +117,7 @@ struct AggregateFunctionIntervalLengthSumData
readBinary(size, buf);
if (unlikely(size > MAX_ARRAY_SIZE))
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size (maximum: {})", MAX_ARRAY_SIZE);
segments.clear();
segments.reserve(size);

View File

@ -0,0 +1,36 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionKolmogorovSmirnovTest.h>
#include <AggregateFunctions/FactoryHelpers.h>
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
namespace DB
{
struct Settings;
namespace
{
AggregateFunctionPtr createAggregateFunctionKolmogorovSmirnovTest(
const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
{
assertBinary(name, argument_types);
if (!isNumber(argument_types[0]) || !isNumber(argument_types[1]))
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Aggregate function {} only supports numerical types", name);
return std::make_shared<AggregateFunctionKolmogorovSmirnov>(argument_types, parameters);
}
}
void registerAggregateFunctionKolmogorovSmirnovTest(AggregateFunctionFactory & factory)
{
factory.registerFunction("kolmogorovSmirnovTest", createAggregateFunctionKolmogorovSmirnovTest, AggregateFunctionFactory::CaseInsensitive);
}
}

View File

@ -0,0 +1,323 @@
#pragma once
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/Exception.h>
#include <Common/assert_cast.h>
#include <Common/ArenaAllocator.h>
#include <Common/PODArray_fwd.h>
#include <base/types.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int BAD_ARGUMENTS;
}
struct KolmogorovSmirnov : public StatisticalSample<Float64, Float64>
{
enum class Alternative
{
TwoSided,
Less,
Greater
};
std::pair<Float64, Float64> getResult(Alternative alternative, String method)
{
::sort(x.begin(), x.end());
::sort(y.begin(), y.end());
Float64 max_s = std::numeric_limits<Float64>::min();
Float64 min_s = std::numeric_limits<Float64>::max();
Float64 now_s = 0;
UInt64 pos_x = 0;
UInt64 pos_y = 0;
UInt64 n1 = x.size();
UInt64 n2 = y.size();
const Float64 n1_d = 1. / n1;
const Float64 n2_d = 1. / n2;
const Float64 tol = 1e-7;
// reference: https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test
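// Merge the two sorted samples, tracking the signed difference between their
// empirical CDFs; its running extrema give the one-sided KS statistics.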
while (pos_x < x.size() && pos_y < y.size())
{
if (likely(fabs(x[pos_x] - y[pos_y]) >= tol))
{
if (x[pos_x] < y[pos_y])
{
now_s += n1_d;
++pos_x;
}
else
{
now_s -= n2_d;
++pos_y;
}
max_s = std::max(max_s, now_s);
min_s = std::min(min_s, now_s);
}
else
{
now_s += n1_d;
++pos_x;
}
}
now_s += n1_d * (x.size() - pos_x) - n2_d * (y.size() - pos_y);
min_s = std::min(min_s, now_s);
max_s = std::max(max_s, now_s);
Float64 d = 0;
if (alternative == Alternative::TwoSided)
d = std::max(std::abs(max_s), std::abs(min_s));
else if (alternative == Alternative::Less)
d = -min_s;
else if (alternative == Alternative::Greater)
d = max_s;
UInt64 g = std::__gcd(n1, n2);
UInt64 nx_g = n1 / g;
UInt64 ny_g = n2 / g;
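// Choose how the p-value is computed: the exact lattice method is feasible only
// for moderate sample sizes, so fall back to the asymptotic formula when the
// (n1/g) x (n2/g) lattice size would overflow a 32-bit signed integer.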
if (method == "auto")
method = std::max(n1, n2) <= 10000 ? "exact" : "asymp";
else if (method == "exact" && nx_g >= std::numeric_limits<Int32>::max() / ny_g)
method = "asymp";
Float64 p_value = std::numeric_limits<Float64>::infinity();
if (method == "exact")
{
/* reference:
* Gunar Schröer and Dietrich Trenkler
* Exact and Randomization Distributions of Kolmogorov-Smirnov, Tests for Two or Three Samples
*
* and
*
* Thomas Viehmann
* Numerically more stable computation of the p-values for the two-sample Kolmogorov-Smirnov test
*/
if (n2 > n1)
std::swap(n1, n2);
const Float64 f_n1 = static_cast<Float64>(n1);
const Float64 f_n2 = static_cast<Float64>(n2);
const Float64 k_d = (0.5 + floor(d * f_n2 * f_n1 - tol)) / (f_n2 * f_n1);
PaddedPODArray<Float64> c(n1 + 1);
auto check = alternative == Alternative::TwoSided ?
[](const Float64 & q, const Float64 & r, const Float64 & s) { return fabs(r - s) >= q; }
: [](const Float64 & q, const Float64 & r, const Float64 & s) { return r - s >= q; };
c[0] = 0;
for (UInt64 j = 1; j <= n1; j++)
if (check(k_d, 0., j / f_n1))
c[j] = 1.;
else
c[j] = c[j - 1];
for (UInt64 i = 1; i <= n2; i++)
{
if (check(k_d, i / f_n2, 0.))
c[0] = 1.;
for (UInt64 j = 1; j <= n1; j++)
if (check(k_d, i / f_n2, j / f_n1))
c[j] = 1.;
else
{
Float64 v = i / static_cast<Float64>(i + j);
Float64 w = j / static_cast<Float64>(i + j);
c[j] = v * c[j] + w * c[j - 1];
}
}
p_value = c[n1];
}
else if (method == "asymp")
{
Float64 n = std::min(n1, n2);
Float64 m = std::max(n1, n2);
Float64 p = sqrt((n * m) / (n + m)) * d;
if (alternative == Alternative::TwoSided)
{
/* reference:
* J.DURBIN
* Distribution theory for tests based on the sample distribution function
*/
Float64 new_val, old_val, s, w, z;
UInt64 k_max = static_cast<UInt64>(sqrt(2 - log(tol)));
if (p < 1)
{
z = - (M_PI_2 * M_PI_4) / (p * p);
w = log(p);
s = 0;
for (UInt64 k = 1; k < k_max; k += 2)
s += exp(k * k * z - w);
p = s / 0.398942280401432677939946059934;
}
else
{
z = -2 * p * p;
s = -1;
UInt64 k = 1;
old_val = 0;
new_val = 1;
while (fabs(old_val - new_val) > tol)
{
old_val = new_val;
new_val += 2 * s * exp(z * k * k);
s *= -1;
k++;
}
p = new_val;
}
p_value = 1 - p;
}
else
{
/* reference:
* J. L. HODGES, Jr
* The significance probability of the Smirnov two-sample test
*/
// Use Hodges' suggested approximation Eqn 5.3
// Requires m to be the larger of (n1, n2)
Float64 expt = -2 * p * p - 2 * p * (m + 2 * n) / sqrt(m * n * (m + n)) / 3.0;
p_value = exp(expt);
}
}
return {d, p_value};
}
};
class AggregateFunctionKolmogorovSmirnov final:
public IAggregateFunctionDataHelper<KolmogorovSmirnov, AggregateFunctionKolmogorovSmirnov>
{
private:
using Alternative = typename KolmogorovSmirnov::Alternative;
Alternative alternative = Alternative::TwoSided;
String method = "auto";
public:
explicit AggregateFunctionKolmogorovSmirnov(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<KolmogorovSmirnov, AggregateFunctionKolmogorovSmirnov> ({arguments}, {}, createResultType())
{
if (params.size() > 2)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} require two parameter or less", getName());
if (params.empty())
return;
if (params[0].getType() != Field::Types::String)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Aggregate function {} require first parameter to be a String", getName());
const auto & param = params[0].get<String>();
if (param == "two-sided")
alternative = Alternative::TwoSided;
else if (param == "less")
alternative = Alternative::Less;
else if (param == "greater")
alternative = Alternative::Greater;
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown parameter in aggregate function {}. "
"It must be one of: 'two-sided', 'less', 'greater'", getName());
if (params.size() != 2)
return;
if (params[1].getType() != Field::Types::String)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Aggregate function {} require second parameter to be a String", getName());
method = params[1].get<String>();
if (method != "auto" && method != "exact" && method != "asymp")
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown method in aggregate function {}. "
"It must be one of: 'auto', 'exact', 'asymp'", getName());
}
String getName() const override
{
return "kolmogorovSmirnovTest";
}
bool allocatesMemoryInArena() const override { return true; }
static DataTypePtr createResultType()
{
DataTypes types
{
std::make_shared<DataTypeNumber<Float64>>(),
std::make_shared<DataTypeNumber<Float64>>(),
};
Strings names
{
"d_statistic",
"p_value"
};
return std::make_shared<DataTypeTuple>(
std::move(types),
std::move(names)
);
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
Float64 value = columns[0]->getFloat64(row_num);
UInt8 is_second = columns[1]->getUInt(row_num);
if (is_second)
this->data(place).addY(value, arena);
else
this->data(place).addX(value, arena);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
this->data(place).merge(this->data(rhs), arena);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
{
this->data(place).read(buf, arena);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
if (!this->data(place).size_x || !this->data(place).size_y)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Aggregate function {} requires both samples to be non-empty", getName());
auto [d_statistic, p_value] = this->data(place).getResult(alternative, method);
/// Because p-value is a probability.
p_value = std::min(1.0, std::max(0.0, p_value));
auto & column_tuple = assert_cast<ColumnTuple &>(to);
auto & column_stat = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(0));
auto & column_value = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(1));
column_stat.getData().push_back(d_statistic);
column_value.getData().push_back(p_value);
}
};
}


@@ -140,7 +140,8 @@ public:
readVarUInt(size, buf);
if (unlikely(size > AGGREGATE_FUNCTION_MAX_INTERSECTIONS_MAX_ARRAY_SIZE))
-throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size");
+throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
+"Too large array size (maximum: {})", AGGREGATE_FUNCTION_MAX_INTERSECTIONS_MAX_ARRAY_SIZE);
auto & value = this->data(place).value;


@@ -26,9 +26,11 @@ namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int BAD_ARGUMENTS;
}
template <typename> class QuantileTiming;
template <typename> class QuantileApprox;
/** Generic aggregate function for calculation of quantiles.
@@ -60,6 +62,7 @@ private:
using ColVecType = ColumnVectorOrDecimal<Value>;
static constexpr bool returns_float = !(std::is_same_v<FloatReturnType, void>);
static constexpr bool is_quantile_approx = std::is_same_v<Data, QuantileApprox<Value>>;
static_assert(!is_decimal<Value> || !returns_float);
QuantileLevels<Float64> levels;
@@ -67,22 +70,57 @@ private:
/// Used when there are single level to get.
Float64 level = 0.5;
/// Used for the approximate version of the algorithm (Greenwald-Khanna)
ssize_t accuracy = 10000;
DataTypePtr & argument_type;
public:
AggregateFunctionQuantile(const DataTypes & argument_types_, const Array & params)
: IAggregateFunctionDataHelper<Data, AggregateFunctionQuantile<Value, Data, Name, has_second_arg, FloatReturnType, returns_many>>(
argument_types_, params, createResultType(argument_types_))
-, levels(params, returns_many)
+, levels(is_quantile_approx && !params.empty() ? Array(params.begin() + 1, params.end()) : params, returns_many)
, level(levels.levels[0])
, argument_type(this->argument_types[0])
{
if (!returns_many && levels.size() > 1)
-throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} require one parameter or less", getName());
+throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires one level parameter or less", getName());
if constexpr (is_quantile_approx)
{
if (params.empty())
throw Exception(
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires at least one param", getName());
const auto & accuracy_field = params[0];
if (!isInt64OrUInt64FieldType(accuracy_field.getType()))
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Aggregate function {} requires accuracy parameter with integer type", getName());
if (accuracy_field.getType() == Field::Types::Int64)
accuracy = accuracy_field.get<Int64>();
else
accuracy = accuracy_field.get<UInt64>();
if (accuracy <= 0)
throw Exception(
ErrorCodes::BAD_ARGUMENTS,
"Aggregate function {} requires accuracy parameter with positive value but is {}",
getName(),
accuracy);
}
}
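/// For the approximate variant the first parameter is the accuracy (the summary keeps a relative
/// rank error of 1 / accuracy, see QuantileApprox), and any remaining parameters are the quantile
/// levels; e.g. a hypothetical quantileApprox(10000, 0.9)(x) uses accuracy 10000 and level 0.9.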
String getName() const override { return Name::name; }
void create(AggregateDataPtr __restrict place) const override /// NOLINT
{
if constexpr (is_quantile_approx)
new (place) Data(accuracy);
else
new (place) Data;
}
static DataTypePtr createResultType(const DataTypes & argument_types_)
{
DataTypePtr res;
@ -257,4 +295,7 @@ struct NameQuantilesBFloat16 { static constexpr auto name = "quantilesBFloat16";
struct NameQuantileBFloat16Weighted { static constexpr auto name = "quantileBFloat16Weighted"; };
struct NameQuantilesBFloat16Weighted { static constexpr auto name = "quantilesBFloat16Weighted"; };
struct NameQuantileApprox { static constexpr auto name = "quantileApprox"; };
struct NameQuantilesApprox { static constexpr auto name = "quantilesApprox"; };
}


@@ -0,0 +1,71 @@
#include <AggregateFunctions/AggregateFunctionQuantile.h>
#include <AggregateFunctions/QuantileApprox.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/Helpers.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime.h>
#include <Core/Field.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
namespace
{
template <typename Value, bool _> using FuncQuantileApprox = AggregateFunctionQuantile<Value, QuantileApprox<Value>, NameQuantileApprox, false, void, false>;
template <typename Value, bool _> using FuncQuantilesApprox = AggregateFunctionQuantile<Value, QuantileApprox<Value>, NameQuantilesApprox, false, void, true>;
template <template <typename, bool> class Function>
AggregateFunctionPtr createAggregateFunctionQuantile(
const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *)
{
/// Second argument type check doesn't depend on the type of the first one.
Function<void, true>::assertSecondArg(argument_types);
const DataTypePtr & argument_type = argument_types[0];
WhichDataType which(argument_type);
#define DISPATCH(TYPE) \
if (which.idx == TypeIndex::TYPE) \
return std::make_shared<Function<TYPE, true>>(argument_types, params);
FOR_BASIC_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
if (which.idx == TypeIndex::Date) return std::make_shared<Function<DataTypeDate::FieldType, false>>(argument_types, params);
if (which.idx == TypeIndex::DateTime) return std::make_shared<Function<DataTypeDateTime::FieldType, false>>(argument_types, params);
if (which.idx == TypeIndex::Decimal32) return std::make_shared<Function<Decimal32, false>>(argument_types, params);
if (which.idx == TypeIndex::Decimal64) return std::make_shared<Function<Decimal64, false>>(argument_types, params);
if (which.idx == TypeIndex::Decimal128) return std::make_shared<Function<Decimal128, false>>(argument_types, params);
if (which.idx == TypeIndex::Decimal256) return std::make_shared<Function<Decimal256, false>>(argument_types, params);
if (which.idx == TypeIndex::DateTime64) return std::make_shared<Function<DateTime64, false>>(argument_types, params);
if (which.idx == TypeIndex::Int128) return std::make_shared<Function<Int128, true>>(argument_types, params);
if (which.idx == TypeIndex::UInt128) return std::make_shared<Function<UInt128, true>>(argument_types, params);
if (which.idx == TypeIndex::Int256) return std::make_shared<Function<Int256, true>>(argument_types, params);
if (which.idx == TypeIndex::UInt256) return std::make_shared<Function<UInt256, true>>(argument_types, params);
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument for aggregate function {}",
argument_type->getName(), name);
}
}
void registerAggregateFunctionsQuantileApprox(AggregateFunctionFactory & factory)
{
/// For aggregate functions returning array we cannot return NULL on empty set.
AggregateFunctionProperties properties = { .returns_default_when_only_null = true };
factory.registerFunction(NameQuantileApprox::name, createAggregateFunctionQuantile<FuncQuantileApprox>);
factory.registerFunction(NameQuantilesApprox::name, {createAggregateFunctionQuantile<FuncQuantilesApprox>, properties});
/// 'median' is an alias for 'quantile'
factory.registerAlias("medianApprox", NameQuantileApprox::name);
}
}


@@ -324,7 +324,8 @@ public:
return;
if (unlikely(size > max_node_size_deserialize))
-throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size");
+throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
+"Too large array size (maximum: {})", max_node_size_deserialize);
auto & value = data(place).value;


@@ -1,5 +1,7 @@
#pragma once
#include <base/arithmeticOverflow.h>
#include <array>
#include <string_view>
#include <DataTypes/DataTypeString.h>
@@ -43,7 +45,19 @@ struct AggregateFunctionSparkbarData
auto [it, inserted] = points.insert({x, y});
if (!inserted)
-it->getMapped() += y;
+{
+if constexpr (std::is_floating_point_v<Y>)
+{
+it->getMapped() += y;
+return it->getMapped();
+}
+else
+{
+Y res;
+bool has_overflow = common::addOverflow(it->getMapped(), y, res);
+it->getMapped() = has_overflow ? std::numeric_limits<Y>::max() : res;
+}
+}
return it->getMapped();
}
@@ -117,6 +131,7 @@ class AggregateFunctionSparkbar final
{
private:
static constexpr size_t BAR_LEVELS = 8;
const size_t width = 0;
/// Range for x specified in parameters.
@@ -126,8 +141,8 @@ private:
size_t updateFrame(ColumnString::Chars & frame, Y value) const
{
-static constexpr std::array<std::string_view, 9> bars{" ", "▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"};
-const auto & bar = (isNaN(value) || value < 1 || 8 < value) ? bars[0] : bars[static_cast<UInt8>(value)];
+static constexpr std::array<std::string_view, BAR_LEVELS + 1> bars{" ", "▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"};
+const auto & bar = (isNaN(value) || value < 1 || static_cast<Y>(BAR_LEVELS) < value) ? bars[0] : bars[static_cast<UInt8>(value)];
frame.insert(bar.begin(), bar.end());
return bar.size();
}
@@ -161,7 +176,7 @@ private:
}
PaddedPODArray<Y> histogram(width, 0);
-PaddedPODArray<UInt64> fhistogram(width, 0);
+PaddedPODArray<UInt64> count_histogram(width, 0); /// The number of points in each bucket
for (const auto & point : data.points)
{
@@ -176,22 +191,30 @@ private:
Float64 w = histogram.size();
size_t index = std::min<size_t>(static_cast<size_t>(w / delta * value), histogram.size() - 1);
-if (std::numeric_limits<Y>::max() - histogram[index] > point.getMapped())
+Y res;
+bool has_overflow = false;
+if constexpr (std::is_floating_point_v<Y>)
+res = histogram[index] + point.getMapped();
+else
+has_overflow = common::addOverflow(histogram[index], point.getMapped(), res);
+if (unlikely(has_overflow))
{
-histogram[index] += point.getMapped();
-fhistogram[index] += 1;
+/// In case of overflow, just saturate
+/// Do not count new values, because we do not know how many of them were added
+histogram[index] = std::numeric_limits<Y>::max();
}
else
{
-/// In case of overflow, just saturate
-histogram[index] = std::numeric_limits<Y>::max();
+histogram[index] = res;
+count_histogram[index] += 1;
}
}
for (size_t i = 0; i < histogram.size(); ++i)
{
-if (fhistogram[i] > 0)
-histogram[i] /= fhistogram[i];
+if (count_histogram[i] > 0)
+histogram[i] /= count_histogram[i];
}
Y y_max = 0;
@@ -209,12 +232,30 @@ private:
return;
}
+/// Scale the histogram to the range [0, BAR_LEVELS]
for (auto & y : histogram)
{
if (isNaN(y) || y <= 0)
{
y = 0;
continue;
}
+constexpr auto levels_num = static_cast<Y>(BAR_LEVELS - 1);
if constexpr (std::is_floating_point_v<Y>)
{
y = y / (y_max / levels_num) + 1;
}
else
-y = y * 7 / y_max + 1;
+{
+Y scaled;
+bool has_overflow = common::mulOverflow<Y>(y, levels_num, scaled);
+if (has_overflow)
+y = y / (y_max / levels_num) + 1;
+else
+y = scaled / y_max + 1;
+}
}
size_t sz = 0;


@@ -0,0 +1,477 @@
#pragma once
#include <cmath>
#include <base/sort.h>
#include <Common/RadixSort.h>
#include <IO/WriteBuffer.h>
#include <IO/ReadBuffer.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int NOT_IMPLEMENTED;
}
template <typename T>
class ApproxSampler
{
public:
struct Stats
{
T value; // the sampled value
Int64 g; // the minimum rank jump from the previous value's minimum rank
Int64 delta; // the maximum span of the rank
Stats() = default;
Stats(T value_, Int64 g_, Int64 delta_) : value(value_), g(g_), delta(delta_) {}
};
struct QueryResult
{
size_t index;
Int64 rank;
T value;
QueryResult(size_t index_, Int64 rank_, T value_) : index(index_), rank(rank_), value(value_) { }
};
ApproxSampler() = default;
explicit ApproxSampler(
double relative_error_,
size_t compress_threshold_ = default_compress_threshold,
size_t count_ = 0,
bool compressed_ = false)
: relative_error(relative_error_)
, compress_threshold(compress_threshold_)
, count(count_)
, compressed(compressed_)
{
sampled.reserve(compress_threshold);
backup_sampled.reserve(compress_threshold);
head_sampled.reserve(default_head_size);
}
bool isCompressed() const { return compressed; }
void setCompressed() { compressed = true; }
void insert(T x)
{
head_sampled.push_back(x);
compressed = false;
if (head_sampled.size() >= default_head_size)
{
withHeadBufferInserted();
if (sampled.size() >= compress_threshold)
compress();
}
}
void query(const Float64 * percentiles, const size_t * indices, size_t size, T * result) const
{
if (!head_sampled.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot operate on an uncompressed summary, call compress() first");
if (sampled.empty())
{
for (size_t i = 0; i < size; ++i)
result[i] = T();
return;
}
Int64 current_max = std::numeric_limits<Int64>::min();
for (const auto & stats : sampled)
current_max = std::max(stats.delta + stats.g, current_max);
Int64 target_error = current_max / 2;
size_t index = 0;
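/// For a compressed summary max(g + delta) <= 2 * eps * count, so allowing half of that bound as
/// the rank error makes some sample always acceptable for every queried rank.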
auto min_rank = sampled[0].g;
for (size_t i = 0; i < size; ++i)
{
double percentile = percentiles[indices[i]];
if (percentile <= relative_error)
{
result[indices[i]] = sampled.front().value;
}
else if (percentile >= 1 - relative_error)
{
result[indices[i]] = sampled.back().value;
}
else
{
QueryResult res = findApproxQuantile(index, min_rank, target_error, percentile);
index = res.index;
min_rank = res.rank;
result[indices[i]] = res.value;
}
}
}
void compress()
{
if (compressed)
return;
withHeadBufferInserted();
doCompress(2 * relative_error * count);
compressed = true;
}
void merge(const ApproxSampler & other)
{
if (other.count == 0)
return;
else if (count == 0)
{
compress_threshold = other.compress_threshold;
relative_error = other.relative_error;
count = other.count;
compressed = other.compressed;
sampled.resize(other.sampled.size());
memcpy(sampled.data(), other.sampled.data(), sizeof(Stats) * other.sampled.size());
return;
}
else
{
// Merge the two buffers.
// The GK algorithm is a bit unclear about it, but we need to adjust the statistics during the
// merging. The main idea is that samples that come from one side will suffer from the lack of
// precision of the other.
// As a concrete example, take two QuantileSummaries whose samples (value, g, delta) are:
// `a = [(0, 1, 0), (20, 99, 0)]` and `b = [(10, 1, 0), (30, 49, 0)]`
// This means `a` has 100 values, whose minimum is 0 and maximum is 20,
// while `b` has 50 values, between 10 and 30.
// The resulting samples of the merge will be:
// a+b = [(0, 1, 0), (10, 1, ??), (20, 99, ??), (30, 49, 0)]
// The values of `g` do not change, as they represent the minimum number of values between two
// consecutive samples. The values of `delta` should be adjusted, however.
// Take the case of the sample `10` from `b`. In the original stream, it could have appeared
// right after `0` (as expressed by `g=1`) or right before `20`, so `delta=99+0-1=98`.
// In the GK algorithm's style of working in terms of maximum bounds, one can observe that the
// maximum additional uncertainty over samples coming from `b` is `max(g_a + delta_a) =
// floor(2 * eps_a * n_a)`. Likewise, additional uncertainty over samples from `a` is
// `floor(2 * eps_b * n_b)`.
// Only samples that interleave the other side are affected. That means that samples from
// one side that are lesser (or greater) than all samples from the other side are just copied
// unmodified.
// If the merging instances have different `relativeError`, the resulting instance will carry
// the largest one: `eps_ab = max(eps_a, eps_b)`.
// The main invariant of the GK algorithm is kept:
// `max(g_ab + delta_ab) <= floor(2 * eps_ab * (n_a + n_b))` since
// `max(g_ab + delta_ab) <= floor(2 * eps_a * n_a) + floor(2 * eps_b * n_b)`
// Finally, one can see how the `insert(x)` operation can be expressed as `merge([(x, 1, 0])`
compress();
backup_sampled.clear();
backup_sampled.reserve(sampled.size() + other.sampled.size());
double merged_relative_error = std::max(relative_error, other.relative_error);
size_t merged_count = count + other.count;
Int64 additional_self_delta = static_cast<Int64>(std::floor(2 * other.relative_error * other.count));
Int64 additional_other_delta = static_cast<Int64>(std::floor(2 * relative_error * count));
// Do a merge of two sorted lists until one of the lists is fully consumed
size_t self_idx = 0;
size_t other_idx = 0;
while (self_idx < sampled.size() && other_idx < other.sampled.size())
{
const Stats & self_sample = sampled[self_idx];
const Stats & other_sample = other.sampled[other_idx];
// Detect next sample
Stats next_sample;
Int64 additional_delta = 0;
if (self_sample.value < other_sample.value)
{
++self_idx;
next_sample = self_sample;
additional_delta = other_idx > 0 ? additional_self_delta : 0;
}
else
{
++other_idx;
next_sample = other_sample;
additional_delta = self_idx > 0 ? additional_other_delta : 0;
}
// Insert it
next_sample.delta += additional_delta;
backup_sampled.emplace_back(std::move(next_sample));
}
// Copy the remaining samples from the other list
// (by construction, at most one `while` loop will run)
while (self_idx < sampled.size())
{
backup_sampled.emplace_back(sampled[self_idx]);
++self_idx;
}
while (other_idx < other.sampled.size())
{
backup_sampled.emplace_back(other.sampled[other_idx]);
++other_idx;
}
std::swap(sampled, backup_sampled);
relative_error = merged_relative_error;
count = merged_count;
compress_threshold = other.compress_threshold;
doCompress(2 * merged_relative_error * merged_count);
compressed = true;
}
}
void write(WriteBuffer & buf) const
{
writeIntBinary<size_t>(compress_threshold, buf);
writeFloatBinary<double>(relative_error, buf);
writeIntBinary<size_t>(count, buf);
writeIntBinary<size_t>(sampled.size(), buf);
for (const auto & stats : sampled)
{
writeFloatBinary<T>(stats.value, buf);
writeIntBinary<Int64>(stats.g, buf);
writeIntBinary<Int64>(stats.delta, buf);
}
}
void read(ReadBuffer & buf)
{
readIntBinary<size_t>(compress_threshold, buf);
readFloatBinary<double>(relative_error, buf);
readIntBinary<size_t>(count, buf);
size_t sampled_len = 0;
readIntBinary<size_t>(sampled_len, buf);
sampled.resize(sampled_len);
for (size_t i = 0; i < sampled_len; ++i)
{
auto & stats = sampled[i]; /// Take a reference: the deserialized fields must land in the array itself
readFloatBinary<T>(stats.value, buf);
readIntBinary<Int64>(stats.g, buf);
readIntBinary<Int64>(stats.delta, buf);
}
}
private:
QueryResult findApproxQuantile(size_t index, Int64 min_rank_at_index, double target_error, double percentile) const
{
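/// The target rank is ceil(percentile * count). Starting from `index`, walk the samples while
/// accumulating the smallest possible rank; return the first sample whose rank uncertainty
/// interval [min_rank, min_rank + delta] brackets the target to within target_error.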
Stats curr_sample = sampled[index];
Int64 rank = static_cast<Int64>(std::ceil(percentile * count));
size_t i = index;
Int64 min_rank = min_rank_at_index;
while (i < sampled.size() - 1)
{
Int64 max_rank = min_rank + curr_sample.delta;
if (max_rank - target_error <= rank && rank <= min_rank + target_error)
return {i, min_rank, curr_sample.value};
else
{
++i;
curr_sample = sampled[i];
min_rank += curr_sample.g;
}
}
return {sampled.size() - 1, 0, sampled.back().value};
}
void withHeadBufferInserted()
{
if (head_sampled.empty())
return;
bool use_radix_sort = head_sampled.size() >= 256 && (is_arithmetic_v<T> && !is_big_int_v<T>);
if (use_radix_sort)
RadixSort<RadixSortNumTraits<T>>::executeLSD(head_sampled.data(), head_sampled.size());
else
::sort(head_sampled.begin(), head_sampled.end());
backup_sampled.clear();
backup_sampled.reserve(sampled.size() + head_sampled.size());
size_t sample_idx = 0;
size_t ops_idx = 0;
size_t current_count = count;
for (; ops_idx < head_sampled.size(); ++ops_idx)
{
T current_sample = head_sampled[ops_idx];
// Add all the samples before the next observation.
while (sample_idx < sampled.size() && sampled[sample_idx].value <= current_sample)
{
backup_sampled.emplace_back(sampled[sample_idx]);
++sample_idx;
}
// If it is the first one to insert, or if it is the last one
++current_count;
Int64 delta;
if (backup_sampled.empty() || (sample_idx == sampled.size() && ops_idx == (head_sampled.size() - 1)))
delta = 0;
else
delta = static_cast<Int64>(std::floor(2 * relative_error * current_count));
backup_sampled.emplace_back(current_sample, 1, delta);
}
// Add all the remaining existing samples
for (; sample_idx < sampled.size(); ++sample_idx)
backup_sampled.emplace_back(sampled[sample_idx]);
std::swap(sampled, backup_sampled);
head_sampled.clear();
count = current_count;
}
void doCompress(double merge_threshold)
{
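/// Both call sites pass merge_threshold = 2 * relative_error * count; merging a sample into the
/// head is allowed while g + head.g + head.delta stays below it, which preserves the GK invariant
/// max(g + delta) <= floor(2 * eps * n) described in merge().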
if (sampled.empty())
return;
backup_sampled.clear();
// Start from the last element, which is always part of the set.
// The head contains the current new head, that may be merged with the current element.
Stats head = sampled.back();
ssize_t i = sampled.size() - 2;
// Do not compress the last element
while (i >= 1)
{
// The current sample:
const auto & sample1 = sampled[i];
// Do we need to compress?
if (sample1.g + head.g + head.delta < merge_threshold)
{
// Do not insert yet, just merge the current element into the head.
head.g += sample1.g;
}
else
{
// Prepend the current head, and keep the current sample as target for merging.
backup_sampled.push_back(head);
head = sample1;
}
--i;
}
backup_sampled.push_back(head);
// If necessary, add the minimum element:
auto curr_head = sampled.front();
// don't add the minimum element if `sampled` has only one element (both `curr_head` and
// `head` point to the same element)
if (curr_head.value <= head.value && sampled.size() > 1)
backup_sampled.emplace_back(sampled.front());
std::reverse(backup_sampled.begin(), backup_sampled.end());
std::swap(sampled, backup_sampled);
}
double relative_error;
size_t compress_threshold;
size_t count = 0;
bool compressed;
PaddedPODArray<Stats> sampled;
PaddedPODArray<Stats> backup_sampled;
PaddedPODArray<T> head_sampled;
static constexpr size_t default_compress_threshold = 10000;
static constexpr size_t default_head_size = 50000;
};
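/// A minimal usage sketch of ApproxSampler (the variable names are illustrative):
///
///     ApproxSampler<Float64> sampler(0.001); /// relative rank error eps
///     for (Float64 v : values)
///         sampler.insert(v);
///     sampler.compress(); /// query() requires a compressed summary
///     Float64 level = 0.5;
///     size_t index = 0;
///     Float64 median;
///     sampler.query(&level, &index, 1, &median);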
template <typename Value>
class QuantileApprox
{
private:
using Data = ApproxSampler<Value>;
mutable Data data;
public:
QuantileApprox() = default;
explicit QuantileApprox(size_t accuracy) : data(1.0 / static_cast<double>(accuracy)) { }
void add(const Value & x)
{
data.insert(x);
}
template <typename Weight>
void add(const Value &, const Weight &)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method add with weight is not implemented for GKSampler");
}
void merge(const QuantileApprox & rhs)
{
if (!data.isCompressed())
data.compress();
data.merge(rhs.data);
}
void serialize(WriteBuffer & buf) const
{
/// Always compress before serialization
if (!data.isCompressed())
data.compress();
data.write(buf);
}
void deserialize(ReadBuffer & buf)
{
data.read(buf);
data.setCompressed();
}
/// Get the value of the `level` quantile. The level must be between 0 and 1.
Value get(Float64 level)
{
if (!data.isCompressed())
data.compress();
Value res;
size_t index = 0;
data.query(&level, &index, 1, &res);
return res;
}
/// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
/// indices - an array of indices into `levels`, ordered so that the corresponding levels are ascending.
void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result)
{
if (!data.isCompressed())
data.compress();
data.query(levels, indices, size, result);
}
Float64 getFloat64(Float64 /*level*/)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getFloat64 is not implemented for GKSampler");
}
void getManyFloat(const Float64 * /*levels*/, const size_t * /*indices*/, size_t /*size*/, Float64 * /*result*/)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getManyFloat is not implemented for GKSampler");
}
};
}


@@ -58,7 +58,8 @@ struct QuantileExactBase
size_t size = 0;
readVarUInt(size, buf);
if (unlikely(size > QUANTILE_EXACT_MAX_ARRAY_SIZE))
-throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size");
+throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
+"Too large array size (maximum: {})", QUANTILE_EXACT_MAX_ARRAY_SIZE);
array.resize(size);
buf.readStrict(reinterpret_cast<char *>(array.data()), size * sizeof(array[0]));
}


@@ -213,7 +213,8 @@ public:
size_t size = std::min(total_values, sample_count);
static constexpr size_t MAX_RESERVOIR_SIZE = 1_GiB;
if (unlikely(size > MAX_RESERVOIR_SIZE))
-throw DB::Exception(DB::ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size");
+throw DB::Exception(DB::ErrorCodes::TOO_LARGE_ARRAY_SIZE,
+"Too large array size (maximum: {})", MAX_RESERVOIR_SIZE);
samples.resize(size);


@@ -166,7 +166,8 @@ public:
static constexpr size_t MAX_RESERVOIR_SIZE = 1_GiB;
if (unlikely(size > MAX_RESERVOIR_SIZE))
-throw DB::Exception(DB::ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size");
+throw DB::Exception(DB::ErrorCodes::TOO_LARGE_ARRAY_SIZE,
+"Too large array size (maximum: {})", MAX_RESERVOIR_SIZE);
samples.resize(size);
for (size_t i = 0; i < size; ++i)


@@ -32,6 +32,7 @@ void registerAggregateFunctionsQuantileTDigest(AggregateFunctionFactory &);
void registerAggregateFunctionsQuantileTDigestWeighted(AggregateFunctionFactory &);
void registerAggregateFunctionsQuantileBFloat16(AggregateFunctionFactory &);
void registerAggregateFunctionsQuantileBFloat16Weighted(AggregateFunctionFactory &);
void registerAggregateFunctionsQuantileApprox(AggregateFunctionFactory &);
void registerAggregateFunctionsSequenceMatch(AggregateFunctionFactory &);
void registerAggregateFunctionWindowFunnel(AggregateFunctionFactory &);
void registerAggregateFunctionRate(AggregateFunctionFactory &);
@@ -79,6 +80,7 @@ void registerAggregateFunctionExponentialMovingAverage(AggregateFunctionFactory
void registerAggregateFunctionSparkbar(AggregateFunctionFactory &);
void registerAggregateFunctionIntervalLengthSum(AggregateFunctionFactory &);
void registerAggregateFunctionAnalysisOfVariance(AggregateFunctionFactory &);
void registerAggregateFunctionKolmogorovSmirnovTest(AggregateFunctionFactory & factory);
class AggregateFunctionCombinatorFactory;
void registerAggregateFunctionCombinatorIf(AggregateFunctionCombinatorFactory &);
@@ -123,6 +125,7 @@ void registerAggregateFunctions()
registerAggregateFunctionsQuantileTDigestWeighted(factory);
registerAggregateFunctionsQuantileBFloat16(factory);
registerAggregateFunctionsQuantileBFloat16Weighted(factory);
registerAggregateFunctionsQuantileApprox(factory);
registerAggregateFunctionsSequenceMatch(factory);
registerAggregateFunctionWindowFunnel(factory);
registerAggregateFunctionRate(factory);
@@ -170,6 +173,7 @@ void registerAggregateFunctions()
registerAggregateFunctionExponentialMovingAverage(factory);
registerAggregateFunctionSparkbar(factory);
registerAggregateFunctionAnalysisOfVariance(factory);
registerAggregateFunctionKolmogorovSmirnovTest(factory);
registerWindowFunctions(factory);
}


@@ -86,7 +86,12 @@ public:
DataTypePtr getResultType() const override
{
-return getExpression()->getResultType();
+return result_type;
}
void resolve(DataTypePtr lambda_type)
{
result_type = std::move(lambda_type);
}
void dumpTreeImpl(WriteBuffer & buffer, FormatState & format_state, size_t indent) const override;
@@ -102,6 +107,7 @@ protected:
private:
Names argument_names;
DataTypePtr result_type;
static constexpr size_t arguments_child_index = 0;
static constexpr size_t expression_child_index = 1;

src/Analyzer/Passes/CNF.cpp

@@ -0,0 +1,599 @@
#include <Analyzer/Passes/CNF.h>
#include <Analyzer/InDepthQueryTreeVisitor.h>
#include <Analyzer/FunctionNode.h>
#include <Analyzer/ConstantNode.h>
#include <Interpreters/TreeCNFConverter.h>
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>
#include <Functions/FunctionFactory.h>
#include <Common/checkStackSize.h>
namespace DB
{
namespace ErrorCodes
{
extern const int TOO_MANY_TEMPORARY_COLUMNS;
}
namespace Analyzer
{
namespace
{
bool isLogicalFunction(const FunctionNode & function_node)
{
const std::string_view name = function_node.getFunctionName();
return name == "and" || name == "or" || name == "not";
}
template <typename... Args>
QueryTreeNodePtr createFunctionNode(const FunctionOverloadResolverPtr & function_resolver, Args &&... args)
{
auto function_node = std::make_shared<FunctionNode>(function_resolver->getName());
auto & new_arguments = function_node->getArguments().getNodes();
new_arguments.reserve(sizeof...(args));
(new_arguments.push_back(std::forward<Args>(args)), ...);
function_node->resolveAsFunction(function_resolver);
return function_node;
}
size_t countAtoms(const QueryTreeNodePtr & node)
{
checkStackSize();
const auto * function_node = node->as<FunctionNode>();
if (!function_node || !isLogicalFunction(*function_node))
return 1;
size_t atom_count = 0;
const auto & arguments = function_node->getArguments().getNodes();
for (const auto & argument : arguments)
atom_count += countAtoms(argument);
return atom_count;
}
class SplitMultiLogicVisitor
{
public:
explicit SplitMultiLogicVisitor(ContextPtr context)
: current_context(std::move(context))
{}
void visit(QueryTreeNodePtr & node)
{
checkStackSize();
auto * function_node = node->as<FunctionNode>();
if (!function_node || !isLogicalFunction(*function_node))
return;
const auto & name = function_node->getFunctionName();
if (name == "and" || name == "or")
{
auto function_resolver = FunctionFactory::instance().get(name, current_context);
const auto & arguments = function_node->getArguments().getNodes();
if (arguments.size() > 2)
{
QueryTreeNodePtr current = arguments[0];
for (size_t i = 1; i < arguments.size(); ++i)
current = createFunctionNode(function_resolver, std::move(current), arguments[i]);
auto & new_function_node = current->as<FunctionNode &>();
function_node->getArguments().getNodes() = std::move(new_function_node.getArguments().getNodes());
function_node->resolveAsFunction(function_resolver);
}
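/// e.g. and(a, b, c) is rebuilt as and(and(a, b), c), so later passes can assume binary
/// and/or nodes (PushOrVisitor below asserts exactly two arguments).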
}
else
{
assert(name == "not");
}
auto & arguments = function_node->getArguments().getNodes();
for (auto & argument : arguments)
visit(argument);
}
private:
ContextPtr current_context;
};
class PushNotVisitor
{
public:
explicit PushNotVisitor(const ContextPtr & context)
: not_function_resolver(FunctionFactory::instance().get("not", context))
, or_function_resolver(FunctionFactory::instance().get("or", context))
, and_function_resolver(FunctionFactory::instance().get("and", context))
{}
void visit(QueryTreeNodePtr & node, bool add_negation)
{
checkStackSize();
auto * function_node = node->as<FunctionNode>();
if (!function_node || !isLogicalFunction(*function_node))
{
if (add_negation)
node = createFunctionNode(not_function_resolver, std::move(node));
return;
}
std::string_view function_name = function_node->getFunctionName();
if (function_name == "and" || function_name == "or")
{
if (add_negation)
{
if (function_name == "and")
function_node->resolveAsFunction(or_function_resolver);
else
function_node->resolveAsFunction(and_function_resolver);
}
auto & arguments = function_node->getArguments().getNodes();
for (auto & argument : arguments)
visit(argument, add_negation);
return;
}
assert(function_name == "not");
auto & arguments = function_node->getArguments().getNodes();
assert(arguments.size() == 1);
node = arguments[0];
visit(node, !add_negation);
}
private:
const FunctionOverloadResolverPtr not_function_resolver;
const FunctionOverloadResolverPtr or_function_resolver;
const FunctionOverloadResolverPtr and_function_resolver;
};
class PushOrVisitor
{
public:
PushOrVisitor(ContextPtr context, size_t max_atoms_, size_t num_atoms_)
: max_atoms(max_atoms_)
, num_atoms(num_atoms_)
, and_resolver(FunctionFactory::instance().get("and", context))
, or_resolver(FunctionFactory::instance().get("or", context))
{}
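/// Pushes OR below AND using the distributive law: (a AND b) OR c becomes (a OR c) AND (b OR c).
/// Since this duplicates c, num_atoms tracks the growth and visit() gives up (returns false)
/// once max_atoms is exceeded.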
bool visit(QueryTreeNodePtr & node)
{
if (max_atoms && num_atoms > max_atoms)
return false;
checkStackSize();
auto * function_node = node->as<FunctionNode>();
if (!function_node)
return true;
std::string_view name = function_node->getFunctionName();
if (name == "or" || name == "and")
{
auto & arguments = function_node->getArguments().getNodes();
for (auto & argument : arguments)
visit(argument);
}
if (name == "or")
{
auto & arguments = function_node->getArguments().getNodes();
assert(arguments.size() == 2);
size_t and_node_id = arguments.size();
for (size_t i = 0; i < arguments.size(); ++i)
{
auto & argument = arguments[i];
if (auto * argument_function_node = argument->as<FunctionNode>();
argument_function_node && argument_function_node->getFunctionName() == "and")
and_node_id = i;
}
if (and_node_id == arguments.size())
return true;
auto & other_node = arguments[1 - and_node_id];
auto & and_function_arguments = arguments[and_node_id]->as<FunctionNode &>().getArguments().getNodes();
auto lhs = createFunctionNode(or_resolver, other_node->clone(), std::move(and_function_arguments[0]));
num_atoms += countAtoms(other_node);
auto rhs = createFunctionNode(or_resolver, std::move(other_node), std::move(and_function_arguments[1]));
node = createFunctionNode(and_resolver, std::move(lhs), std::move(rhs));
visit(node);
}
return true;
}
private:
size_t max_atoms;
size_t num_atoms;
const FunctionOverloadResolverPtr and_resolver;
const FunctionOverloadResolverPtr or_resolver;
};
class CollectGroupsVisitor
{
public:
void visit(QueryTreeNodePtr & node)
{
CNF::OrGroup or_group;
visitImpl(node, or_group);
if (!or_group.empty())
and_group.insert(std::move(or_group));
}
CNF::AndGroup and_group;
private:
void visitImpl(QueryTreeNodePtr & node, CNF::OrGroup & or_group)
{
checkStackSize();
auto * function_node = node->as<FunctionNode>();
if (!function_node || !isLogicalFunction(*function_node))
{
or_group.insert(CNF::AtomicFormula{false, std::move(node)});
return;
}
std::string_view name = function_node->getFunctionName();
if (name == "and")
{
auto & arguments = function_node->getArguments().getNodes();
for (auto & argument : arguments)
{
CNF::OrGroup argument_or_group;
visitImpl(argument, argument_or_group);
if (!argument_or_group.empty())
and_group.insert(std::move(argument_or_group));
}
}
else if (name == "or")
{
auto & arguments = function_node->getArguments().getNodes();
for (auto & argument : arguments)
visitImpl(argument, or_group);
}
else
{
assert(name == "not");
auto & arguments = function_node->getArguments().getNodes();
or_group.insert(CNF::AtomicFormula{true, std::move(arguments[0])});
}
}
};
std::optional<CNF::AtomicFormula> tryInvertFunction(
const CNF::AtomicFormula & atom, const ContextPtr & context, const std::unordered_map<std::string, std::string> & inverse_relations)
{
auto * function_node = atom.node_with_hash.node->as<FunctionNode>();
if (!function_node)
return std::nullopt;
if (auto it = inverse_relations.find(function_node->getFunctionName()); it != inverse_relations.end())
{
auto inverse_function_resolver = FunctionFactory::instance().get(it->second, context);
function_node->resolveAsFunction(inverse_function_resolver);
return CNF::AtomicFormula{!atom.negative, atom.node_with_hash.node};
}
return std::nullopt;
}
}
bool CNF::AtomicFormula::operator==(const AtomicFormula & rhs) const
{
return negative == rhs.negative && node_with_hash == rhs.node_with_hash;
}
bool CNF::AtomicFormula::operator<(const AtomicFormula & rhs) const
{
if (node_with_hash.hash > rhs.node_with_hash.hash)
return false;
return node_with_hash.hash < rhs.node_with_hash.hash || negative < rhs.negative;
}
std::string CNF::dump() const
{
WriteBufferFromOwnString res;
bool first = true;
for (const auto & group : statements)
{
if (!first)
res << " AND ";
first = false;
res << "(";
bool first_in_group = true;
for (const auto & atom : group)
{
if (!first_in_group)
res << " OR ";
first_in_group = false;
if (atom.negative)
res << " NOT ";
res << atom.node_with_hash.node->formatASTForErrorMessage();
}
res << ")";
}
return res.str();
}
CNF & CNF::transformGroups(std::function<OrGroup(const OrGroup &)> fn)
{
AndGroup result;
for (const auto & group : statements)
{
auto new_group = fn(group);
if (!new_group.empty())
result.insert(std::move(new_group));
}
statements = std::move(result);
return *this;
}
CNF & CNF::transformAtoms(std::function<AtomicFormula(const AtomicFormula &)> fn)
{
transformGroups([fn](const OrGroup & group)
{
OrGroup result;
for (const auto & atom : group)
{
auto new_atom = fn(atom);
if (new_atom.node_with_hash.node)
result.insert(std::move(new_atom));
}
return result;
});
return *this;
}
CNF & CNF::pushNotIntoFunctions(const ContextPtr & context)
{
transformAtoms([&](const AtomicFormula & atom)
{
return pushNotIntoFunction(atom, context);
});
return *this;
}
CNF::AtomicFormula CNF::pushNotIntoFunction(const AtomicFormula & atom, const ContextPtr & context)
{
if (!atom.negative)
return atom;
static const std::unordered_map<std::string, std::string> inverse_relations = {
{"equals", "notEquals"},
{"less", "greaterOrEquals"},
{"lessOrEquals", "greater"},
{"in", "notIn"},
{"like", "notLike"},
{"empty", "notEmpty"},
{"notEquals", "equals"},
{"greaterOrEquals", "less"},
{"greater", "lessOrEquals"},
{"notIn", "in"},
{"notLike", "like"},
{"notEmpty", "empty"},
};
if (auto inverted_atom = tryInvertFunction(atom, context, inverse_relations);
inverted_atom.has_value())
return std::move(*inverted_atom);
return atom;
}
CNF & CNF::pullNotOutFunctions(const ContextPtr & context)
{
transformAtoms([&](const AtomicFormula & atom)
{
static const std::unordered_map<std::string, std::string> inverse_relations = {
{"notEquals", "equals"},
{"greaterOrEquals", "less"},
{"greater", "lessOrEquals"},
{"notIn", "in"},
{"notLike", "like"},
{"notEmpty", "empty"},
};
if (auto inverted_atom = tryInvertFunction(atom, context, inverse_relations);
inverted_atom.has_value())
return std::move(*inverted_atom);
return atom;
});
return *this;
}
CNF & CNF::filterAlwaysTrueGroups(std::function<bool(const OrGroup &)> predicate)
{
AndGroup filtered;
for (const auto & or_group : statements)
{
if (predicate(or_group))
filtered.insert(or_group);
}
statements = std::move(filtered);
return *this;
}
CNF & CNF::filterAlwaysFalseAtoms(std::function<bool(const AtomicFormula &)> predicate)
{
AndGroup filtered;
for (const auto & or_group : statements)
{
OrGroup filtered_group;
for (const auto & atom : or_group)
{
if (predicate(atom))
filtered_group.insert(atom);
}
if (!filtered_group.empty())
filtered.insert(std::move(filtered_group));
else
{
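/// An OR group that lost all of its atoms is identically false, so the whole conjunction is
/// false: replace the entire CNF with a single constant-0 atom.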
filtered.clear();
filtered_group.insert(AtomicFormula{false, QueryTreeNodePtrWithHash{std::make_shared<ConstantNode>(static_cast<UInt8>(0))}});
filtered.insert(std::move(filtered_group));
break;
}
}
statements = std::move(filtered);
return *this;
}
CNF & CNF::reduce()
{
while (true)
{
AndGroup new_statements = reduceOnceCNFStatements(statements);
if (statements == new_statements)
{
statements = filterCNFSubsets(statements);
return *this;
}
else
statements = new_statements;
}
}
void CNF::appendGroup(const AndGroup & and_group)
{
for (const auto & or_group : and_group)
statements.emplace(or_group);
}
CNF::CNF(AndGroup statements_)
: statements(std::move(statements_))
{}
std::optional<CNF> CNF::tryBuildCNF(const QueryTreeNodePtr & node, ContextPtr context, size_t max_growth_multiplier)
{
auto node_cloned = node->clone();
size_t atom_count = countAtoms(node_cloned);
size_t max_atoms = max_growth_multiplier ? std::max(MAX_ATOMS_WITHOUT_CHECK, atom_count * max_growth_multiplier) : 0;
{
SplitMultiLogicVisitor visitor(context);
visitor.visit(node_cloned);
}
{
PushNotVisitor visitor(context);
visitor.visit(node_cloned, false);
}
if (PushOrVisitor visitor(context, max_atoms, atom_count);
!visitor.visit(node_cloned))
return std::nullopt;
CollectGroupsVisitor collect_visitor;
collect_visitor.visit(node_cloned);
if (collect_visitor.and_group.empty())
return std::nullopt;
return CNF{std::move(collect_visitor.and_group)};
}
CNF CNF::toCNF(const QueryTreeNodePtr & node, ContextPtr context, size_t max_growth_multiplier)
{
auto cnf = tryBuildCNF(node, context, max_growth_multiplier);
if (!cnf)
throw Exception(ErrorCodes::TOO_MANY_TEMPORARY_COLUMNS,
"Cannot convert expression '{}' to CNF, because it produces to many clauses."
"Size of boolean formula in CNF can be exponential of size of source formula.");
return *cnf;
}
QueryTreeNodePtr CNF::toQueryTree(ContextPtr context) const
{
if (statements.empty())
return nullptr;
QueryTreeNodes and_arguments;
and_arguments.reserve(statements.size());
auto not_resolver = FunctionFactory::instance().get("not", context);
auto or_resolver = FunctionFactory::instance().get("or", context);
auto and_resolver = FunctionFactory::instance().get("and", context);
const auto function_node_from_atom = [&](const auto & atom) -> QueryTreeNodePtr
{
auto cloned_node = atom.node_with_hash.node->clone();
if (atom.negative)
return createFunctionNode(not_resolver, std::move(cloned_node));
return std::move(cloned_node);
};
for (const auto & or_group : statements)
{
if (or_group.size() == 1)
{
const auto & atom = *or_group.begin();
and_arguments.push_back(function_node_from_atom(atom));
}
else
{
QueryTreeNodes or_arguments;
or_arguments.reserve(or_group.size());
for (const auto & atom : or_group)
or_arguments.push_back(function_node_from_atom(atom));
auto or_function = std::make_shared<FunctionNode>("or");
or_function->getArguments().getNodes() = std::move(or_arguments);
or_function->resolveAsFunction(or_resolver);
and_arguments.push_back(std::move(or_function));
}
}
if (and_arguments.size() == 1)
return std::move(and_arguments[0]);
auto and_function = std::make_shared<FunctionNode>("and");
and_function->getArguments().getNodes() = std::move(and_arguments);
and_function->resolveAsFunction(and_resolver);
return and_function;
}
}
}

src/Analyzer/Passes/CNF.h

@@ -0,0 +1,67 @@
#pragma once
#include <Analyzer/HashUtils.h>
#include <Analyzer/IQueryTreeNode.h>
#include <Common/SipHash.h>
#include <Interpreters/Context_fwd.h>
#include <unordered_set>
namespace DB::Analyzer
{
class CNF
{
public:
struct AtomicFormula
{
bool negative = false;
QueryTreeNodePtrWithHash node_with_hash;
bool operator==(const AtomicFormula & rhs) const;
bool operator<(const AtomicFormula & rhs) const;
};
// A different hash is generated for a different order, so we use std::set
using OrGroup = std::set<AtomicFormula>;
using AndGroup = std::set<OrGroup>;
std::string dump() const;
static constexpr size_t DEFAULT_MAX_GROWTH_MULTIPLIER = 20;
static constexpr size_t MAX_ATOMS_WITHOUT_CHECK = 200;
CNF & transformAtoms(std::function<AtomicFormula(const AtomicFormula &)> fn);
CNF & transformGroups(std::function<OrGroup(const OrGroup &)> fn);
CNF & filterAlwaysTrueGroups(std::function<bool(const OrGroup &)> predicate);
CNF & filterAlwaysFalseAtoms(std::function<bool(const AtomicFormula &)> predicate);
CNF & reduce();
void appendGroup(const AndGroup & and_group);
/// Convert "NOT fn" to a single node representing inverse of "fn"
CNF & pushNotIntoFunctions(const ContextPtr & context);
CNF & pullNotOutFunctions(const ContextPtr & context);
static AtomicFormula pushNotIntoFunction(const AtomicFormula & atom, const ContextPtr & context);
explicit CNF(AndGroup statements_);
static std::optional<CNF> tryBuildCNF(const QueryTreeNodePtr & node, ContextPtr context, size_t max_growth_multiplier = DEFAULT_MAX_GROWTH_MULTIPLIER);
static CNF toCNF(const QueryTreeNodePtr & node, ContextPtr context, size_t max_growth_multiplier = DEFAULT_MAX_GROWTH_MULTIPLIER);
QueryTreeNodePtr toQueryTree(ContextPtr context) const;
const auto & getStatements() const
{
return statements;
}
private:
AndGroup statements;
};
}


@ -0,0 +1,733 @@
#include <Analyzer/Passes/ConvertQueryToCNFPass.h>
#include <Analyzer/InDepthQueryTreeVisitor.h>
#include <Analyzer/FunctionNode.h>
#include <Analyzer/TableNode.h>
#include <Analyzer/ColumnNode.h>
#include <Analyzer/TableFunctionNode.h>
#include <Analyzer/ConstantNode.h>
#include <Analyzer/Passes/CNF.h>
#include <Analyzer/Utils.h>
#include <Storages/IStorage.h>
#include <Functions/FunctionFactory.h>
#include "Analyzer/HashUtils.h"
#include "Analyzer/IQueryTreeNode.h"
#include "Interpreters/ComparisonGraph.h"
#include "base/types.h"
namespace DB
{
namespace
{
std::optional<Analyzer::CNF> tryConvertQueryToCNF(const QueryTreeNodePtr & node, const ContextPtr & context)
{
auto cnf_form = Analyzer::CNF::tryBuildCNF(node, context);
if (!cnf_form)
return std::nullopt;
cnf_form->pushNotIntoFunctions(context);
return cnf_form;
}
enum class MatchState : uint8_t
{
FULL_MATCH, /// a = b
PARTIAL_MATCH, /// a = not b
NONE,
};
MatchState match(const Analyzer::CNF::AtomicFormula & a, const Analyzer::CNF::AtomicFormula & b)
{
using enum MatchState;
if (a.node_with_hash != b.node_with_hash)
return NONE;
return a.negative == b.negative ? FULL_MATCH : PARTIAL_MATCH;
}
bool checkIfGroupAlwaysTrueFullMatch(const Analyzer::CNF::OrGroup & group, const ConstraintsDescription::QueryTreeData & query_tree_constraints)
{
/// We have constraints in CNF.
/// CNF is always true => Each OR group in CNF is always true.
/// So, we try to check whether we have at least one OR group from the CNF as a subset of our group.
/// If we've found one then our group is always true too.
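/// Example: if the constraints contain the OR group (x > 0 OR y = 1), then the query group
/// (x > 0 OR y = 1 OR z < 2) is a superset of it and hence always true.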
const auto & constraints_data = query_tree_constraints.getConstraintData();
std::vector<size_t> found(constraints_data.size());
for (size_t i = 0; i < constraints_data.size(); ++i)
found[i] = constraints_data[i].size();
for (const auto & atom : group)
{
const auto constraint_atom_ids = query_tree_constraints.getAtomIds(atom.node_with_hash);
if (constraint_atom_ids)
{
const auto constraint_atoms = query_tree_constraints.getAtomsById(*constraint_atom_ids);
for (size_t i = 0; i < constraint_atoms.size(); ++i)
{
if (match(constraint_atoms[i], atom) == MatchState::FULL_MATCH)
{
if ((--found[(*constraint_atom_ids)[i].group_id]) == 0)
return true;
}
}
}
}
return false;
}
bool checkIfGroupAlwaysTrueGraph(const Analyzer::CNF::OrGroup & group, const ComparisonGraph<QueryTreeNodePtr> & graph)
{
/// We try to find at least one atom that is always true by using comparison graph.
for (const auto & atom : group)
{
const auto * function_node = atom.node_with_hash.node->as<FunctionNode>();
if (function_node)
{
const auto & arguments = function_node->getArguments().getNodes();
if (arguments.size() == 2)
{
const auto expected = ComparisonGraph<QueryTreeNodePtr>::atomToCompareResult(atom);
if (graph.isAlwaysCompare(expected, arguments[0], arguments[1]))
return true;
}
}
}
return false;
}
bool checkIfAtomAlwaysFalseFullMatch(const Analyzer::CNF::AtomicFormula & atom, const ConstraintsDescription::QueryTreeData & query_tree_constraints)
{
const auto constraint_atom_ids = query_tree_constraints.getAtomIds(atom.node_with_hash);
if (constraint_atom_ids)
{
for (const auto & constraint_atom : query_tree_constraints.getAtomsById(*constraint_atom_ids))
{
const auto match_result = match(constraint_atom, atom);
if (match_result == MatchState::PARTIAL_MATCH)
return true;
}
}
return false;
}
bool checkIfAtomAlwaysFalseGraph(const Analyzer::CNF::AtomicFormula & atom, const ComparisonGraph<QueryTreeNodePtr> & graph)
{
const auto * function_node = atom.node_with_hash.node->as<FunctionNode>();
if (!function_node)
return false;
const auto & arguments = function_node->getArguments().getNodes();
if (arguments.size() != 2)
return false;
/// TODO: special support for !=
const auto expected = ComparisonGraph<QueryTreeNodePtr>::atomToCompareResult(atom);
return !graph.isPossibleCompare(expected, arguments[0], arguments[1]);
}
void replaceToConstants(QueryTreeNodePtr & term, const ComparisonGraph<QueryTreeNodePtr> & graph)
{
const auto equal_constant = graph.getEqualConst(term);
if (equal_constant)
{
term = (*equal_constant)->clone();
return;
}
for (auto & child : term->getChildren())
{
if (child)
replaceToConstants(child, graph);
}
}
Analyzer::CNF::AtomicFormula replaceTermsToConstants(const Analyzer::CNF::AtomicFormula & atom, const ComparisonGraph<QueryTreeNodePtr> & graph)
{
auto node = atom.node_with_hash.node->clone();
replaceToConstants(node, graph);
return {atom.negative, std::move(node)};
}
StorageSnapshotPtr getStorageSnapshot(const QueryTreeNodePtr & node)
{
if (auto * table_node = node->as<TableNode>())
return table_node->getStorageSnapshot();
else if (auto * table_function_node = node->as<TableFunctionNode>())
return table_function_node->getStorageSnapshot();
return nullptr;
}
bool onlyIndexColumns(const QueryTreeNodePtr & node, const std::unordered_set<std::string_view> & primary_key_set)
{
const auto * column_node = node->as<ColumnNode>();
/// TODO: verify that full name is correct here
if (column_node && !primary_key_set.contains(column_node->getColumnName()))
return false;
for (const auto & child : node->getChildren())
{
if (child && !onlyIndexColumns(child, primary_key_set))
return false;
}
return true;
}
bool onlyConstants(const QueryTreeNodePtr & node)
{
/// if it's only a constant, it has already been calculated
return node->as<ConstantNode>() != nullptr;
}
const std::unordered_map<std::string_view, ComparisonGraphCompareResult> & getRelationMap()
{
using enum ComparisonGraphCompareResult;
static const std::unordered_map<std::string_view, ComparisonGraphCompareResult> relations =
{
{"equals", EQUAL},
{"less", LESS},
{"lessOrEquals", LESS_OR_EQUAL},
{"greaterOrEquals", GREATER_OR_EQUAL},
{"greater", GREATER},
};
return relations;
}
const std::unordered_map<ComparisonGraphCompareResult, std::string> & getReverseRelationMap()
{
using enum ComparisonGraphCompareResult;
static const std::unordered_map<ComparisonGraphCompareResult, std::string> relations =
{
{EQUAL, "equals"},
{LESS, "less"},
{LESS_OR_EQUAL, "lessOrEquals"},
{GREATER_OR_EQUAL, "greaterOrEquals"},
{GREATER, "greater"},
};
return relations;
}
bool canBeSequence(const ComparisonGraphCompareResult left, const ComparisonGraphCompareResult right)
{
using enum ComparisonGraphCompareResult;
if (left == UNKNOWN || right == UNKNOWN || left == NOT_EQUAL || right == NOT_EQUAL)
return false;
if ((left == GREATER || left == GREATER_OR_EQUAL) && (right == LESS || right == LESS_OR_EQUAL))
return false;
if ((right == GREATER || right == GREATER_OR_EQUAL) && (left == LESS || left == LESS_OR_EQUAL))
return false;
return true;
}
ComparisonGraphCompareResult mostStrict(const ComparisonGraphCompareResult left, const ComparisonGraphCompareResult right)
{
using enum ComparisonGraphCompareResult;
if (left == LESS || left == GREATER)
return left;
if (right == LESS || right == GREATER)
return right;
if (left == LESS_OR_EQUAL || left == GREATER_OR_EQUAL)
return left;
if (right == LESS_OR_EQUAL || right == GREATER_OR_EQUAL)
return right;
if (left == EQUAL)
return left;
if (right == EQUAL)
return right;
return UNKNOWN;
}
/// Create OR-group for 'indexHint'.
/// Consider we have expression like A <op1> C, where C is constant.
/// Consider we have a constraint I <op2> A, where I depends only on columns from primary key.
/// Then if op1 and op2 form a chain of comparisons (e.g. A < C and I < A together imply I < C),
/// we can add the condition 'indexHint(I < C)' to the expression.
Analyzer::CNF::OrGroup createIndexHintGroup(
const Analyzer::CNF::OrGroup & group,
const ComparisonGraph<QueryTreeNodePtr> & graph,
const QueryTreeNodes & primary_key_only_nodes,
const ContextPtr & context)
{
Analyzer::CNF::OrGroup result;
for (const auto & atom : group)
{
const auto * function_node = atom.node_with_hash.node->as<FunctionNode>();
if (!function_node || !getRelationMap().contains(function_node->getFunctionName()))
continue;
const auto & arguments = function_node->getArguments().getNodes();
if (arguments.size() != 2)
continue;
auto check_and_insert = [&](const size_t index, const ComparisonGraphCompareResult expected_result)
{
if (!onlyConstants(arguments[1 - index]))
return false;
for (const auto & primary_key_node : primary_key_only_nodes)
{
ComparisonGraphCompareResult actual_result;
if (index == 0)
actual_result = graph.compare(primary_key_node, arguments[index]);
else
actual_result = graph.compare(arguments[index], primary_key_node);
if (canBeSequence(expected_result, actual_result))
{
auto helper_node = function_node->clone();
auto & helper_function_node = helper_node->as<FunctionNode &>();
helper_function_node.getArguments().getNodes()[index] = primary_key_node->clone();
auto reverse_function_name = getReverseRelationMap().at(mostStrict(expected_result, actual_result));
helper_function_node.resolveAsFunction(FunctionFactory::instance().get(reverse_function_name, context));
result.insert(Analyzer::CNF::AtomicFormula{atom.negative, std::move(helper_node)});
return true;
}
}
return false;
};
auto expected = getRelationMap().at(function_node->getFunctionName());
if (!check_and_insert(0, expected) && !check_and_insert(1, expected))
return {};
}
return result;
}
void addIndexConstraint(Analyzer::CNF & cnf, const QueryTreeNodes & table_expressions, const ContextPtr & context)
{
for (const auto & table_expression : table_expressions)
{
auto snapshot = getStorageSnapshot(table_expression);
if (!snapshot || !snapshot->metadata)
continue;
const auto primary_key = snapshot->metadata->getColumnsRequiredForPrimaryKey();
const std::unordered_set<std::string_view> primary_key_set(primary_key.begin(), primary_key.end());
const auto & query_tree_constraint = snapshot->metadata->getConstraints().getQueryTreeData(context, table_expression);
const auto & graph = query_tree_constraint.getGraph();
QueryTreeNodes primary_key_only_nodes;
for (const auto & vertex : graph.getVertices())
{
for (const auto & node : vertex)
{
if (onlyIndexColumns(node, primary_key_set))
primary_key_only_nodes.push_back(node);
}
}
Analyzer::CNF::AndGroup and_group;
const auto & statements = cnf.getStatements();
for (const auto & group : statements)
{
auto new_group = createIndexHintGroup(group, graph, primary_key_only_nodes, context);
if (!new_group.empty())
and_group.emplace(std::move(new_group));
}
if (!and_group.empty())
{
Analyzer::CNF::OrGroup new_group;
auto index_hint_node = std::make_shared<FunctionNode>("indexHint");
index_hint_node->getArguments().getNodes().push_back(Analyzer::CNF{std::move(and_group)}.toQueryTree(context));
index_hint_node->resolveAsFunction(FunctionFactory::instance().get("indexHint", context));
new_group.insert({false, QueryTreeNodePtrWithHash{std::move(index_hint_node)}});
cnf.appendGroup({new_group});
}
}
}
struct ColumnPrice
{
Int64 compressed_size{0};
Int64 uncompressed_size{0};
ColumnPrice(const Int64 compressed_size_, const Int64 uncompressed_size_)
: compressed_size(compressed_size_)
, uncompressed_size(uncompressed_size_)
{
}
bool operator<(const ColumnPrice & that) const
{
return std::tie(compressed_size, uncompressed_size) < std::tie(that.compressed_size, that.uncompressed_size);
}
ColumnPrice & operator+=(const ColumnPrice & that)
{
compressed_size += that.compressed_size;
uncompressed_size += that.uncompressed_size;
return *this;
}
ColumnPrice & operator-=(const ColumnPrice & that)
{
compressed_size -= that.compressed_size;
uncompressed_size -= that.uncompressed_size;
return *this;
}
};
using ColumnPriceByName = std::unordered_map<String, ColumnPrice>;
using ColumnPriceByQueryNode = QueryTreeNodePtrWithHashMap<ColumnPrice>;
class ComponentCollectorVisitor : public ConstInDepthQueryTreeVisitor<ComponentCollectorVisitor>
{
public:
ComponentCollectorVisitor(
std::set<UInt64> & components_,
QueryTreeNodePtrWithHashMap<UInt64> & query_node_to_component_,
const ComparisonGraph<QueryTreeNodePtr> & graph_)
: components(components_), query_node_to_component(query_node_to_component_), graph(graph_)
{}
void visitImpl(const QueryTreeNodePtr & node)
{
if (auto id = graph.getComponentId(node))
{
query_node_to_component.emplace(node, *id);
components.insert(*id);
}
}
private:
std::set<UInt64> & components;
QueryTreeNodePtrWithHashMap<UInt64> & query_node_to_component;
const ComparisonGraph<QueryTreeNodePtr> & graph;
};
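/// Collects the names of all columns referenced by a query tree node, skipping subtrees that
/// already belong to a known graph component (when a component map is provided).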
class ColumnNameCollectorVisitor : public ConstInDepthQueryTreeVisitor<ColumnNameCollectorVisitor>
{
public:
ColumnNameCollectorVisitor(
std::unordered_set<std::string> & column_names_,
const QueryTreeNodePtrWithHashMap<UInt64> * query_node_to_component_)
: column_names(column_names_), query_node_to_component(query_node_to_component_)
{}
bool needChildVisit(const VisitQueryTreeNodeType & parent, const VisitQueryTreeNodeType &)
{
return !query_node_to_component || !query_node_to_component->contains(parent);
}
void visitImpl(const QueryTreeNodePtr & node)
{
if (query_node_to_component && query_node_to_component->contains(node))
return;
if (const auto * column_node = node->as<ColumnNode>())
column_names.insert(column_node->getColumnName());
}
private:
std::unordered_set<std::string> & column_names;
const QueryTreeNodePtrWithHashMap<UInt64> * query_node_to_component;
};
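/// Replaces every node that belongs to a graph component with the representative expression chosen
/// for that component, inserting a CAST when the result types differ.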
class SubstituteColumnVisitor : public InDepthQueryTreeVisitor<SubstituteColumnVisitor>
{
public:
SubstituteColumnVisitor(
const QueryTreeNodePtrWithHashMap<UInt64> & query_node_to_component_,
const std::unordered_map<UInt64, QueryTreeNodePtr> & id_to_query_node_map_,
ContextPtr context_)
: query_node_to_component(query_node_to_component_), id_to_query_node_map(id_to_query_node_map_), context(std::move(context_))
{}
void visitImpl(QueryTreeNodePtr & node)
{
auto component_id_it = query_node_to_component.find(node);
if (component_id_it == query_node_to_component.end())
return;
const auto component_id = component_id_it->second;
auto new_node = id_to_query_node_map.at(component_id)->clone();
if (!node->getResultType()->equals(*new_node->getResultType()))
{
node = buildCastFunction(new_node, node->getResultType(), context);
return;
}
node = std::move(new_node);
}
private:
const QueryTreeNodePtrWithHashMap<UInt64> & query_node_to_component;
const std::unordered_map<UInt64, QueryTreeNodePtr> & id_to_query_node_map;
ContextPtr context;
};
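/// Sums the prices of the given columns; columns without a known price contribute nothing.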
ColumnPrice calculatePrice(
const ColumnPriceByName & column_prices,
const std::unordered_set<std::string> & column_names)
{
ColumnPrice result(0, 0);
for (const auto & column : column_names)
{
if (auto it = column_prices.find(column); it != column_prices.end())
result += it->second;
}
return result;
}
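/// Exhaustively tries one representative expression per component and keeps the cheapest combination.
/// Columns already paid for are re-priced to zero, so an expression reusing them is effectively free.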
void bruteForce(
const ComparisonGraph<QueryTreeNodePtr> & graph,
const std::vector<UInt64> & components,
size_t current_component,
const ColumnPriceByName & column_prices,
ColumnPrice current_price,
std::vector<QueryTreeNodePtr> & expressions_stack,
ColumnPrice & min_price,
std::vector<QueryTreeNodePtr> & min_expressions)
{
if (current_component == components.size())
{
if (current_price < min_price)
{
min_price = current_price;
min_expressions = expressions_stack;
}
return;
}
for (const auto & node : graph.getComponent(components[current_component]))
{
std::unordered_set<std::string> column_names;
ColumnNameCollectorVisitor column_name_collector{column_names, nullptr};
column_name_collector.visit(node);
ColumnPrice expression_price = calculatePrice(column_prices, column_names);
expressions_stack.push_back(node);
current_price += expression_price;
ColumnPriceByName new_prices(column_prices);
for (const auto & column : column_names)
new_prices.insert_or_assign(column, ColumnPrice(0, 0));
bruteForce(graph,
components,
current_component + 1,
new_prices,
current_price,
expressions_stack,
min_price,
min_expressions);
current_price -= expression_price;
expressions_stack.pop_back();
}
}
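/// Uses the constraint comparison graph to replace expressions in the SELECT, WHERE, PREWHERE and HAVING
/// sections with equivalent but cheaper ones, e.g. ones reading fewer or better-compressed columns.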
void substituteColumns(QueryNode & query_node, const QueryTreeNodes & table_expressions, const ContextPtr & context)
{
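    /// Every additional column to read is penalized, while primary key columns get a strongly negative
    /// price, so substitutions gravitate towards expressions already covered by the index.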
static constexpr UInt64 COLUMN_PENALTY = 10 * 1024 * 1024;
static constexpr Int64 INDEX_PRICE = -1'000'000'000'000'000'000;
for (const auto & table_expression : table_expressions)
{
auto snapshot = getStorageSnapshot(table_expression);
if (!snapshot || !snapshot->metadata)
continue;
const auto column_sizes = snapshot->storage.getColumnSizes();
if (column_sizes.empty())
return;
auto query_tree_constraint = snapshot->metadata->getConstraints().getQueryTreeData(context, table_expression);
const auto & graph = query_tree_constraint.getGraph();
auto run_for_all = [&](const auto function)
{
function(query_node.getProjectionNode());
if (query_node.hasWhere())
function(query_node.getWhere());
if (query_node.hasPrewhere())
function(query_node.getPrewhere());
if (query_node.hasHaving())
function(query_node.getHaving());
};
std::set<UInt64> components;
QueryTreeNodePtrWithHashMap<UInt64> query_node_to_component;
std::unordered_set<std::string> column_names;
run_for_all([&](QueryTreeNodePtr & node)
{
ComponentCollectorVisitor component_collector{components, query_node_to_component, graph};
component_collector.visit(node);
ColumnNameCollectorVisitor column_name_collector{column_names, &query_node_to_component};
column_name_collector.visit(node);
});
ColumnPriceByName column_prices;
const auto primary_key = snapshot->metadata->getColumnsRequiredForPrimaryKey();
for (const auto & [column_name, column_size] : column_sizes)
column_prices.insert_or_assign(column_name, ColumnPrice(column_size.data_compressed + COLUMN_PENALTY, column_size.data_uncompressed));
for (const auto & column_name : primary_key)
column_prices.insert_or_assign(column_name, ColumnPrice(INDEX_PRICE, INDEX_PRICE));
for (const auto & column_name : column_names)
column_prices.insert_or_assign(column_name, ColumnPrice(0, 0));
std::unordered_map<UInt64, QueryTreeNodePtr> id_to_query_node_map;
std::vector<UInt64> components_list;
for (const auto component_id : components)
{
auto component = graph.getComponent(component_id);
if (component.size() == 1)
id_to_query_node_map[component_id] = component.front();
else
components_list.push_back(component_id);
}
std::vector<QueryTreeNodePtr> expressions_stack;
ColumnPrice min_price(std::numeric_limits<Int64>::max(), std::numeric_limits<Int64>::max());
std::vector<QueryTreeNodePtr> min_expressions;
bruteForce(graph,
components_list,
0,
column_prices,
ColumnPrice(0, 0),
expressions_stack,
min_price,
min_expressions);
for (size_t i = 0; i < components_list.size(); ++i)
id_to_query_node_map[components_list[i]] = min_expressions[i];
SubstituteColumnVisitor substitute_column{query_node_to_component, id_to_query_node_map, context};
run_for_all([&](QueryTreeNodePtr & node)
{
substitute_column.visit(node);
});
}
}
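/// Simplifies the CNF using the table constraints: drops always-true groups and always-false atoms,
/// replaces terms with known constants, and (if optimize_append_index is set) appends index hints.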
void optimizeWithConstraints(Analyzer::CNF & cnf, const QueryTreeNodes & table_expressions, const ContextPtr & context)
{
cnf.pullNotOutFunctions(context);
for (const auto & table_expression : table_expressions)
{
auto snapshot = getStorageSnapshot(table_expression);
if (!snapshot || !snapshot->metadata)
continue;
const auto & constraints = snapshot->metadata->getConstraints();
const auto & query_tree_constraints = constraints.getQueryTreeData(context, table_expression);
const auto & compare_graph = query_tree_constraints.getGraph();
cnf.filterAlwaysTrueGroups([&](const auto & group)
{
/// Remove always-true groups from the CNF.
return !checkIfGroupAlwaysTrueFullMatch(group, query_tree_constraints) && !checkIfGroupAlwaysTrueGraph(group, compare_graph);
})
.filterAlwaysFalseAtoms([&](const Analyzer::CNF::AtomicFormula & atom)
{
/// Remove always-false atoms from the CNF.
return !checkIfAtomAlwaysFalseFullMatch(atom, query_tree_constraints) && !checkIfAtomAlwaysFalseGraph(atom, compare_graph);
})
.transformAtoms([&](const auto & atom)
{
return replaceTermsToConstants(atom, compare_graph);
})
.reduce();
}
cnf.pushNotIntoFunctions(context);
const auto & settings = context->getSettingsRef();
if (settings.optimize_append_index)
addIndexConstraint(cnf, table_expressions, context);
}
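/// Converts a filter node to CNF and, if optimize_using_constraints is set, optimizes it using constraints.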
void optimizeNode(QueryTreeNodePtr & node, const QueryTreeNodes & table_expressions, const ContextPtr & context)
{
const auto & settings = context->getSettingsRef();
auto cnf = tryConvertQueryToCNF(node, context);
if (!cnf)
return;
if (settings.optimize_using_constraints)
optimizeWithConstraints(*cnf, table_expressions, context);
auto new_node = cnf->toQueryTree(context);
node = std::move(new_node);
}
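/// Applies the CNF conversion to the WHERE, PREWHERE and HAVING sections of every query node
/// and, if optimize_substitute_columns is set, substitutes cheaper equivalent expressions afterwards.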
class ConvertQueryToCNFVisitor : public InDepthQueryTreeVisitorWithContext<ConvertQueryToCNFVisitor>
{
public:
using Base = InDepthQueryTreeVisitorWithContext<ConvertQueryToCNFVisitor>;
using Base::Base;
void visitImpl(QueryTreeNodePtr & node)
{
auto * query_node = node->as<QueryNode>();
if (!query_node)
return;
auto table_expressions = extractTableExpressions(query_node->getJoinTree());
const auto & context = getContext();
const auto & settings = context->getSettingsRef();
bool has_filter = false;
const auto optimize_filter = [&](QueryTreeNodePtr & filter_node)
{
if (filter_node == nullptr)
return;
optimizeNode(filter_node, table_expressions, context);
has_filter = true;
};
optimize_filter(query_node->getWhere());
optimize_filter(query_node->getPrewhere());
optimize_filter(query_node->getHaving());
if (has_filter && settings.optimize_substitute_columns)
substituteColumns(*query_node, table_expressions, context);
}
};
}
void ConvertLogicalExpressionToCNFPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context)
{
const auto & settings = context->getSettingsRef();
if (!settings.convert_query_to_cnf)
return;
ConvertQueryToCNFVisitor visitor(std::move(context));
visitor.visit(query_tree_node);
}
}
View File
@ -0,0 +1,18 @@
#pragma once
#include <Analyzer/IQueryTreePass.h>
namespace DB
{
class ConvertLogicalExpressionToCNFPass final : public IQueryTreePass
{
public:
String getName() override { return "ConvertLogicalExpressionToCNFPass"; }
String getDescription() override { return "Convert logical expression to CNF and apply optimizations using constraints"; }
void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override;
};
}
View File
@ -32,6 +32,7 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <TableFunctions/TableFunctionFactory.h>
#include <Formats/FormatFactory.h>
#include <Databases/IDatabase.h>
@ -75,6 +76,7 @@
#include <Analyzer/InDepthQueryTreeVisitor.h>
#include <Analyzer/QueryTreeBuilder.h>
#include <Analyzer/IQueryTreeNode.h>
#include <Analyzer/Identifier.h>
namespace ProfileEvents
{
@ -112,6 +114,8 @@ namespace ErrorCodes
extern const int ALIAS_REQUIRED;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int UNKNOWN_TABLE;
extern const int ILLEGAL_COLUMN;
extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH;
}
/** Query analyzer implementation overview. Please check the documentation in QueryAnalysisPass.h first.
@ -5085,8 +5089,11 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
arguments_projection_names[function_lambda_argument_index] = lambda_argument_projection_name_buffer.str();
}
argument_types[function_lambda_argument_index] = std::make_shared<DataTypeFunction>(function_data_type_argument_types, lambda_to_resolve->getResultType());
argument_columns[function_lambda_argument_index].type = argument_types[function_lambda_argument_index];
auto lambda_resolved_type = std::make_shared<DataTypeFunction>(function_data_type_argument_types, lambda_to_resolve_typed.getExpression()->getResultType());
lambda_to_resolve_typed.resolve(lambda_resolved_type);
argument_types[function_lambda_argument_index] = lambda_resolved_type;
argument_columns[function_lambda_argument_index].type = lambda_resolved_type;
function_arguments[function_lambda_argument_index] = std::move(lambda_to_resolve);
}
@ -6076,6 +6083,18 @@ void QueryAnalyzer::initializeTableExpressionData(const QueryTreeNodePtr & table
scope.table_expression_node_to_data.emplace(table_expression_node, std::move(table_expression_data));
}
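/// Returns true if the function references an identifier, possibly through nested function calls.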
bool findIdentifier(const FunctionNode & function)
{
for (const auto & argument : function.getArguments())
{
if (argument->as<IdentifierNode>())
return true;
if (const auto * f = argument->as<FunctionNode>(); f && findIdentifier(*f))
return true;
}
return false;
}
/// Resolve table function node in scope
void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node,
IdentifierResolveScope & scope,
@ -6087,12 +6106,11 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node,
if (!nested_table_function)
expressions_visitor.visit(table_function_node_typed.getArgumentsNode());
const auto & table_function_factory = TableFunctionFactory::instance();
const auto & table_function_name = table_function_node_typed.getTableFunctionName();
auto & scope_context = scope.context;
TableFunctionPtr table_function_ptr = table_function_factory.tryGet(table_function_name, scope_context);
TableFunctionPtr table_function_ptr = TableFunctionFactory::instance().tryGet(table_function_name, scope_context);
if (!table_function_ptr)
{
auto hints = TableFunctionFactory::instance().getHints(table_function_name);
@ -6107,17 +6125,131 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node,
table_function_name);
}
uint64_t use_structure_from_insertion_table_in_table_functions = scope_context->getSettingsRef().use_structure_from_insertion_table_in_table_functions;
if (!nested_table_function &&
scope_context->getSettingsRef().use_structure_from_insertion_table_in_table_functions &&
use_structure_from_insertion_table_in_table_functions &&
scope_context->hasInsertionTable() &&
table_function_ptr->needStructureHint())
{
const auto & insertion_table = scope_context->getInsertionTable();
if (!insertion_table.empty())
{
auto insertion_table_storage = DatabaseCatalog::instance().getTable(insertion_table, scope_context);
const auto & structure_hint = insertion_table_storage->getInMemoryMetadataPtr()->columns;
table_function_ptr->setStructureHint(structure_hint);
const auto & insert_structure = DatabaseCatalog::instance().getTable(insertion_table, scope_context)->getInMemoryMetadataPtr()->getColumns();
DB::ColumnsDescription structure_hint;
bool use_columns_from_insert_query = true;
/// The insert table matches columns against the SELECT expression by position, so we map
/// insert table columns to table function columns through the names used in the SELECT expression.
auto insert_column = insert_structure.begin();
auto insert_structure_end = insert_structure.end(); /// end iterator of the range covered by possible asterisk
auto virtual_column_names = table_function_ptr->getVirtualsToCheckBeforeUsingStructureHint();
bool asterisk = false;
const auto & expression_list = scope.scope_node->as<QueryNode &>().getProjection();
auto expression = expression_list.begin();
/// We walk the SELECT expression list and match each expression to a column in the insert table
/// whose type will then be used as a hint for the file structure inference.
for (; expression != expression_list.end() && insert_column != insert_structure_end; ++expression)
{
if (auto * identifier_node = (*expression)->as<IdentifierNode>())
{
if (!virtual_column_names.contains(identifier_node->getIdentifier().getFullName()))
{
if (asterisk)
{
if (use_structure_from_insertion_table_in_table_functions == 1)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Asterisk cannot be mixed with column list in INSERT SELECT query.");
use_columns_from_insert_query = false;
break;
}
structure_hint.add({ identifier_node->getIdentifier().getFullName(), insert_column->type });
}
/// Once we hit an asterisk we need to find the end of the range it covers,
/// attributing every further SELECT expression to the tail of the insert structure
if (asterisk)
--insert_structure_end;
else
++insert_column;
}
else if (auto * matcher_node = (*expression)->as<MatcherNode>(); matcher_node && matcher_node->getMatcherType() == MatcherNodeType::ASTERISK)
{
if (asterisk)
{
if (use_structure_from_insertion_table_in_table_functions == 1)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Only one asterisk can be used in INSERT SELECT query.");
use_columns_from_insert_query = false;
break;
}
if (!structure_hint.empty())
{
if (use_structure_from_insertion_table_in_table_functions == 1)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Asterisk cannot be mixed with column list in INSERT SELECT query.");
use_columns_from_insert_query = false;
break;
}
asterisk = true;
}
else if (auto * function = (*expression)->as<FunctionNode>())
{
if (use_structure_from_insertion_table_in_table_functions == 2 && findIdentifier(*function))
{
use_columns_from_insert_query = false;
break;
}
/// Once we hit an asterisk we need to find the end of the range it covers,
/// attributing every further SELECT expression to the tail of the insert structure
if (asterisk)
--insert_structure_end;
else
++insert_column;
}
else
{
/// Once we hit an asterisk we need to find the end of the range it covers,
/// attributing every further SELECT expression to the tail of the insert structure
if (asterisk)
--insert_structure_end;
else
++insert_column;
}
}
if (use_structure_from_insertion_table_in_table_functions == 2 && !asterisk)
{
/// For the input() table function we should check whether the input format supports reading a subset of columns.
if (table_function_ptr->getName() == "input")
use_columns_from_insert_query = FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(scope.context->getInsertFormat());
else
use_columns_from_insert_query = table_function_ptr->supportsReadingSubsetOfColumns();
}
if (use_columns_from_insert_query)
{
if (expression == expression_list.end())
{
/// Append tail of insert structure to the hint
if (asterisk)
{
for (; insert_column != insert_structure_end; ++insert_column)
structure_hint.add({ insert_column->name, insert_column->type });
}
if (!structure_hint.empty())
table_function_ptr->setStructureHint(structure_hint);
}
else if (use_structure_from_insertion_table_in_table_functions == 1)
throw Exception(ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH, "Number of columns in insert table is less than required by SELECT expression.");
}
}
}
View File
@ -41,7 +41,7 @@
#include <Analyzer/Passes/LogicalExpressionOptimizerPass.h>
#include <Analyzer/Passes/CrossToInnerJoinPass.h>
#include <Analyzer/Passes/ShardNumColumnToFunctionPass.h>
#include <Analyzer/Passes/ConvertQueryToCNFPass.h>
namespace DB
{
@ -115,13 +115,23 @@ private:
for (size_t i = 0; i < expected_argument_types_size; ++i)
{
// Skip lambdas
if (WhichDataType(expected_argument_types[i]).isFunction())
continue;
const auto & expected_argument_type = expected_argument_types[i];
const auto & actual_argument_type = actual_argument_columns[i].type;
if (!expected_argument_type)
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Function {} expected argument {} type is not set after running {} pass",
function->toAST()->formatForErrorMessage(),
i + 1,
pass_name);
if (!actual_argument_type)
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Function {} actual argument {} type is not set after running {} pass",
function->toAST()->formatForErrorMessage(),
i + 1,
pass_name);
if (!expected_argument_type->equals(*actual_argument_type))
{
/// Aggregate functions remove low cardinality for their argument types
@ -148,8 +158,6 @@ private:
/** ClickHouse query tree pass manager.
*
* TODO: Support setting convert_query_to_cnf.
* TODO: Support setting optimize_using_constraints.
* TODO: Support setting optimize_substitute_columns.
* TODO: Support GROUP BY injective function elimination.
* TODO: Support setting optimize_move_functions_out_of_any.
@ -235,6 +243,8 @@ void addQueryTreePasses(QueryTreePassManager & manager)
manager.addPass(std::make_unique<QueryAnalysisPass>());
manager.addPass(std::make_unique<FunctionToSubcolumnsPass>());
manager.addPass(std::make_unique<ConvertLogicalExpressionToCNFPass>());
manager.addPass(std::make_unique<CountDistinctPass>());
manager.addPass(std::make_unique<RewriteAggregateFunctionWithIfPass>());
manager.addPass(std::make_unique<SumIfToCountIfPass>());
View File
@ -8,7 +8,8 @@
namespace DB
{
BackupCoordinationLocal::BackupCoordinationLocal(bool plain_backup_) : file_infos(plain_backup_)
BackupCoordinationLocal::BackupCoordinationLocal(bool plain_backup_)
: log(&Poco::Logger::get("BackupCoordinationLocal")), file_infos(plain_backup_)
{
}
@ -35,7 +36,7 @@ Strings BackupCoordinationLocal::waitForStage(const String &, std::chrono::milli
void BackupCoordinationLocal::addReplicatedPartNames(const String & table_shared_id, const String & table_name_for_logs, const String & replica_name, const std::vector<PartNameAndChecksum> & part_names_and_checksums)
{
std::lock_guard lock{replicated_tables_mutex};
replicated_tables.addPartNames(table_shared_id, table_name_for_logs, replica_name, part_names_and_checksums);
replicated_tables.addPartNames({table_shared_id, table_name_for_logs, replica_name, part_names_and_checksums});
}
Strings BackupCoordinationLocal::getReplicatedPartNames(const String & table_shared_id, const String & replica_name) const
@ -48,7 +49,7 @@ Strings BackupCoordinationLocal::getReplicatedPartNames(const String & table_sha
void BackupCoordinationLocal::addReplicatedMutations(const String & table_shared_id, const String & table_name_for_logs, const String & replica_name, const std::vector<MutationInfo> & mutations)
{
std::lock_guard lock{replicated_tables_mutex};
replicated_tables.addMutations(table_shared_id, table_name_for_logs, replica_name, mutations);
replicated_tables.addMutations({table_shared_id, table_name_for_logs, replica_name, mutations});
}
std::vector<IBackupCoordination::MutationInfo> BackupCoordinationLocal::getReplicatedMutations(const String & table_shared_id, const String & replica_name) const
@ -61,7 +62,7 @@ std::vector<IBackupCoordination::MutationInfo> BackupCoordinationLocal::getRepli
void BackupCoordinationLocal::addReplicatedDataPath(const String & table_shared_id, const String & data_path)
{
std::lock_guard lock{replicated_tables_mutex};
replicated_tables.addDataPath(table_shared_id, data_path);
replicated_tables.addDataPath({table_shared_id, data_path});
}
Strings BackupCoordinationLocal::getReplicatedDataPaths(const String & table_shared_id) const
@ -74,7 +75,7 @@ Strings BackupCoordinationLocal::getReplicatedDataPaths(const String & table_sha
void BackupCoordinationLocal::addReplicatedAccessFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & file_path)
{
std::lock_guard lock{replicated_access_mutex};
replicated_access.addFilePath(access_zk_path, access_entity_type, "", file_path);
replicated_access.addFilePath({access_zk_path, access_entity_type, "", file_path});
}
Strings BackupCoordinationLocal::getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type) const
@ -87,7 +88,7 @@ Strings BackupCoordinationLocal::getReplicatedAccessFilePaths(const String & acc
void BackupCoordinationLocal::addReplicatedSQLObjectsDir(const String & loader_zk_path, UserDefinedSQLObjectType object_type, const String & dir_path)
{
std::lock_guard lock{replicated_sql_objects_mutex};
replicated_sql_objects.addDirectory(loader_zk_path, object_type, "", dir_path);
replicated_sql_objects.addDirectory({loader_zk_path, object_type, "", dir_path});
}
Strings BackupCoordinationLocal::getReplicatedSQLObjectsDirs(const String & loader_zk_path, UserDefinedSQLObjectType object_type) const
@ -125,7 +126,12 @@ bool BackupCoordinationLocal::startWritingFile(size_t data_file_index)
bool BackupCoordinationLocal::hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const
{
return (num_active_backups > 1);
if (num_active_backups > 1)
{
LOG_WARNING(log, "Found concurrent backups: num_active_backups={}", num_active_backups);
return true;
}
return false;
}
}
View File
@ -52,6 +52,8 @@ public:
bool hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const override;
private:
Poco::Logger * const log;
BackupCoordinationReplicatedTables TSA_GUARDED_BY(replicated_tables_mutex) replicated_tables;
BackupCoordinationReplicatedAccess TSA_GUARDED_BY(replicated_access_mutex) replicated_access;
BackupCoordinationReplicatedSQLObjects TSA_GUARDED_BY(replicated_sql_objects_mutex) replicated_sql_objects;
View File
@ -1,13 +1,18 @@
#include <Backups/BackupCoordinationRemote.h>
#include <base/hex.h>
#include <Access/Common/AccessEntityType.h>
#include <Backups/BackupCoordinationReplicatedAccess.h>
#include <Backups/BackupCoordinationStage.h>
#include <Common/escapeForFileName.h>
#include <Common/ZooKeeper/Common.h>
#include <Common/ZooKeeper/KeeperException.h>
#include <Functions/UserDefined/UserDefinedSQLObjectType.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromString.h>
#include <IO/WriteHelpers.h>
#include <Common/ZooKeeper/KeeperException.h>
#include <Common/escapeForFileName.h>
#include <Backups/BackupCoordinationStage.h>
namespace DB
@ -154,8 +159,7 @@ BackupCoordinationRemote::BackupCoordinationRemote(
const String & current_host_,
bool plain_backup_,
bool is_internal_)
: get_zookeeper(get_zookeeper_)
, root_zookeeper_path(root_zookeeper_path_)
: root_zookeeper_path(root_zookeeper_path_)
, zookeeper_path(root_zookeeper_path_ + "/backup-" + backup_uuid_)
, keeper_settings(keeper_settings_)
, backup_uuid(backup_uuid_)
@ -164,17 +168,33 @@ BackupCoordinationRemote::BackupCoordinationRemote(
, current_host_index(findCurrentHostIndex(all_hosts, current_host))
, plain_backup(plain_backup_)
, is_internal(is_internal_)
{
zookeeper_retries_info = ZooKeeperRetriesInfo(
"BackupCoordinationRemote",
&Poco::Logger::get("BackupCoordinationRemote"),
keeper_settings.keeper_max_retries,
keeper_settings.keeper_retry_initial_backoff_ms,
keeper_settings.keeper_retry_max_backoff_ms);
, log(&Poco::Logger::get("BackupCoordinationRemote"))
, with_retries(
log,
get_zookeeper_,
keeper_settings,
[zookeeper_path = zookeeper_path, current_host = current_host, is_internal = is_internal]
(WithRetries::FaultyKeeper & zk)
{
/// Recreate this ephemeral node to signal that we are alive.
if (is_internal)
{
String alive_node_path = zookeeper_path + "/stage/alive|" + current_host;
auto code = zk->tryCreate(alive_node_path, "", zkutil::CreateMode::Ephemeral);
if (code == Coordination::Error::ZNODEEXISTS)
zk->handleEphemeralNodeExistenceNoFailureInjection(alive_node_path, "");
else if (code != Coordination::Error::ZOK)
throw zkutil::KeeperException(code, alive_node_path);
}
})
{
createRootNodes();
stage_sync.emplace(
zookeeper_path + "/stage", [this] { return getZooKeeper(); }, &Poco::Logger::get("BackupCoordination"));
zookeeper_path,
with_retries,
log);
}
BackupCoordinationRemote::~BackupCoordinationRemote()
@ -190,44 +210,45 @@ BackupCoordinationRemote::~BackupCoordinationRemote()
}
}
zkutil::ZooKeeperPtr BackupCoordinationRemote::getZooKeeper() const
{
std::lock_guard lock{zookeeper_mutex};
if (!zookeeper || zookeeper->expired())
{
zookeeper = get_zookeeper();
/// It's possible that we connected to different [Zoo]Keeper instance
/// so we may read a bit stale state.
zookeeper->sync(zookeeper_path);
}
return zookeeper;
}
void BackupCoordinationRemote::createRootNodes()
{
auto zk = getZooKeeper();
zk->createAncestors(zookeeper_path);
zk->createIfNotExists(zookeeper_path, "");
zk->createIfNotExists(zookeeper_path + "/repl_part_names", "");
zk->createIfNotExists(zookeeper_path + "/repl_mutations", "");
zk->createIfNotExists(zookeeper_path + "/repl_data_paths", "");
zk->createIfNotExists(zookeeper_path + "/repl_access", "");
zk->createIfNotExists(zookeeper_path + "/repl_sql_objects", "");
zk->createIfNotExists(zookeeper_path + "/file_infos", "");
zk->createIfNotExists(zookeeper_path + "/writing_files", "");
auto holder = with_retries.createRetriesControlHolder("createRootNodes");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zk);
zk->createAncestors(zookeeper_path);
Coordination::Requests ops;
Coordination::Responses responses;
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_part_names", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_mutations", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_data_paths", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_access", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_sql_objects", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/file_infos", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/writing_files", "", zkutil::CreateMode::Persistent));
zk->tryMulti(ops, responses);
});
}
void BackupCoordinationRemote::removeAllNodes()
{
/// Usually this function is called by the initiator when a backup is complete so we don't need the coordination anymore.
///
/// However there can be a rare situation when this function is called after an error occurs on the initiator of a query
/// while some hosts are still making the backup. Removing all the nodes will remove the parent node of the backup coordination
/// at `zookeeper_path` which might cause such hosts to stop with exception "ZNONODE". Or such hosts might still do some useless part
/// of their backup work before that. In any case the backup won't be finalized (because only the initiator can do that).
auto zk = getZooKeeper();
zk->removeRecursive(zookeeper_path);
auto holder = with_retries.createRetriesControlHolder("removeAllNodes");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
/// Usually this function is called by the initiator when a backup is complete so we don't need the coordination anymore.
///
/// However there can be a rare situation when this function is called after an error occurs on the initiator of a query
/// while some hosts are still making the backup. Removing all the nodes will remove the parent node of the backup coordination
/// at `zookeeper_path` which might cause such hosts to stop with exception "ZNONODE". Or such hosts might still do some useless part
/// of their backup work before that. In any case the backup won't be finalized (because only the initiator can do that).
with_retries.renewZooKeeper(zk);
zk->removeRecursive(zookeeper_path);
});
}
@ -255,10 +276,11 @@ Strings BackupCoordinationRemote::waitForStage(const String & stage_to_wait, std
void BackupCoordinationRemote::serializeToMultipleZooKeeperNodes(const String & path, const String & value, const String & logging_name)
{
{
ZooKeeperRetriesControl retries_ctl(logging_name + "::create", zookeeper_retries_info);
retries_ctl.retryLoop([&]
auto holder = with_retries.createRetriesControlHolder(logging_name + "::create");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
auto zk = getZooKeeper();
with_retries.renewZooKeeper(zk);
zk->createIfNotExists(path, "");
});
}
@ -279,10 +301,11 @@ void BackupCoordinationRemote::serializeToMultipleZooKeeperNodes(const String &
String part = value.substr(begin, end - begin);
String part_path = fmt::format("{}/{:06}", path, i);
ZooKeeperRetriesControl retries_ctl(logging_name + "::createPart", zookeeper_retries_info);
retries_ctl.retryLoop([&]
auto holder = with_retries.createRetriesControlHolder(logging_name + "::createPart");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
auto zk = getZooKeeper();
with_retries.renewZooKeeper(zk);
zk->createIfNotExists(part_path, part);
});
}
@ -293,9 +316,11 @@ String BackupCoordinationRemote::deserializeFromMultipleZooKeeperNodes(const Str
Strings part_names;
{
ZooKeeperRetriesControl retries_ctl(logging_name + "::getChildren", zookeeper_retries_info);
retries_ctl.retryLoop([&]{
auto zk = getZooKeeper();
auto holder = with_retries.createRetriesControlHolder(logging_name + "::getChildren");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zk);
part_names = zk->getChildren(path);
std::sort(part_names.begin(), part_names.end());
});
@ -306,10 +331,11 @@ String BackupCoordinationRemote::deserializeFromMultipleZooKeeperNodes(const Str
{
String part;
String part_path = path + "/" + part_name;
ZooKeeperRetriesControl retries_ctl(logging_name + "::get", zookeeper_retries_info);
retries_ctl.retryLoop([&]
auto holder = with_retries.createRetriesControlHolder(logging_name + "::get");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
auto zk = getZooKeeper();
with_retries.renewZooKeeper(zk);
part = zk->get(part_path);
});
res += part;
@ -330,11 +356,16 @@ void BackupCoordinationRemote::addReplicatedPartNames(
throw Exception(ErrorCodes::LOGICAL_ERROR, "addReplicatedPartNames() must not be called after preparing");
}
auto zk = getZooKeeper();
String path = zookeeper_path + "/repl_part_names/" + escapeForFileName(table_shared_id);
zk->createIfNotExists(path, "");
path += "/" + escapeForFileName(replica_name);
zk->create(path, ReplicatedPartNames::serialize(part_names_and_checksums, table_name_for_logs), zkutil::CreateMode::Persistent);
auto holder = with_retries.createRetriesControlHolder("addReplicatedPartNames");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zk);
String path = zookeeper_path + "/repl_part_names/" + escapeForFileName(table_shared_id);
zk->createIfNotExists(path, "");
path += "/" + escapeForFileName(replica_name);
zk->createIfNotExists(path, ReplicatedPartNames::serialize(part_names_and_checksums, table_name_for_logs));
});
}
Strings BackupCoordinationRemote::getReplicatedPartNames(const String & table_shared_id, const String & replica_name) const
@ -356,11 +387,16 @@ void BackupCoordinationRemote::addReplicatedMutations(
throw Exception(ErrorCodes::LOGICAL_ERROR, "addReplicatedMutations() must not be called after preparing");
}
auto zk = getZooKeeper();
String path = zookeeper_path + "/repl_mutations/" + escapeForFileName(table_shared_id);
zk->createIfNotExists(path, "");
path += "/" + escapeForFileName(replica_name);
zk->create(path, ReplicatedMutations::serialize(mutations, table_name_for_logs), zkutil::CreateMode::Persistent);
auto holder = with_retries.createRetriesControlHolder("addReplicatedMutations");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zk);
String path = zookeeper_path + "/repl_mutations/" + escapeForFileName(table_shared_id);
zk->createIfNotExists(path, "");
path += "/" + escapeForFileName(replica_name);
zk->createIfNotExists(path, ReplicatedMutations::serialize(mutations, table_name_for_logs));
});
}
std::vector<IBackupCoordination::MutationInfo> BackupCoordinationRemote::getReplicatedMutations(const String & table_shared_id, const String & replica_name) const
@ -380,11 +416,16 @@ void BackupCoordinationRemote::addReplicatedDataPath(
throw Exception(ErrorCodes::LOGICAL_ERROR, "addReplicatedDataPath() must not be called after preparing");
}
auto zk = getZooKeeper();
String path = zookeeper_path + "/repl_data_paths/" + escapeForFileName(table_shared_id);
zk->createIfNotExists(path, "");
path += "/" + escapeForFileName(data_path);
zk->createIfNotExists(path, "");
auto holder = with_retries.createRetriesControlHolder("addReplicatedDataPath");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zk);
String path = zookeeper_path + "/repl_data_paths/" + escapeForFileName(table_shared_id);
zk->createIfNotExists(path, "");
path += "/" + escapeForFileName(data_path);
zk->createIfNotExists(path, "");
});
}
Strings BackupCoordinationRemote::getReplicatedDataPaths(const String & table_shared_id) const
@ -400,55 +441,88 @@ void BackupCoordinationRemote::prepareReplicatedTables() const
if (replicated_tables)
return;
std::vector<BackupCoordinationReplicatedTables::PartNamesForTableReplica> part_names_for_replicated_tables;
{
auto holder = with_retries.createRetriesControlHolder("prepareReplicatedTables::repl_part_names");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
part_names_for_replicated_tables.clear();
with_retries.renewZooKeeper(zk);
String path = zookeeper_path + "/repl_part_names";
for (const String & escaped_table_shared_id : zk->getChildren(path))
{
String table_shared_id = unescapeForFileName(escaped_table_shared_id);
String path2 = path + "/" + escaped_table_shared_id;
for (const String & escaped_replica_name : zk->getChildren(path2))
{
String replica_name = unescapeForFileName(escaped_replica_name);
auto part_names = ReplicatedPartNames::deserialize(zk->get(path2 + "/" + escaped_replica_name));
part_names_for_replicated_tables.push_back(
{table_shared_id, part_names.table_name_for_logs, replica_name, part_names.part_names_and_checksums});
}
}
});
}
std::vector<BackupCoordinationReplicatedTables::MutationsForTableReplica> mutations_for_replicated_tables;
{
auto holder = with_retries.createRetriesControlHolder("prepareReplicatedTables::repl_mutations");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
mutations_for_replicated_tables.clear();
with_retries.renewZooKeeper(zk);
String path = zookeeper_path + "/repl_mutations";
for (const String & escaped_table_shared_id : zk->getChildren(path))
{
String table_shared_id = unescapeForFileName(escaped_table_shared_id);
String path2 = path + "/" + escaped_table_shared_id;
for (const String & escaped_replica_name : zk->getChildren(path2))
{
String replica_name = unescapeForFileName(escaped_replica_name);
auto mutations = ReplicatedMutations::deserialize(zk->get(path2 + "/" + escaped_replica_name));
mutations_for_replicated_tables.push_back(
{table_shared_id, mutations.table_name_for_logs, replica_name, mutations.mutations});
}
}
});
}
std::vector<BackupCoordinationReplicatedTables::DataPathForTableReplica> data_paths_for_replicated_tables;
{
auto holder = with_retries.createRetriesControlHolder("prepareReplicatedTables::repl_data_paths");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
data_paths_for_replicated_tables.clear();
with_retries.renewZooKeeper(zk);
String path = zookeeper_path + "/repl_data_paths";
for (const String & escaped_table_shared_id : zk->getChildren(path))
{
String table_shared_id = unescapeForFileName(escaped_table_shared_id);
String path2 = path + "/" + escaped_table_shared_id;
for (const String & escaped_data_path : zk->getChildren(path2))
{
String data_path = unescapeForFileName(escaped_data_path);
data_paths_for_replicated_tables.push_back({table_shared_id, data_path});
}
}
});
}
replicated_tables.emplace();
auto zk = getZooKeeper();
{
String path = zookeeper_path + "/repl_part_names";
for (const String & escaped_table_shared_id : zk->getChildren(path))
{
String table_shared_id = unescapeForFileName(escaped_table_shared_id);
String path2 = path + "/" + escaped_table_shared_id;
for (const String & escaped_replica_name : zk->getChildren(path2))
{
String replica_name = unescapeForFileName(escaped_replica_name);
auto part_names = ReplicatedPartNames::deserialize(zk->get(path2 + "/" + escaped_replica_name));
replicated_tables->addPartNames(table_shared_id, part_names.table_name_for_logs, replica_name, part_names.part_names_and_checksums);
}
}
}
{
String path = zookeeper_path + "/repl_mutations";
for (const String & escaped_table_shared_id : zk->getChildren(path))
{
String table_shared_id = unescapeForFileName(escaped_table_shared_id);
String path2 = path + "/" + escaped_table_shared_id;
for (const String & escaped_replica_name : zk->getChildren(path2))
{
String replica_name = unescapeForFileName(escaped_replica_name);
auto mutations = ReplicatedMutations::deserialize(zk->get(path2 + "/" + escaped_replica_name));
replicated_tables->addMutations(table_shared_id, mutations.table_name_for_logs, replica_name, mutations.mutations);
}
}
}
{
String path = zookeeper_path + "/repl_data_paths";
for (const String & escaped_table_shared_id : zk->getChildren(path))
{
String table_shared_id = unescapeForFileName(escaped_table_shared_id);
String path2 = path + "/" + escaped_table_shared_id;
for (const String & escaped_data_path : zk->getChildren(path2))
{
String data_path = unescapeForFileName(escaped_data_path);
replicated_tables->addDataPath(table_shared_id, data_path);
}
}
}
for (auto & part_names : part_names_for_replicated_tables)
replicated_tables->addPartNames(std::move(part_names));
for (auto & mutations : mutations_for_replicated_tables)
replicated_tables->addMutations(std::move(mutations));
for (auto & data_paths : data_paths_for_replicated_tables)
replicated_tables->addDataPath(std::move(data_paths));
}
void BackupCoordinationRemote::addReplicatedAccessFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & file_path)
{
{
@ -457,13 +531,18 @@ void BackupCoordinationRemote::addReplicatedAccessFilePath(const String & access
throw Exception(ErrorCodes::LOGICAL_ERROR, "addReplicatedAccessFilePath() must not be called after preparing");
}
auto zk = getZooKeeper();
String path = zookeeper_path + "/repl_access/" + escapeForFileName(access_zk_path);
zk->createIfNotExists(path, "");
path += "/" + AccessEntityTypeInfo::get(access_entity_type).name;
zk->createIfNotExists(path, "");
path += "/" + current_host;
zk->createIfNotExists(path, file_path);
auto holder = with_retries.createRetriesControlHolder("addReplicatedAccessFilePath");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zk);
String path = zookeeper_path + "/repl_access/" + escapeForFileName(access_zk_path);
zk->createIfNotExists(path, "");
path += "/" + AccessEntityTypeInfo::get(access_entity_type).name;
zk->createIfNotExists(path, "");
path += "/" + current_host;
zk->createIfNotExists(path, file_path);
});
}
Strings BackupCoordinationRemote::getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type) const
@ -478,25 +557,35 @@ void BackupCoordinationRemote::prepareReplicatedAccess() const
if (replicated_access)
return;
replicated_access.emplace();
auto zk = getZooKeeper();
String path = zookeeper_path + "/repl_access";
for (const String & escaped_access_zk_path : zk->getChildren(path))
std::vector<BackupCoordinationReplicatedAccess::FilePathForAccessEntity> file_path_for_access_entities;
auto holder = with_retries.createRetriesControlHolder("prepareReplicatedAccess");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
String access_zk_path = unescapeForFileName(escaped_access_zk_path);
String path2 = path + "/" + escaped_access_zk_path;
for (const String & type_str : zk->getChildren(path2))
file_path_for_access_entities.clear();
with_retries.renewZooKeeper(zk);
String path = zookeeper_path + "/repl_access";
for (const String & escaped_access_zk_path : zk->getChildren(path))
{
AccessEntityType type = AccessEntityTypeInfo::parseType(type_str);
String path3 = path2 + "/" + type_str;
for (const String & host_id : zk->getChildren(path3))
String access_zk_path = unescapeForFileName(escaped_access_zk_path);
String path2 = path + "/" + escaped_access_zk_path;
for (const String & type_str : zk->getChildren(path2))
{
String file_path = zk->get(path3 + "/" + host_id);
replicated_access->addFilePath(access_zk_path, type, host_id, file_path);
AccessEntityType type = AccessEntityTypeInfo::parseType(type_str);
String path3 = path2 + "/" + type_str;
for (const String & host_id : zk->getChildren(path3))
{
String file_path = zk->get(path3 + "/" + host_id);
file_path_for_access_entities.push_back({access_zk_path, type, host_id, file_path});
}
}
}
}
});
replicated_access.emplace();
for (auto & file_path : file_path_for_access_entities)
replicated_access->addFilePath(std::move(file_path));
}
void BackupCoordinationRemote::addReplicatedSQLObjectsDir(const String & loader_zk_path, UserDefinedSQLObjectType object_type, const String & dir_path)
@ -507,21 +596,26 @@ void BackupCoordinationRemote::addReplicatedSQLObjectsDir(const String & loader_
throw Exception(ErrorCodes::LOGICAL_ERROR, "addReplicatedSQLObjectsDir() must not be called after preparing");
}
auto zk = getZooKeeper();
String path = zookeeper_path + "/repl_sql_objects/" + escapeForFileName(loader_zk_path);
zk->createIfNotExists(path, "");
path += "/";
switch (object_type)
auto holder = with_retries.createRetriesControlHolder("addReplicatedSQLObjectsDir");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
case UserDefinedSQLObjectType::Function:
path += "functions";
break;
}
with_retries.renewZooKeeper(zk);
String path = zookeeper_path + "/repl_sql_objects/" + escapeForFileName(loader_zk_path);
zk->createIfNotExists(path, "");
zk->createIfNotExists(path, "");
path += "/" + current_host;
zk->createIfNotExists(path, dir_path);
path += "/";
switch (object_type)
{
case UserDefinedSQLObjectType::Function:
path += "functions";
break;
}
zk->createIfNotExists(path, "");
path += "/" + current_host;
zk->createIfNotExists(path, dir_path);
});
}
Strings BackupCoordinationRemote::getReplicatedSQLObjectsDirs(const String & loader_zk_path, UserDefinedSQLObjectType object_type) const
@ -536,27 +630,36 @@ void BackupCoordinationRemote::prepareReplicatedSQLObjects() const
if (replicated_sql_objects)
return;
replicated_sql_objects.emplace();
auto zk = getZooKeeper();
String path = zookeeper_path + "/repl_sql_objects";
for (const String & escaped_loader_zk_path : zk->getChildren(path))
std::vector<BackupCoordinationReplicatedSQLObjects::DirectoryPathForSQLObject> directories_for_sql_objects;
auto holder = with_retries.createRetriesControlHolder("prepareReplicatedSQLObjects");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
String loader_zk_path = unescapeForFileName(escaped_loader_zk_path);
String objects_path = path + "/" + escaped_loader_zk_path;
directories_for_sql_objects.clear();
with_retries.renewZooKeeper(zk);
if (String functions_path = objects_path + "/functions"; zk->exists(functions_path))
String path = zookeeper_path + "/repl_sql_objects";
for (const String & escaped_loader_zk_path : zk->getChildren(path))
{
UserDefinedSQLObjectType object_type = UserDefinedSQLObjectType::Function;
for (const String & host_id : zk->getChildren(functions_path))
String loader_zk_path = unescapeForFileName(escaped_loader_zk_path);
String objects_path = path + "/" + escaped_loader_zk_path;
if (String functions_path = objects_path + "/functions"; zk->exists(functions_path))
{
String dir = zk->get(functions_path + "/" + host_id);
replicated_sql_objects->addDirectory(loader_zk_path, object_type, host_id, dir);
UserDefinedSQLObjectType object_type = UserDefinedSQLObjectType::Function;
for (const String & host_id : zk->getChildren(functions_path))
{
String dir = zk->get(functions_path + "/" + host_id);
directories_for_sql_objects.push_back({loader_zk_path, object_type, host_id, dir});
}
}
}
}
}
});
replicated_sql_objects.emplace();
for (auto & directory : directories_for_sql_objects)
replicated_sql_objects->addDirectory(std::move(directory));
}
void BackupCoordinationRemote::addFileInfos(BackupFileInfos && file_infos_)
{
@ -594,9 +697,11 @@ void BackupCoordinationRemote::prepareFileInfos() const
Strings hosts_with_file_infos;
{
ZooKeeperRetriesControl retries_ctl("prepareFileInfos::get_hosts", zookeeper_retries_info);
retries_ctl.retryLoop([&]{
auto zk = getZooKeeper();
auto holder = with_retries.createRetriesControlHolder("prepareFileInfos::get_hosts");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zk);
hosts_with_file_infos = zk->getChildren(zookeeper_path + "/file_infos");
});
}
@ -615,10 +720,11 @@ bool BackupCoordinationRemote::startWritingFile(size_t data_file_index)
String full_path = zookeeper_path + "/writing_files/" + std::to_string(data_file_index);
String host_index_str = std::to_string(current_host_index);
ZooKeeperRetriesControl retries_ctl("startWritingFile", zookeeper_retries_info);
retries_ctl.retryLoop([&]
auto holder = with_retries.createRetriesControlHolder("startWritingFile");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
auto zk = getZooKeeper();
with_retries.renewZooKeeper(zk);
auto code = zk->tryCreate(full_path, host_index_str, zkutil::CreateMode::Persistent);
if (code == Coordination::Error::ZOK)
@ -632,51 +738,66 @@ bool BackupCoordinationRemote::startWritingFile(size_t data_file_index)
return acquired_writing;
}
bool BackupCoordinationRemote::hasConcurrentBackups(const std::atomic<size_t> &) const
{
/// If it's internal, concurrency will be checked for the base backup
if (is_internal)
return false;
auto zk = getZooKeeper();
std::string backup_stage_path = zookeeper_path + "/stage";
if (!zk->exists(root_zookeeper_path))
zk->createAncestors(root_zookeeper_path);
bool result = false;
for (size_t attempt = 0; attempt < MAX_ZOOKEEPER_ATTEMPTS; ++attempt)
auto holder = with_retries.createRetriesControlHolder("hasConcurrentBackups");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
Coordination::Stat stat;
zk->get(root_zookeeper_path, &stat);
Strings existing_backup_paths = zk->getChildren(root_zookeeper_path);
with_retries.renewZooKeeper(zk);
for (const auto & existing_backup_path : existing_backup_paths)
if (!zk->exists(root_zookeeper_path))
zk->createAncestors(root_zookeeper_path);
for (size_t attempt = 0; attempt < MAX_ZOOKEEPER_ATTEMPTS; ++attempt)
{
if (startsWith(existing_backup_path, "restore-"))
continue;
Coordination::Stat stat;
zk->get(root_zookeeper_path, &stat);
Strings existing_backup_paths = zk->getChildren(root_zookeeper_path);
String existing_backup_uuid = existing_backup_path;
existing_backup_uuid.erase(0, String("backup-").size());
for (const auto & existing_backup_path : existing_backup_paths)
{
if (startsWith(existing_backup_path, "restore-"))
continue;
if (existing_backup_uuid == toString(backup_uuid))
continue;
String existing_backup_uuid = existing_backup_path;
existing_backup_uuid.erase(0, String("backup-").size());
const auto status = zk->get(root_zookeeper_path + "/" + existing_backup_path + "/stage");
if (status != Stage::COMPLETED)
return true;
if (existing_backup_uuid == toString(backup_uuid))
continue;
String status;
if (zk->tryGet(root_zookeeper_path + "/" + existing_backup_path + "/stage", status))
{
/// If the status is not COMPLETED it could be because the backup failed; check whether 'error' exists
if (status != Stage::COMPLETED && !zk->exists(root_zookeeper_path + "/" + existing_backup_path + "/error"))
{
LOG_WARNING(log, "Found a concurrent backup: {}, current backup: {}", existing_backup_uuid, toString(backup_uuid));
result = true;
return;
}
}
}
zk->createIfNotExists(backup_stage_path, "");
auto code = zk->trySet(backup_stage_path, Stage::SCHEDULED_TO_START, stat.version);
if (code == Coordination::Error::ZOK)
break;
bool is_last_attempt = (attempt == MAX_ZOOKEEPER_ATTEMPTS - 1);
if ((code != Coordination::Error::ZBADVERSION) || is_last_attempt)
throw zkutil::KeeperException(code, backup_stage_path);
}
});
zk->createIfNotExists(backup_stage_path, "");
auto code = zk->trySet(backup_stage_path, Stage::SCHEDULED_TO_START, stat.version);
if (code == Coordination::Error::ZOK)
break;
bool is_last_attempt = (attempt == MAX_ZOOKEEPER_ATTEMPTS - 1);
if ((code != Coordination::Error::ZBADVERSION) || is_last_attempt)
throw zkutil::KeeperException(code, backup_stage_path);
}
return false;
return result;
}
}
View File
@ -6,7 +6,7 @@
#include <Backups/BackupCoordinationReplicatedSQLObjects.h>
#include <Backups/BackupCoordinationReplicatedTables.h>
#include <Backups/BackupCoordinationStageSync.h>
#include <Storages/MergeTree/ZooKeeperRetries.h>
#include <Backups/WithRetries.h>
namespace DB
@ -19,13 +19,7 @@ constexpr size_t MAX_ZOOKEEPER_ATTEMPTS = 10;
class BackupCoordinationRemote : public IBackupCoordination
{
public:
struct BackupKeeperSettings
{
UInt64 keeper_max_retries;
UInt64 keeper_retry_initial_backoff_ms;
UInt64 keeper_retry_max_backoff_ms;
UInt64 keeper_value_max_size;
};
using BackupKeeperSettings = WithRetries::KeeperSettings;
BackupCoordinationRemote(
zkutil::GetZooKeeper get_zookeeper_,
@ -79,7 +73,6 @@ public:
static size_t findCurrentHostIndex(const Strings & all_hosts, const String & current_host);
private:
zkutil::ZooKeeperPtr getZooKeeper() const;
void createRootNodes();
void removeAllNodes();
@ -94,7 +87,6 @@ private:
void prepareReplicatedSQLObjects() const TSA_REQUIRES(replicated_sql_objects_mutex);
void prepareFileInfos() const TSA_REQUIRES(file_infos_mutex);
const zkutil::GetZooKeeper get_zookeeper;
const String root_zookeeper_path;
const String zookeeper_path;
const BackupKeeperSettings keeper_settings;
@ -104,11 +96,12 @@ private:
const size_t current_host_index;
const bool plain_backup;
const bool is_internal;
Poco::Logger * const log;
mutable ZooKeeperRetriesInfo zookeeper_retries_info;
/// The order of these two fields matters, because stage_sync holds a reference to the with_retries object.
mutable WithRetries with_retries;
std::optional<BackupCoordinationStageSync> stage_sync;
mutable zkutil::ZooKeeperPtr TSA_GUARDED_BY(zookeeper_mutex) zookeeper;
mutable std::optional<BackupCoordinationReplicatedTables> TSA_GUARDED_BY(replicated_tables_mutex) replicated_tables;
mutable std::optional<BackupCoordinationReplicatedAccess> TSA_GUARDED_BY(replicated_access_mutex) replicated_access;
mutable std::optional<BackupCoordinationReplicatedSQLObjects> TSA_GUARDED_BY(replicated_sql_objects_mutex) replicated_sql_objects;
View File
@ -7,8 +7,13 @@ namespace DB
BackupCoordinationReplicatedAccess::BackupCoordinationReplicatedAccess() = default;
BackupCoordinationReplicatedAccess::~BackupCoordinationReplicatedAccess() = default;
void BackupCoordinationReplicatedAccess::addFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id, const String & file_path)
void BackupCoordinationReplicatedAccess::addFilePath(FilePathForAccessEntity && file_path_for_access_entity)
{
const auto & access_zk_path = file_path_for_access_entity.access_zk_path;
const auto & access_entity_type = file_path_for_access_entity.access_entity_type;
const auto & host_id = file_path_for_access_entity.host_id;
const auto & file_path = file_path_for_access_entity.file_path;
auto & ref = file_paths_by_zk_path[std::make_pair(access_zk_path, access_entity_type)];
ref.file_paths.emplace(file_path);
View File
@ -28,8 +28,16 @@ public:
BackupCoordinationReplicatedAccess();
~BackupCoordinationReplicatedAccess();
struct FilePathForAccessEntity
{
String access_zk_path;
AccessEntityType access_entity_type;
String host_id;
String file_path;
};
/// Adds a path to access*.txt file keeping access entities of a ReplicatedAccessStorage.
void addFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id, const String & file_path);
void addFilePath(FilePathForAccessEntity && file_path_for_access_entity);
/// Returns all paths added by addFilePath() if `host_id` is a host chosen to store access.
Strings getFilePaths(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id) const;
View File
@ -7,8 +7,13 @@ namespace DB
BackupCoordinationReplicatedSQLObjects::BackupCoordinationReplicatedSQLObjects() = default;
BackupCoordinationReplicatedSQLObjects::~BackupCoordinationReplicatedSQLObjects() = default;
void BackupCoordinationReplicatedSQLObjects::addDirectory(const String & loader_zk_path, UserDefinedSQLObjectType object_type, const String & host_id, const String & dir_path)
void BackupCoordinationReplicatedSQLObjects::addDirectory(DirectoryPathForSQLObject && directory_path_for_sql_object)
{
const auto & loader_zk_path = directory_path_for_sql_object.loader_zk_path;
const auto & object_type = directory_path_for_sql_object.object_type;
const auto & host_id = directory_path_for_sql_object.host_id;
const auto & dir_path = directory_path_for_sql_object.dir_path;
auto & ref = dir_paths_by_zk_path[std::make_pair(loader_zk_path, object_type)];
ref.dir_paths.emplace(dir_path);
View File
@ -28,8 +28,16 @@ public:
BackupCoordinationReplicatedSQLObjects();
~BackupCoordinationReplicatedSQLObjects();
struct DirectoryPathForSQLObject
{
String loader_zk_path;
UserDefinedSQLObjectType object_type;
String host_id;
String dir_path;
};
/// Adds a path to a directory keeping user-defined SQL objects.
void addDirectory(const String & loader_zk_path, UserDefinedSQLObjectType object_type, const String & host_id, const String & dir_path);
void addDirectory(DirectoryPathForSQLObject && directory_path_for_sql_object);
/// Returns all added paths to directories if `host_id` is a host chosen to store user-defined SQL objects.
Strings getDirectories(const String & loader_zk_path, UserDefinedSQLObjectType object_type, const String & host_id) const;
View File
@ -149,12 +149,13 @@ private:
BackupCoordinationReplicatedTables::BackupCoordinationReplicatedTables() = default;
BackupCoordinationReplicatedTables::~BackupCoordinationReplicatedTables() = default;
void BackupCoordinationReplicatedTables::addPartNames(
const String & table_shared_id,
const String & table_name_for_logs,
const String & replica_name,
const std::vector<PartNameAndChecksum> & part_names_and_checksums)
void BackupCoordinationReplicatedTables::addPartNames(PartNamesForTableReplica && part_names)
{
const auto & table_shared_id = part_names.table_shared_id;
const auto & table_name_for_logs = part_names.table_name_for_logs;
const auto & replica_name = part_names.replica_name;
const auto & part_names_and_checksums = part_names.part_names_and_checksums;
if (prepared)
throw Exception(ErrorCodes::LOGICAL_ERROR, "addPartNames() must not be called after preparing");
@ -216,12 +217,13 @@ Strings BackupCoordinationReplicatedTables::getPartNames(const String & table_sh
return it2->second;
}
void BackupCoordinationReplicatedTables::addMutations(
const String & table_shared_id,
const String & table_name_for_logs,
const String & replica_name,
const std::vector<MutationInfo> & mutations)
void BackupCoordinationReplicatedTables::addMutations(MutationsForTableReplica && mutations_for_table_replica)
{
const auto & table_shared_id = mutations_for_table_replica.table_shared_id;
const auto & table_name_for_logs = mutations_for_table_replica.table_name_for_logs;
const auto & replica_name = mutations_for_table_replica.replica_name;
const auto & mutations = mutations_for_table_replica.mutations;
if (prepared)
throw Exception(ErrorCodes::LOGICAL_ERROR, "addMutations() must not be called after preparing");
@ -254,8 +256,11 @@ BackupCoordinationReplicatedTables::getMutations(const String & table_shared_id,
return res;
}
void BackupCoordinationReplicatedTables::addDataPath(const String & table_shared_id, const String & data_path)
void BackupCoordinationReplicatedTables::addDataPath(DataPathForTableReplica && data_path_for_table_replica)
{
const auto & table_shared_id = data_path_for_table_replica.table_shared_id;
const auto & data_path = data_path_for_table_replica.data_path;
auto & table_info = table_infos[table_shared_id];
table_info.data_paths.emplace(data_path);
}

View File

@ -38,15 +38,19 @@ public:
using PartNameAndChecksum = IBackupCoordination::PartNameAndChecksum;
struct PartNamesForTableReplica
{
String table_shared_id;
String table_name_for_logs;
String replica_name;
std::vector<PartNameAndChecksum> part_names_and_checksums;
};
/// Adds part names which a specified replica of a replicated table is going to put to the backup.
/// Multiple replicas of the replicated table call this function, and then the added part names can be retrieved by calling
/// getPartNames().
/// Checksums are used only to control that parts under the same names on different replicas are the same.
void addPartNames(
const String & table_shared_id,
const String & table_name_for_logs,
const String & replica_name,
const std::vector<PartNameAndChecksum> & part_names_and_checksums);
void addPartNames(PartNamesForTableReplica && part_names);
/// Returns the names of the parts which a specified replica of a replicated table should put to the backup.
/// This is the same list as was added by calling addPartNames(), but without duplications and without
@ -55,20 +59,30 @@ public:
using MutationInfo = IBackupCoordination::MutationInfo;
struct MutationsForTableReplica
{
String table_shared_id;
String table_name_for_logs;
String replica_name;
std::vector<MutationInfo> mutations;
};
/// Adds information about mutations of a replicated table.
void addMutations(
const String & table_shared_id,
const String & table_name_for_logs,
const String & replica_name,
const std::vector<MutationInfo> & mutations);
void addMutations(MutationsForTableReplica && mutations_for_table_replica);
/// Returns all mutations of a replicated table which are not finished for some data parts added by addReplicatedPartNames().
std::vector<MutationInfo> getMutations(const String & table_shared_id, const String & replica_name) const;
struct DataPathForTableReplica
{
String table_shared_id;
String data_path;
};
/// Adds a data path in backup for a replicated table.
/// Multiple replicas of the replicated table call this function, and then all the added paths can be retrieved by calling
/// getDataPaths().
void addDataPath(const String & table_shared_id, const String & data_path);
void addDataPath(DataPathForTableReplica && data_path_for_table_replica);
/// Returns all the data paths in backup added for a replicated table (see also addReplicatedDataPath()).
Strings getDataPaths(const String & table_shared_id) const;

View File

@ -1,11 +1,13 @@
#include <Backups/BackupCoordinationStageSync.h>
#include <base/chrono_io.h>
#include <Common/ZooKeeper/Common.h>
#include <Common/Exception.h>
#include <Common/ZooKeeper/KeeperException.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromString.h>
#include <IO/WriteHelpers.h>
#include <base/chrono_io.h>
namespace DB
@ -17,9 +19,12 @@ namespace ErrorCodes
}
BackupCoordinationStageSync::BackupCoordinationStageSync(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, Poco::Logger * log_)
: zookeeper_path(zookeeper_path_)
, get_zookeeper(get_zookeeper_)
BackupCoordinationStageSync::BackupCoordinationStageSync(
const String & root_zookeeper_path_,
WithRetries & with_retries_,
Poco::Logger * log_)
: zookeeper_path(root_zookeeper_path_ + "/stage")
, with_retries(with_retries_)
, log(log_)
{
createRootNodes();
@ -27,32 +32,48 @@ BackupCoordinationStageSync::BackupCoordinationStageSync(const String & zookeepe
void BackupCoordinationStageSync::createRootNodes()
{
auto zookeeper = get_zookeeper();
zookeeper->createAncestors(zookeeper_path);
zookeeper->createIfNotExists(zookeeper_path, "");
auto holder = with_retries.createRetriesControlHolder("createRootNodes");
holder.retries_ctl.retryLoop(
[&, &zookeeper = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zookeeper);
zookeeper->createAncestors(zookeeper_path);
zookeeper->createIfNotExists(zookeeper_path, "");
});
}
void BackupCoordinationStageSync::set(const String & current_host, const String & new_stage, const String & message)
{
auto zookeeper = get_zookeeper();
auto holder = with_retries.createRetriesControlHolder("set");
holder.retries_ctl.retryLoop(
[&, &zookeeper = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zookeeper);
/// Make an ephemeral node so the initiator can track if the current host is still working.
String alive_node_path = zookeeper_path + "/alive|" + current_host;
auto code = zookeeper->tryCreate(alive_node_path, "", zkutil::CreateMode::Ephemeral);
if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNODEEXISTS)
throw zkutil::KeeperException(code, alive_node_path);
/// Make an ephemeral node so the initiator can track if the current host is still working.
String alive_node_path = zookeeper_path + "/alive|" + current_host;
auto code = zookeeper->tryCreate(alive_node_path, "", zkutil::CreateMode::Ephemeral);
if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNODEEXISTS)
throw zkutil::KeeperException(code, alive_node_path);
zookeeper->createIfNotExists(zookeeper_path + "/started|" + current_host, "");
zookeeper->create(zookeeper_path + "/current|" + current_host + "|" + new_stage, message, zkutil::CreateMode::Persistent);
zookeeper->createIfNotExists(zookeeper_path + "/started|" + current_host, "");
zookeeper->createIfNotExists(zookeeper_path + "/current|" + current_host + "|" + new_stage, message);
});
}
void BackupCoordinationStageSync::setError(const String & current_host, const Exception & exception)
{
auto zookeeper = get_zookeeper();
WriteBufferFromOwnString buf;
writeStringBinary(current_host, buf);
writeException(exception, buf, true);
zookeeper->createIfNotExists(zookeeper_path + "/error", buf.str());
auto holder = with_retries.createRetriesControlHolder("setError");
holder.retries_ctl.retryLoop(
[&, &zookeeper = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zookeeper);
WriteBufferFromOwnString buf;
writeStringBinary(current_host, buf);
writeException(exception, buf, true);
zookeeper->createIfNotExists(zookeeper_path + "/error", buf.str());
});
}
Strings BackupCoordinationStageSync::wait(const Strings & all_hosts, const String & stage_to_wait)
@ -83,14 +104,24 @@ struct BackupCoordinationStageSync::State
};
BackupCoordinationStageSync::State BackupCoordinationStageSync::readCurrentState(
zkutil::ZooKeeperPtr zookeeper, const Strings & zk_nodes, const Strings & all_hosts, const String & stage_to_wait) const
const Strings & zk_nodes, const Strings & all_hosts, const String & stage_to_wait) const
{
std::unordered_set<std::string_view> zk_nodes_set{zk_nodes.begin(), zk_nodes.end()};
State state;
if (zk_nodes_set.contains("error"))
{
ReadBufferFromOwnString buf{zookeeper->get(zookeeper_path + "/error")};
String errors;
{
auto holder = with_retries.createRetriesControlHolder("readCurrentState");
holder.retries_ctl.retryLoop(
[&, &zookeeper = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zookeeper);
errors = zookeeper->get(zookeeper_path + "/error");
});
}
ReadBufferFromOwnString buf{errors};
String host;
readStringBinary(host, buf);
state.error = std::make_pair(host, readException(buf, fmt::format("Got error from {}", host)));
@ -102,8 +133,38 @@ BackupCoordinationStageSync::State BackupCoordinationStageSync::readCurrentState
if (!zk_nodes_set.contains("current|" + host + "|" + stage_to_wait))
{
UnreadyHostState unready_host_state;
unready_host_state.started = zk_nodes_set.contains("started|" + host);
unready_host_state.alive = zk_nodes_set.contains("alive|" + host);
const String started_node_name = "started|" + host;
const String alive_node_name = "alive|" + host;
const String alive_node_path = zookeeper_path + "/" + alive_node_name;
unready_host_state.started = zk_nodes_set.contains(started_node_name);
/// Because we do retries everywhere, we can't fully rely on ephemeral nodes anymore.
/// Even though we recreate the "alive" node when reconnecting, that might not be enough, and a race condition is possible.
/// All we can do here is just retry.
/// In the worst case, if we don't manage to see the alive node for a long time, we will just abort the backup.
unready_host_state.alive = zk_nodes_set.contains(alive_node_name);
if (!unready_host_state.alive)
{
LOG_TRACE(log, "Seems like host ({}) is dead. Will retry the check to confirm", host);
auto holder = with_retries.createRetriesControlHolder("readCurrentState::checkAliveNode");
holder.retries_ctl.retryLoop(
[&, &zookeeper = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zookeeper);
if (zookeeper->existsNoFailureInjection(alive_node_path))
{
unready_host_state.alive = true;
return;
}
// Retry with backoff. We also check whether this is the last retry, because on the last one we don't want to rethrow the exception.
if (!holder.retries_ctl.isLastRetry())
holder.retries_ctl.setKeeperError(Coordination::Error::ZNONODE, "There is no alive node for host {}. Will retry", host);
});
}
LOG_TRACE(log, "Host ({}) appeared to be {}", host, unready_host_state.alive ? "alive" : "dead");
state.unready_hosts.emplace(host, unready_host_state);
if (!unready_host_state.alive && unready_host_state.started && !state.host_terminated)
state.host_terminated = host;
@ -113,51 +174,62 @@ BackupCoordinationStageSync::State BackupCoordinationStageSync::readCurrentState
if (state.host_terminated || !state.unready_hosts.empty())
return state;
state.results.reserve(all_hosts.size());
for (const auto & host : all_hosts)
state.results.emplace_back(zookeeper->get(zookeeper_path + "/current|" + host + "|" + stage_to_wait));
auto holder = with_retries.createRetriesControlHolder("waitImpl::collectStagesToWait");
holder.retries_ctl.retryLoop(
[&, &zookeeper = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zookeeper);
Strings results;
for (const auto & host : all_hosts)
results.emplace_back(zookeeper->get(zookeeper_path + "/current|" + host + "|" + stage_to_wait));
state.results = std::move(results);
});
return state;
}
Strings BackupCoordinationStageSync::waitImpl(const Strings & all_hosts, const String & stage_to_wait, std::optional<std::chrono::milliseconds> timeout) const
Strings BackupCoordinationStageSync::waitImpl(
const Strings & all_hosts, const String & stage_to_wait, std::optional<std::chrono::milliseconds> timeout) const
{
if (all_hosts.empty())
return {};
/// Wait until all hosts are ready, an error happens, or the timeout expires.
auto zookeeper = get_zookeeper();
/// Set by ZooKeeper when the list of zk nodes has changed.
auto watch = std::make_shared<Poco::Event>();
bool use_timeout = timeout.has_value();
std::chrono::steady_clock::time_point end_of_timeout;
if (use_timeout)
end_of_timeout = std::chrono::steady_clock::now() + std::chrono::duration_cast<std::chrono::steady_clock::duration>(*timeout);
State state;
String previous_unready_host; /// Used for logging: we don't want to log the same unready host again.
for (;;)
{
/// Get zk nodes and subscribe on their changes.
Strings zk_nodes = zookeeper->getChildren(zookeeper_path, nullptr, watch);
LOG_INFO(log, "Waiting for the stage {}", stage_to_wait);
/// Set by ZooKeeper when the list of zk nodes has changed.
auto watch = std::make_shared<Poco::Event>();
Strings zk_nodes;
{
auto holder = with_retries.createRetriesControlHolder("waitImpl::getChildren");
holder.retries_ctl.retryLoop(
[&, &zookeeper = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zookeeper);
watch->reset();
/// Get zk nodes and subscribe on their changes.
zk_nodes = zookeeper->getChildren(zookeeper_path, nullptr, watch);
});
}
/// Read and analyze the current state of zk nodes.
state = readCurrentState(zookeeper, zk_nodes, all_hosts, stage_to_wait);
state = readCurrentState(zk_nodes, all_hosts, stage_to_wait);
if (state.error || state.host_terminated || state.unready_hosts.empty())
break; /// Error happened or everything is ready.
/// Log that we will wait for another host.
/// Log that we will wait
const auto & unready_host = state.unready_hosts.begin()->first;
if (unready_host != previous_unready_host)
{
LOG_TRACE(log, "Waiting for host {}", unready_host);
previous_unready_host = unready_host;
}
LOG_INFO(log, "Waiting on ZooKeeper watch for any node to be changed (currently waiting for host {})", unready_host);
/// Wait until `watch_callback` is called by ZooKeeper meaning that zk nodes have changed.
{
@ -195,6 +267,7 @@ Strings BackupCoordinationStageSync::waitImpl(const Strings & all_hosts, const S
unready_host_state.started ? "" : ": Operation didn't start");
}
LOG_TRACE(log, "Everything is Ok. All hosts achieved stage {}", stage_to_wait);
return state.results;
}

View File

@ -1,7 +1,6 @@
#pragma once
#include <Common/ZooKeeper/Common.h>
#include <Backups/WithRetries.h>
namespace DB
{
@ -10,7 +9,10 @@ namespace DB
class BackupCoordinationStageSync
{
public:
BackupCoordinationStageSync(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, Poco::Logger * log_);
BackupCoordinationStageSync(
const String & root_zookeeper_path_,
WithRetries & with_retries_,
Poco::Logger * log_);
/// Sets the stage of the current host and signal other hosts if there were other hosts waiting for that.
void set(const String & current_host, const String & new_stage, const String & message);
@ -27,12 +29,13 @@ private:
void createRootNodes();
struct State;
State readCurrentState(zkutil::ZooKeeperPtr zookeeper, const Strings & zk_nodes, const Strings & all_hosts, const String & stage_to_wait) const;
State readCurrentState(const Strings & zk_nodes, const Strings & all_hosts, const String & stage_to_wait) const;
Strings waitImpl(const Strings & all_hosts, const String & stage_to_wait, std::optional<std::chrono::milliseconds> timeout) const;
String zookeeper_path;
zkutil::GetZooKeeper get_zookeeper;
/// A reference to the field of parent object - BackupCoordinationRemote or RestoreCoordinationRemote
WithRetries & with_retries;
Poco::Logger * log;
};
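As a rough usage sketch (with_retries, log, the host names and the stage string are all placeholders), a host publishes its stage and the initiator blocks until every host reaches it, using set() and wait() declared above:

// Hypothetical flow on the initiator, assuming with_retries and log are already set up.
BackupCoordinationStageSync stage_sync(root_zookeeper_path, with_retries, log);
stage_sync.set(current_host, "gathering_metadata", /* message = */ "");
Strings results = stage_sync.wait(all_hosts, "gathering_metadata");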

View File

@ -84,6 +84,12 @@ BackupEntriesCollector::BackupEntriesCollector(
, on_cluster_first_sync_timeout(context->getConfigRef().getUInt64("backups.on_cluster_first_sync_timeout", 180000))
, consistent_metadata_snapshot_timeout(context->getConfigRef().getUInt64("backups.consistent_metadata_snapshot_timeout", 600000))
, log(&Poco::Logger::get("BackupEntriesCollector"))
, global_zookeeper_retries_info(
"BackupEntriesCollector",
log,
context->getSettingsRef().backup_restore_keeper_max_retries,
context->getSettingsRef().backup_restore_keeper_retry_initial_backoff_ms,
context->getSettingsRef().backup_restore_keeper_retry_max_backoff_ms)
{
}
@ -482,7 +488,10 @@ std::vector<std::pair<ASTPtr, StoragePtr>> BackupEntriesCollector::findTablesInD
try
{
db_tables = database->getTablesForBackup(filter_by_table_name, context);
/// The database or table could be replicated - so it may use ZooKeeper. We need to retry.
auto zookeeper_retries_info = global_zookeeper_retries_info;
ZooKeeperRetriesControl retries_ctl("getTablesForBackup", zookeeper_retries_info);
retries_ctl.retryLoop([&](){ db_tables = database->getTablesForBackup(filter_by_table_name, context); });
}
catch (Exception & e)
{
@ -745,6 +754,7 @@ void BackupEntriesCollector::addPostTask(std::function<void()> task)
/// Runs all the tasks added with addPostCollectingTask().
void BackupEntriesCollector::runPostTasks()
{
LOG_TRACE(log, "Will run {} post tasks", post_tasks.size());
/// Post collecting tasks can add other post collecting tasks, our code is fine with that.
while (!post_tasks.empty())
{
@ -752,6 +762,7 @@ void BackupEntriesCollector::runPostTasks()
post_tasks.pop();
std::move(task)();
}
LOG_TRACE(log, "All post tasks successfully executed");
}
size_t BackupEntriesCollector::getAccessCounter(AccessEntityType type)
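The copy of global_zookeeper_retries_info before constructing the ZooKeeperRetriesControl gives each operation its own retry counter and backoff state, rather than one shared budget for the whole backup (see the comment on ZooKeeperRetriesInfo in WithRetries.h below). A condensed sketch of the idiom; the callee is hypothetical:

// Copy the template so this operation's retry counter starts from zero.
auto retries_info = global_zookeeper_retries_info;
ZooKeeperRetriesControl retries_ctl("collectMetadataSnapshot", retries_info);
retries_ctl.retryLoop([&] { collectMetadataSnapshotFromZooKeeper(); }); // hypothetical callee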

View File

@ -6,6 +6,7 @@
#include <Parsers/ASTBackupQuery.h>
#include <Storages/IStorage_fwd.h>
#include <Storages/TableLockHolder.h>
#include <Storages/MergeTree/ZooKeeperRetries.h>
#include <filesystem>
#include <queue>
@ -96,6 +97,9 @@ private:
std::chrono::milliseconds on_cluster_first_sync_timeout;
std::chrono::milliseconds consistent_metadata_snapshot_timeout;
Poco::Logger * log;
/// Unfortunately we may use ZooKeeper while collecting information for the backup,
/// so we need to retry...
ZooKeeperRetriesInfo global_zookeeper_retries_info;
Strings all_hosts;
DDLRenamingMap renaming_map;

View File

@ -8,10 +8,11 @@ namespace DB
BackupEntryFromAppendOnlyFile::BackupEntryFromAppendOnlyFile(
const DiskPtr & disk_,
const String & file_path_,
const ReadSettings & settings_,
const std::optional<UInt64> & file_size_,
const std::optional<UInt128> & checksum_,
const std::shared_ptr<TemporaryFileOnDisk> & temporary_file_)
: BackupEntryFromImmutableFile(disk_, file_path_, file_size_, checksum_, temporary_file_)
: BackupEntryFromImmutableFile(disk_, file_path_, settings_, file_size_, checksum_, temporary_file_)
, limit(BackupEntryFromImmutableFile::getSize())
{
}

View File

@ -16,6 +16,7 @@ public:
BackupEntryFromAppendOnlyFile(
const DiskPtr & disk_,
const String & file_path_,
const ReadSettings & settings_,
const std::optional<UInt64> & file_size_ = {},
const std::optional<UInt128> & checksum_ = {},
const std::shared_ptr<TemporaryFileOnDisk> & temporary_file_ = {});

View File

@ -11,10 +11,16 @@ namespace DB
BackupEntryFromImmutableFile::BackupEntryFromImmutableFile(
const DiskPtr & disk_,
const String & file_path_,
const ReadSettings & settings_,
const std::optional<UInt64> & file_size_,
const std::optional<UInt128> & checksum_,
const std::shared_ptr<TemporaryFileOnDisk> & temporary_file_)
: disk(disk_), file_path(file_path_), file_size(file_size_), checksum(checksum_), temporary_file_on_disk(temporary_file_)
: disk(disk_)
, file_path(file_path_)
, settings(settings_)
, file_size(file_size_)
, checksum(checksum_)
, temporary_file_on_disk(temporary_file_)
{
}
@ -30,7 +36,7 @@ UInt64 BackupEntryFromImmutableFile::getSize() const
std::unique_ptr<SeekableReadBuffer> BackupEntryFromImmutableFile::getReadBuffer() const
{
return disk->readFile(file_path);
return disk->readFile(file_path, settings);
}

View File

@ -1,6 +1,7 @@
#pragma once
#include <Backups/IBackupEntry.h>
#include <IO/ReadSettings.h>
#include <base/defines.h>
#include <mutex>
@ -19,6 +20,7 @@ public:
BackupEntryFromImmutableFile(
const DiskPtr & disk_,
const String & file_path_,
const ReadSettings & settings_,
const std::optional<UInt64> & file_size_ = {},
const std::optional<UInt128> & checksum_ = {},
const std::shared_ptr<TemporaryFileOnDisk> & temporary_file_ = {});
@ -37,6 +39,7 @@ public:
private:
const DiskPtr disk;
const String file_path;
ReadSettings settings;
mutable std::optional<UInt64> file_size TSA_GUARDED_BY(get_file_size_mutex);
mutable std::mutex get_file_size_mutex;
const std::optional<UInt128> checksum;
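A small construction sketch showing how the new ReadSettings parameter might be supplied (disk, context and the file path are placeholders; getBackupReadSettings() is the accessor used in BackupIO.cpp below); the trailing optional parameters keep their defaults:

// Hypothetical construction: the settings now reach disk->readFile() inside getReadBuffer().
ReadSettings read_settings = context->getBackupReadSettings();
auto entry = std::make_unique<BackupEntryFromImmutableFile>(disk, "store/abc/data.bin", read_settings);
std::unique_ptr<SeekableReadBuffer> buf = entry->getReadBuffer();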

View File

@ -3,6 +3,7 @@
#include <IO/copyData.h>
#include <IO/WriteBufferFromFileBase.h>
#include <IO/SeekableReadBuffer.h>
#include <Interpreters/Context.h>
namespace DB
@ -22,6 +23,11 @@ void IBackupReader::copyFileToDisk(const String & file_name, size_t size, DiskPt
write_buffer->finalize();
}
IBackupWriter::IBackupWriter(const ContextPtr & context_)
: read_settings(context_->getBackupReadSettings())
, has_throttling(static_cast<bool>(context_->getBackupsThrottler()))
{}
void IBackupWriter::copyDataToFile(const CreateReadBufferFunction & create_read_buffer, UInt64 offset, UInt64 size, const String & dest_file_name)
{
auto read_buffer = create_read_buffer();

View File

@ -3,6 +3,8 @@
#include <Core/Types.h>
#include <Disks/DiskType.h>
#include <Disks/IDisk.h>
#include <IO/ReadSettings.h>
#include <Interpreters/Context_fwd.h>
namespace DB
{
@ -28,6 +30,8 @@ class IBackupWriter /// BackupWriterFile, BackupWriterDisk
public:
using CreateReadBufferFunction = std::function<std::unique_ptr<SeekableReadBuffer>()>;
explicit IBackupWriter(const ContextPtr & context_);
virtual ~IBackupWriter() = default;
virtual bool fileExists(const String & file_name) = 0;
virtual UInt64 getFileSize(const String & file_name) = 0;
@ -38,7 +42,17 @@ public:
virtual DataSourceDescription getDataSourceDescription() const = 0;
virtual void copyDataToFile(const CreateReadBufferFunction & create_read_buffer, UInt64 offset, UInt64 size, const String & dest_file_name);
virtual bool supportNativeCopy(DataSourceDescription /* data_source_description */) const { return false; }
/// Copy file using native copy (optimized for S3 to use CopyObject)
///
/// NOTE: It still may fall back to copyDataToFile() if native copy is not possible:
/// - different buckets
/// - throttling has been requested
virtual void copyFileNative(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name);
protected:
const ReadSettings read_settings;
const bool has_throttling;
};
}
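The contract of copyFileNative() can be summarized in a short sketch, assuming a writer derived from IBackupWriter; it mirrors the disk and file implementations below: native copy only for an unthrottled, full-file copy, otherwise the buffered fallback.

// Condensed sketch of a copyFileNative() override: fall back to the buffered
// path when throttling is requested or only a part of the file is needed.
void copyFileNative(DiskPtr src_disk, const String & src_file_name,
                    UInt64 src_offset, UInt64 src_size, const String & dest_file_name) override
{
    if (has_throttling || src_offset != 0 || src_size != src_disk->getFileSize(src_file_name))
    {
        auto create_read_buffer = [this, src_disk, src_file_name]
        {
            return src_disk->readFile(src_file_name, read_settings); // throttled read
        };
        copyDataToFile(create_read_buffer, src_offset, src_size, dest_file_name);
        return;
    }
    // ... native (e.g. server-side CopyObject) copy goes here ...
}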

View File

@ -50,7 +50,10 @@ void BackupReaderDisk::copyFileToDisk(const String & file_name, size_t size, Dis
}
BackupWriterDisk::BackupWriterDisk(const DiskPtr & disk_, const String & path_) : disk(disk_), path(path_)
BackupWriterDisk::BackupWriterDisk(const DiskPtr & disk_, const String & path_, const ContextPtr & context_)
: IBackupWriter(context_)
, disk(disk_)
, path(path_)
{
}
@ -127,9 +130,9 @@ void BackupWriterDisk::copyFileNative(DiskPtr src_disk, const String & src_file_
if (!src_disk)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot natively copy data to disk without source disk");
if ((src_offset != 0) || (src_size != src_disk->getFileSize(src_file_name)))
if (has_throttling || (src_offset != 0) || (src_size != src_disk->getFileSize(src_file_name)))
{
auto create_read_buffer = [src_disk, src_file_name] { return src_disk->readFile(src_file_name); };
auto create_read_buffer = [this, src_disk, src_file_name] { return src_disk->readFile(src_file_name, read_settings); };
copyDataToFile(create_read_buffer, src_offset, src_size, dest_file_name);
return;
}

View File

@ -2,6 +2,7 @@
#include <filesystem>
#include <Backups/BackupIO.h>
#include <Interpreters/Context_fwd.h>
namespace DB
{
@ -30,7 +31,7 @@ private:
class BackupWriterDisk : public IBackupWriter
{
public:
BackupWriterDisk(const DiskPtr & disk_, const String & path_);
BackupWriterDisk(const DiskPtr & disk_, const String & path_, const ContextPtr & context_);
~BackupWriterDisk() override;
bool fileExists(const String & file_name) override;

View File

@ -49,7 +49,9 @@ void BackupReaderFile::copyFileToDisk(const String & file_name, size_t size, Dis
}
BackupWriterFile::BackupWriterFile(const String & path_) : path(path_)
BackupWriterFile::BackupWriterFile(const String & path_, const ContextPtr & context_)
: IBackupWriter(context_)
, path(path_)
{
}
@ -152,9 +154,9 @@ void BackupWriterFile::copyFileNative(DiskPtr src_disk, const String & src_file_
else
abs_source_path = fs::absolute(src_file_name);
if ((src_offset != 0) || (src_size != fs::file_size(abs_source_path)))
if (has_throttling || (src_offset != 0) || (src_size != fs::file_size(abs_source_path)))
{
auto create_read_buffer = [abs_source_path] { return createReadBufferFromFileBase(abs_source_path, {}); };
auto create_read_buffer = [this, abs_source_path] { return createReadBufferFromFileBase(abs_source_path, read_settings); };
copyDataToFile(create_read_buffer, src_offset, src_size, dest_file_name);
return;
}

View File

@ -2,6 +2,7 @@
#include <filesystem>
#include <Backups/BackupIO.h>
#include <Interpreters/Context_fwd.h>
namespace DB
{
@ -27,7 +28,7 @@ private:
class BackupWriterFile : public IBackupWriter
{
public:
explicit BackupWriterFile(const String & path_);
explicit BackupWriterFile(const String & path_, const ContextPtr & context_);
~BackupWriterFile() override;
bool fileExists(const String & file_name) override;

View File

@ -69,7 +69,7 @@ namespace
S3::CredentialsConfiguration
{
settings.auth_settings.use_environment_credentials.value_or(
context->getConfigRef().getBool("s3.use_environment_credentials", false)),
context->getConfigRef().getBool("s3.use_environment_credentials", true)),
settings.auth_settings.use_insecure_imds_request.value_or(
context->getConfigRef().getBool("s3.use_insecure_imds_request", false)),
settings.auth_settings.expiration_window_seconds.value_or(
@ -161,9 +161,9 @@ void BackupReaderS3::copyFileToDisk(const String & file_name, size_t size, DiskP
BackupWriterS3::BackupWriterS3(
const S3::URI & s3_uri_, const String & access_key_id_, const String & secret_access_key_, const ContextPtr & context_)
: s3_uri(s3_uri_)
: IBackupWriter(context_)
, s3_uri(s3_uri_)
, client(makeS3Client(s3_uri_, access_key_id_, secret_access_key_, context_))
, read_settings(context_->getReadSettings())
, request_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).request_settings)
, log(&Poco::Logger::get("BackupWriterS3"))
{
@ -189,7 +189,7 @@ void BackupWriterS3::copyFileNative(DiskPtr src_disk, const String & src_file_na
auto objects = src_disk->getStorageObjects(src_file_name);
if (objects.size() > 1)
{
auto create_read_buffer = [src_disk, src_file_name] { return src_disk->readFile(src_file_name); };
auto create_read_buffer = [this, src_disk, src_file_name] { return src_disk->readFile(src_file_name, read_settings); };
copyDataToFile(create_read_buffer, src_offset, src_size, dest_file_name);
}
else

View File

@ -7,6 +7,7 @@
#include <IO/ReadSettings.h>
#include <IO/S3Common.h>
#include <Storages/StorageS3Settings.h>
#include <Interpreters/Context_fwd.h>
namespace DB
@ -76,7 +77,6 @@ private:
S3::URI s3_uri;
std::shared_ptr<S3::Client> client;
ReadSettings read_settings;
S3Settings::RequestSettings request_settings;
Poco::Logger * log;
std::optional<bool> supports_batch_delete;

View File

@ -58,10 +58,13 @@ namespace
BackupCoordinationRemote::BackupKeeperSettings keeper_settings
{
.keeper_max_retries = context->getSettingsRef().backup_keeper_max_retries,
.keeper_retry_initial_backoff_ms = context->getSettingsRef().backup_keeper_retry_initial_backoff_ms,
.keeper_retry_max_backoff_ms = context->getSettingsRef().backup_keeper_retry_max_backoff_ms,
.keeper_value_max_size = context->getSettingsRef().backup_keeper_value_max_size,
.keeper_max_retries = context->getSettingsRef().backup_restore_keeper_max_retries,
.keeper_retry_initial_backoff_ms = context->getSettingsRef().backup_restore_keeper_retry_initial_backoff_ms,
.keeper_retry_max_backoff_ms = context->getSettingsRef().backup_restore_keeper_retry_max_backoff_ms,
.batch_size_for_keeper_multiread = context->getSettingsRef().backup_restore_batch_size_for_keeper_multiread,
.keeper_fault_injection_probability = context->getSettingsRef().backup_restore_keeper_fault_injection_probability,
.keeper_fault_injection_seed = context->getSettingsRef().backup_restore_keeper_fault_injection_seed,
.keeper_value_max_size = context->getSettingsRef().backup_restore_keeper_value_max_size,
};
auto all_hosts = BackupSettings::Util::filterHostIDs(
@ -92,10 +95,27 @@ namespace
auto get_zookeeper = [global_context = context->getGlobalContext()] { return global_context->getZooKeeper(); };
RestoreCoordinationRemote::RestoreKeeperSettings keeper_settings
{
.keeper_max_retries = context->getSettingsRef().backup_restore_keeper_max_retries,
.keeper_retry_initial_backoff_ms = context->getSettingsRef().backup_restore_keeper_retry_initial_backoff_ms,
.keeper_retry_max_backoff_ms = context->getSettingsRef().backup_restore_keeper_retry_max_backoff_ms,
.batch_size_for_keeper_multiread = context->getSettingsRef().backup_restore_batch_size_for_keeper_multiread,
.keeper_fault_injection_probability = context->getSettingsRef().backup_restore_keeper_fault_injection_probability,
.keeper_fault_injection_seed = context->getSettingsRef().backup_restore_keeper_fault_injection_seed
};
auto all_hosts = BackupSettings::Util::filterHostIDs(
restore_settings.cluster_host_ids, restore_settings.shard_num, restore_settings.replica_num);
return std::make_shared<RestoreCoordinationRemote>(get_zookeeper, root_zk_path, toString(*restore_settings.restore_uuid), all_hosts, restore_settings.host_id, restore_settings.internal);
return std::make_shared<RestoreCoordinationRemote>(
get_zookeeper,
root_zk_path,
keeper_settings,
toString(*restore_settings.restore_uuid),
all_hosts,
restore_settings.host_id,
restore_settings.internal);
}
else
{
@ -660,7 +680,9 @@ void BackupsWorker::doRestore(
restore_coordination = makeRestoreCoordination(context, restore_settings, /* remote= */ on_cluster);
if (!allow_concurrent_restores && restore_coordination->hasConcurrentRestores(std::ref(num_active_restores)))
throw Exception(ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED, "Concurrent restores not supported, turn on setting 'allow_concurrent_restores'");
throw Exception(
ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED,
"Concurrent restores not supported, turn on setting 'allow_concurrent_restores'");
/// Do RESTORE.
if (on_cluster)

View File

@ -1,10 +1,14 @@
#include <Backups/RestoreCoordinationLocal.h>
#include <Common/logger_useful.h>
namespace DB
{
RestoreCoordinationLocal::RestoreCoordinationLocal() = default;
RestoreCoordinationLocal::RestoreCoordinationLocal() : log(&Poco::Logger::get("RestoreCoordinationLocal"))
{
}
RestoreCoordinationLocal::~RestoreCoordinationLocal() = default;
void RestoreCoordinationLocal::setStage(const String &, const String &)
@ -49,7 +53,12 @@ bool RestoreCoordinationLocal::acquireReplicatedSQLObjects(const String &, UserD
bool RestoreCoordinationLocal::hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const
{
return (num_active_restores > 1);
if (num_active_restores > 1)
{
LOG_WARNING(log, "Found concurrent backups: num_active_restores={}", num_active_restores);
return true;
}
return false;
}
}

View File

@ -42,6 +42,8 @@ public:
bool hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const override;
private:
Poco::Logger * const log;
std::set<std::pair<String /* database_zk_path */, String /* table_name */>> acquired_tables_in_replicated_databases;
std::unordered_set<String /* table_zk_path */> acquired_data_in_replicated_tables;
mutable std::mutex mutex;

View File

@ -1,9 +1,10 @@
#include <Backups/BackupCoordinationRemote.h>
#include <Backups/BackupCoordinationStage.h>
#include <Backups/RestoreCoordinationRemote.h>
#include <Functions/UserDefined/UserDefinedSQLObjectType.h>
#include <Common/ZooKeeper/KeeperException.h>
#include <Common/escapeForFileName.h>
#include <Backups/BackupCoordinationStage.h>
#include <Backups/BackupCoordinationRemote.h>
#include "Backups/BackupCoordinationStageSync.h"
namespace DB
{
@ -13,23 +14,47 @@ namespace Stage = BackupCoordinationStage;
RestoreCoordinationRemote::RestoreCoordinationRemote(
zkutil::GetZooKeeper get_zookeeper_,
const String & root_zookeeper_path_,
const RestoreKeeperSettings & keeper_settings_,
const String & restore_uuid_,
const Strings & all_hosts_,
const String & current_host_,
bool is_internal_)
: get_zookeeper(get_zookeeper_)
, root_zookeeper_path(root_zookeeper_path_)
, keeper_settings(keeper_settings_)
, restore_uuid(restore_uuid_)
, zookeeper_path(root_zookeeper_path_ + "/restore-" + restore_uuid_)
, all_hosts(all_hosts_)
, current_host(current_host_)
, current_host_index(BackupCoordinationRemote::findCurrentHostIndex(all_hosts, current_host))
, is_internal(is_internal_)
, log(&Poco::Logger::get("RestoreCoordinationRemote"))
, with_retries(
log,
get_zookeeper_,
keeper_settings,
[zookeeper_path = zookeeper_path, current_host = current_host, is_internal = is_internal]
(WithRetries::FaultyKeeper & zk)
{
/// Recreate this ephemeral node to signal that we are alive.
if (is_internal)
{
String alive_node_path = zookeeper_path + "/stage/alive|" + current_host;
auto code = zk->tryCreate(alive_node_path, "", zkutil::CreateMode::Ephemeral);
if (code == Coordination::Error::ZNODEEXISTS)
zk->handleEphemeralNodeExistenceNoFailureInjection(alive_node_path, "");
else if (code != Coordination::Error::ZOK)
throw zkutil::KeeperException(code, alive_node_path);
}
})
{
createRootNodes();
stage_sync.emplace(
zookeeper_path + "/stage", [this] { return getZooKeeper(); }, &Poco::Logger::get("RestoreCoordination"));
zookeeper_path,
with_retries,
log);
}
RestoreCoordinationRemote::~RestoreCoordinationRemote()
@ -45,31 +70,25 @@ RestoreCoordinationRemote::~RestoreCoordinationRemote()
}
}
zkutil::ZooKeeperPtr RestoreCoordinationRemote::getZooKeeper() const
{
std::lock_guard lock{mutex};
if (!zookeeper || zookeeper->expired())
{
zookeeper = get_zookeeper();
/// It's possible that we connected to different [Zoo]Keeper instance
/// so we may read a bit stale state.
zookeeper->sync(zookeeper_path);
}
return zookeeper;
}
void RestoreCoordinationRemote::createRootNodes()
{
auto zk = getZooKeeper();
zk->createAncestors(zookeeper_path);
zk->createIfNotExists(zookeeper_path, "");
zk->createIfNotExists(zookeeper_path + "/repl_databases_tables_acquired", "");
zk->createIfNotExists(zookeeper_path + "/repl_tables_data_acquired", "");
zk->createIfNotExists(zookeeper_path + "/repl_access_storages_acquired", "");
zk->createIfNotExists(zookeeper_path + "/repl_sql_objects_acquired", "");
}
auto holder = with_retries.createRetriesControlHolder("createRootNodes");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zk);
zk->createAncestors(zookeeper_path);
Coordination::Requests ops;
Coordination::Responses responses;
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_databases_tables_acquired", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_tables_data_acquired", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_access_storages_acquired", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_sql_objects_acquired", "", zkutil::CreateMode::Persistent));
zk->tryMulti(ops, responses);
});
}
void RestoreCoordinationRemote::setStage(const String & new_stage, const String & message)
{
@ -91,66 +110,121 @@ Strings RestoreCoordinationRemote::waitForStage(const String & stage_to_wait, st
return stage_sync->waitFor(all_hosts, stage_to_wait, timeout);
}
bool RestoreCoordinationRemote::acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name)
{
auto zk = getZooKeeper();
bool result = false;
auto holder = with_retries.createRetriesControlHolder("acquireCreatingTableInReplicatedDatabase");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zk);
String path = zookeeper_path + "/repl_databases_tables_acquired/" + escapeForFileName(database_zk_path);
zk->createIfNotExists(path, "");
String path = zookeeper_path + "/repl_databases_tables_acquired/" + escapeForFileName(database_zk_path);
zk->createIfNotExists(path, "");
path += "/" + escapeForFileName(table_name);
auto code = zk->tryCreate(path, "", zkutil::CreateMode::Persistent);
if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
throw zkutil::KeeperException(code, path);
path += "/" + escapeForFileName(table_name);
auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent);
if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
throw zkutil::KeeperException(code, path);
return (code == Coordination::Error::ZOK);
if (code == Coordination::Error::ZOK)
{
result = true;
return;
}
/// We need to check who created that node
result = zk->get(path) == toString(current_host_index);
});
return result;
}
bool RestoreCoordinationRemote::acquireInsertingDataIntoReplicatedTable(const String & table_zk_path)
{
auto zk = getZooKeeper();
bool result = false;
auto holder = with_retries.createRetriesControlHolder("acquireInsertingDataIntoReplicatedTable");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zk);
String path = zookeeper_path + "/repl_tables_data_acquired/" + escapeForFileName(table_zk_path);
auto code = zk->tryCreate(path, "", zkutil::CreateMode::Persistent);
if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
throw zkutil::KeeperException(code, path);
String path = zookeeper_path + "/repl_tables_data_acquired/" + escapeForFileName(table_zk_path);
auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent);
if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
throw zkutil::KeeperException(code, path);
return (code == Coordination::Error::ZOK);
if (code == Coordination::Error::ZOK)
{
result = true;
return;
}
/// We need to check who created that node
result = zk->get(path) == toString(current_host_index);
});
return result;
}
bool RestoreCoordinationRemote::acquireReplicatedAccessStorage(const String & access_storage_zk_path)
{
auto zk = getZooKeeper();
bool result = false;
auto holder = with_retries.createRetriesControlHolder("acquireReplicatedAccessStorage");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zk);
String path = zookeeper_path + "/repl_access_storages_acquired/" + escapeForFileName(access_storage_zk_path);
auto code = zk->tryCreate(path, "", zkutil::CreateMode::Persistent);
if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
throw zkutil::KeeperException(code, path);
String path = zookeeper_path + "/repl_access_storages_acquired/" + escapeForFileName(access_storage_zk_path);
auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent);
if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
throw zkutil::KeeperException(code, path);
return (code == Coordination::Error::ZOK);
if (code == Coordination::Error::ZOK)
{
result = true;
return;
}
/// We need to check who created that node
result = zk->get(path) == toString(current_host_index);
});
return result;
}
bool RestoreCoordinationRemote::acquireReplicatedSQLObjects(const String & loader_zk_path, UserDefinedSQLObjectType object_type)
{
auto zk = getZooKeeper();
bool result = false;
auto holder = with_retries.createRetriesControlHolder("acquireReplicatedSQLObjects");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zk);
String path = zookeeper_path + "/repl_sql_objects_acquired/" + escapeForFileName(loader_zk_path);
zk->createIfNotExists(path, "");
String path = zookeeper_path + "/repl_sql_objects_acquired/" + escapeForFileName(loader_zk_path);
zk->createIfNotExists(path, "");
path += "/";
switch (object_type)
{
case UserDefinedSQLObjectType::Function:
path += "functions";
break;
}
path += "/";
switch (object_type)
{
case UserDefinedSQLObjectType::Function:
path += "functions";
break;
}
auto code = zk->tryCreate(path, "", zkutil::CreateMode::Persistent);
if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
throw zkutil::KeeperException(code, path);
auto code = zk->tryCreate(path, "", zkutil::CreateMode::Persistent);
if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
throw zkutil::KeeperException(code, path);
return (code == Coordination::Error::ZOK);
if (code == Coordination::Error::ZOK)
{
result = true;
return;
}
/// We need to check who created that node
result = zk->get(path) == toString(current_host_index);
});
return result;
}
void RestoreCoordinationRemote::removeAllNodes()
@ -162,8 +236,13 @@ void RestoreCoordinationRemote::removeAllNodes()
/// at `zookeeper_path` which might cause such hosts to stop with exception "ZNONODE". Or such hosts might still do some part
/// of their restore work before that.
auto zk = getZooKeeper();
zk->removeRecursive(zookeeper_path);
auto holder = with_retries.createRetriesControlHolder("removeAllNodes");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
with_retries.renewZooKeeper(zk);
zk->removeRecursive(zookeeper_path);
});
}
bool RestoreCoordinationRemote::hasConcurrentRestores(const std::atomic<size_t> &) const
@ -172,43 +251,58 @@ bool RestoreCoordinationRemote::hasConcurrentRestores(const std::atomic<size_t>
if (is_internal)
return false;
auto zk = getZooKeeper();
bool result = false;
std::string path = zookeeper_path + "/stage";
if (! zk->exists(root_zookeeper_path))
zk->createAncestors(root_zookeeper_path);
for (size_t attempt = 0; attempt < MAX_ZOOKEEPER_ATTEMPTS; ++attempt)
{
Coordination::Stat stat;
zk->get(root_zookeeper_path, &stat);
Strings existing_restore_paths = zk->getChildren(root_zookeeper_path);
for (const auto & existing_restore_path : existing_restore_paths)
auto holder = with_retries.createRetriesControlHolder("createRootNodes");
holder.retries_ctl.retryLoop(
[&, &zk = holder.faulty_zookeeper]()
{
if (startsWith(existing_restore_path, "backup-"))
continue;
with_retries.renewZooKeeper(zk);
String existing_restore_uuid = existing_restore_path;
existing_restore_uuid.erase(0, String("restore-").size());
if (! zk->exists(root_zookeeper_path))
zk->createAncestors(root_zookeeper_path);
if (existing_restore_uuid == toString(restore_uuid))
continue;
for (size_t attempt = 0; attempt < MAX_ZOOKEEPER_ATTEMPTS; ++attempt)
{
Coordination::Stat stat;
zk->get(root_zookeeper_path, &stat);
Strings existing_restore_paths = zk->getChildren(root_zookeeper_path);
for (const auto & existing_restore_path : existing_restore_paths)
{
if (startsWith(existing_restore_path, "backup-"))
continue;
String existing_restore_uuid = existing_restore_path;
existing_restore_uuid.erase(0, String("restore-").size());
const auto status = zk->get(root_zookeeper_path + "/" + existing_restore_path + "/stage");
if (status != Stage::COMPLETED)
return true;
}
if (existing_restore_uuid == toString(restore_uuid))
continue;
zk->createIfNotExists(path, "");
auto code = zk->trySet(path, Stage::SCHEDULED_TO_START, stat.version);
if (code == Coordination::Error::ZOK)
break;
bool is_last_attempt = (attempt == MAX_ZOOKEEPER_ATTEMPTS - 1);
if ((code != Coordination::Error::ZBADVERSION) || is_last_attempt)
throw zkutil::KeeperException(code, path);
}
return false;
String status;
if (zk->tryGet(root_zookeeper_path + "/" + existing_restore_path + "/stage", status))
{
/// If status is not COMPLETED it could be because the restore failed, check if 'error' exists
if (status != Stage::COMPLETED && !zk->exists(root_zookeeper_path + "/" + existing_restore_path + "/error"))
{
LOG_WARNING(log, "Found a concurrent restore: {}, current restore: {}", existing_restore_uuid, toString(restore_uuid));
result = true;
return;
}
}
}
zk->createIfNotExists(path, "");
auto code = zk->trySet(path, Stage::SCHEDULED_TO_START, stat.version);
if (code == Coordination::Error::ZOK)
break;
bool is_last_attempt = (attempt == MAX_ZOOKEEPER_ATTEMPTS - 1);
if ((code != Coordination::Error::ZBADVERSION) || is_last_attempt)
throw zkutil::KeeperException(code, path);
}
});
return result;
}
}
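All the acquire*() methods above share one idempotent pattern that survives retries: create a persistent node whose value is the creator's host index, and on ZNODEEXISTS re-read the node to check whether this very host created it on an earlier attempt. A condensed sketch; path is a placeholder:

// Hypothetical distillation of the acquire pattern used above.
bool acquired = false;
auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent);
if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNODEEXISTS)
    throw zkutil::KeeperException(code, path);
if (code == Coordination::Error::ZOK)
    acquired = true; // we took it first
else
    acquired = (zk->get(path) == toString(current_host_index)); // ours, from a retried attempt?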

View File

@ -2,6 +2,7 @@
#include <Backups/IRestoreCoordination.h>
#include <Backups/BackupCoordinationStageSync.h>
#include <Backups/WithRetries.h>
namespace DB
@ -11,9 +12,12 @@ namespace DB
class RestoreCoordinationRemote : public IRestoreCoordination
{
public:
using RestoreKeeperSettings = WithRetries::KeeperSettings;
RestoreCoordinationRemote(
zkutil::GetZooKeeper get_zookeeper_,
const String & root_zookeeper_path_,
const RestoreKeeperSettings & keeper_settings_,
const String & restore_uuid_,
const Strings & all_hosts_,
const String & current_host_,
@ -45,25 +49,26 @@ public:
bool hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const override;
private:
zkutil::ZooKeeperPtr getZooKeeper() const;
void createRootNodes();
void removeAllNodes();
class ReplicatedDatabasesMetadataSync;
/// get_zookeeper will provide a zookeeper client without any fault injection
const zkutil::GetZooKeeper get_zookeeper;
const String root_zookeeper_path;
const RestoreKeeperSettings keeper_settings;
const String restore_uuid;
const String zookeeper_path;
const Strings all_hosts;
const String current_host;
const size_t current_host_index;
const bool is_internal;
Poco::Logger * const log;
mutable WithRetries with_retries;
std::optional<BackupCoordinationStageSync> stage_sync;
mutable std::mutex mutex;
mutable zkutil::ZooKeeperPtr zookeeper;
};
}

View File

@ -0,0 +1,61 @@
#include <mutex>
#include <Backups/WithRetries.h>
namespace DB
{
WithRetries::WithRetries(Poco::Logger * log_, zkutil::GetZooKeeper get_zookeeper_, const KeeperSettings & settings_, RenewerCallback callback_)
: log(log_)
, get_zookeeper(get_zookeeper_)
, settings(settings_)
, callback(callback_)
, global_zookeeper_retries_info(
log->name(),
log,
settings.keeper_max_retries,
settings.keeper_retry_initial_backoff_ms,
settings.keeper_retry_max_backoff_ms)
{}
WithRetries::RetriesControlHolder::RetriesControlHolder(const WithRetries * parent, const String & name)
: info(parent->global_zookeeper_retries_info)
, retries_ctl(name, info)
, faulty_zookeeper(parent->getFaultyZooKeeper())
{}
WithRetries::RetriesControlHolder WithRetries::createRetriesControlHolder(const String & name)
{
return RetriesControlHolder(this, name);
}
void WithRetries::renewZooKeeper(FaultyKeeper my_faulty_zookeeper) const
{
std::lock_guard lock(zookeeper_mutex);
if (!zookeeper || zookeeper->expired())
{
zookeeper = get_zookeeper();
my_faulty_zookeeper->setKeeper(zookeeper);
callback(my_faulty_zookeeper);
}
}
WithRetries::FaultyKeeper WithRetries::getFaultyZooKeeper() const
{
/// We need to create a new instance of ZooKeeperWithFaultInjection each time we copy a pointer to the ZooKeeper client into it.
/// The reason is that ZooKeeperWithFaultInjection may reset the underlying pointer, and there could be a race condition
/// when the same object is used from multiple threads.
auto faulty_zookeeper = ZooKeeperWithFaultInjection::createInstance(
settings.keeper_fault_injection_probability,
settings.keeper_fault_injection_seed,
zookeeper,
log->name(),
log);
return faulty_zookeeper;
}
}

src/Backups/WithRetries.h
View File

@ -0,0 +1,79 @@
#pragma once
#include <Storages/MergeTree/ZooKeeperRetries.h>
#include <Common/ZooKeeper/Common.h>
#include <Common/ZooKeeper/ZooKeeperWithFaultInjection.h>
namespace DB
{
/// In backups every request to [Zoo]Keeper should be retryable
/// and this tiny class encapsulates all the machinery to make it possible -
/// a [Zoo]Keeper client which injects faults with configurable probability
/// and a retries controller which performs retries with growing backoff.
class WithRetries
{
public:
using FaultyKeeper = Coordination::ZooKeeperWithFaultInjection::Ptr;
using RenewerCallback = std::function<void(FaultyKeeper &)>;
struct KeeperSettings
{
UInt64 keeper_max_retries{0};
UInt64 keeper_retry_initial_backoff_ms{0};
UInt64 keeper_retry_max_backoff_ms{0};
UInt64 batch_size_for_keeper_multiread{10000};
Float64 keeper_fault_injection_probability{0};
UInt64 keeper_fault_injection_seed{42};
UInt64 keeper_value_max_size{1048576};
};
/// For simplicity a separate ZooKeeperRetriesInfo and a faulty [Zoo]Keeper client
/// are stored in one place.
/// This helps to avoid writing too much boilerplate each time we need to
/// execute some operation (a set of requests) over [Zoo]Keeper with retries.
/// Why is ZooKeeperRetriesInfo separate for each operation?
/// The reason is that a backup usually takes a long time to finish, and it makes no sense
/// to limit the overall number of retries (for example 1000) for the whole backup
/// and have a continuously growing backoff.
class RetriesControlHolder
{
public:
ZooKeeperRetriesInfo info;
ZooKeeperRetriesControl retries_ctl;
FaultyKeeper faulty_zookeeper;
private:
friend class WithRetries;
RetriesControlHolder(const WithRetries * parent, const String & name);
};
RetriesControlHolder createRetriesControlHolder(const String & name);
WithRetries(Poco::Logger * log, zkutil::GetZooKeeper get_zookeeper_, const KeeperSettings & settings, RenewerCallback callback);
/// Used to re-establish the connection inside a retry loop.
void renewZooKeeper(FaultyKeeper my_faulty_zookeeper) const;
private:
/// This will provide a special wrapper which is useful for testing
FaultyKeeper getFaultyZooKeeper() const;
Poco::Logger * log;
zkutil::GetZooKeeper get_zookeeper;
KeeperSettings settings;
/// This callback is called each time when a new [Zoo]Keeper session is created.
/// In backups it is primarily used to re-create an ephemeral node to signal the coordinator
/// that the host is alive and able to continue writing the backup.
/// The coordinator (or the initiator) of the backup also retries when it doesn't find an ephemeral node
/// for a particular host.
/// Again, this scheme is not ideal. False positives are still possible, but in the worst-case scenario
/// it merely leads to a failed backup which could possibly have succeeded
/// if there were a few more retries.
RenewerCallback callback;
ZooKeeperRetriesInfo global_zookeeper_retries_info;
/// This is needed only to protect the zookeeper object
mutable std::mutex zookeeper_mutex;
mutable zkutil::ZooKeeperPtr zookeeper;
};
}
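Putting the pieces together, a typical caller looks roughly like this; the operation name and the request inside the loop are placeholders:

// Hypothetical usage: one holder per logical operation, renewing the
// (possibly fault-injecting) client at the start of every attempt.
auto holder = with_retries.createRetriesControlHolder("myOperation");
holder.retries_ctl.retryLoop(
    [&, &zk = holder.faulty_zookeeper]()
    {
        with_retries.renewZooKeeper(zk);      // re-establish the session if it expired
        zk->createIfNotExists(some_path, ""); // some_path is a placeholder
    });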

View File

@ -178,9 +178,9 @@ void registerBackupEnginesFileAndDisk(BackupFactory & factory)
{
std::shared_ptr<IBackupWriter> writer;
if (engine_name == "File")
writer = std::make_shared<BackupWriterFile>(path);
writer = std::make_shared<BackupWriterFile>(path, params.context);
else
writer = std::make_shared<BackupWriterDisk>(disk, path);
writer = std::make_shared<BackupWriterDisk>(disk, path, params.context);
return std::make_unique<BackupImpl>(
backup_name_for_logging,
archive_params,

View File

@ -2,4 +2,4 @@ add_library (bridge
IBridge.cpp
)
target_link_libraries (bridge PRIVATE daemon dbms Poco::Data Poco::Data::ODBC)
target_link_libraries (bridge PRIVATE daemon dbms)
