diff --git a/.clang-tidy b/.clang-tidy index 896052915f7..de19059d09e 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -37,7 +37,6 @@ Checks: [ '-cert-oop54-cpp', '-cert-oop57-cpp', - '-clang-analyzer-optin.core.EnumCastOutOfRange', # https://github.com/abseil/abseil-cpp/issues/1667 '-clang-analyzer-optin.performance.Padding', '-clang-analyzer-unix.Malloc', diff --git a/.gitmodules b/.gitmodules index 28696428e8c..12d865307d8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -91,13 +91,13 @@ [submodule "contrib/aws"] path = contrib/aws url = https://github.com/ClickHouse/aws-sdk-cpp -[submodule "aws-c-event-stream"] +[submodule "contrib/aws-c-event-stream"] path = contrib/aws-c-event-stream url = https://github.com/awslabs/aws-c-event-stream -[submodule "aws-c-common"] +[submodule "contrib/aws-c-common"] path = contrib/aws-c-common url = https://github.com/awslabs/aws-c-common.git -[submodule "aws-checksums"] +[submodule "contrib/aws-checksums"] path = contrib/aws-checksums url = https://github.com/awslabs/aws-checksums [submodule "contrib/curl"] @@ -163,7 +163,7 @@ url = https://github.com/xz-mirror/xz [submodule "contrib/abseil-cpp"] path = contrib/abseil-cpp - url = https://github.com/abseil/abseil-cpp + url = https://github.com/ClickHouse/abseil-cpp.git [submodule "contrib/dragonbox"] path = contrib/dragonbox url = https://github.com/ClickHouse/dragonbox diff --git a/contrib/abseil-cpp b/contrib/abseil-cpp index 3bd86026c93..a3c4dd3e77f 160000 --- a/contrib/abseil-cpp +++ b/contrib/abseil-cpp @@ -1 +1 @@ -Subproject commit 3bd86026c93da5a40006fd53403dff9d5f5e30e3 +Subproject commit a3c4dd3e77f28b526efbb0eb394b72e29c633936 diff --git a/contrib/abseil-cpp-cmake/CMakeLists.txt b/contrib/abseil-cpp-cmake/CMakeLists.txt index 7372195bb0d..4137547b736 100644 --- a/contrib/abseil-cpp-cmake/CMakeLists.txt +++ b/contrib/abseil-cpp-cmake/CMakeLists.txt @@ -1,6 +1,8 @@ set(ABSL_ROOT_DIR "${ClickHouse_SOURCE_DIR}/contrib/abseil-cpp") set(ABSL_COMMON_INCLUDE_DIRS "${ABSL_ROOT_DIR}") +# This is a minimized version of the function definition in CMake/AbseilHelpers.cmake + # # Copyright 2017 The Abseil Authors. # @@ -16,7 +18,6 @@ set(ABSL_COMMON_INCLUDE_DIRS "${ABSL_ROOT_DIR}") # See the License for the specific language governing permissions and # limitations under the License. # - function(absl_cc_library) cmake_parse_arguments(ABSL_CC_LIB "DISABLE_INSTALL;PUBLIC;TESTONLY" @@ -76,6 +77,12 @@ function(absl_cc_library) add_library(absl::${ABSL_CC_LIB_NAME} ALIAS ${_NAME}) endfunction() +# The following definitions are an amalgamation of the CMakeLists.txt files in absl/*/ +# To refresh them when upgrading to a new version: +# - copy them over from upstream +# - remove calls of 'absl_cc_test' +# - remove calls of `absl_cc_library` that contain `TESTONLY` +# - append '${DIR}' to the file definitions set(DIR ${ABSL_ROOT_DIR}/absl/algorithm) @@ -102,12 +109,12 @@ absl_cc_library( absl::algorithm absl::core_headers absl::meta + absl::nullability PUBLIC ) set(DIR ${ABSL_ROOT_DIR}/absl/base) -# Internal-only target, do not depend on directly. 
absl_cc_library( NAME atomic_hook @@ -146,6 +153,18 @@ absl_cc_library( ${ABSL_DEFAULT_COPTS} ) +absl_cc_library( + NAME + no_destructor + HDRS + "${DIR}/no_destructor.h" + DEPS + absl::config + absl::nullability + COPTS + ${ABSL_DEFAULT_COPTS} +) + absl_cc_library( NAME nullability @@ -305,6 +324,8 @@ absl_cc_library( ${ABSL_DEFAULT_COPTS} LINKOPTS ${ABSL_DEFAULT_LINKOPTS} + $<$:-lrt> + $<$:-ladvapi32> DEPS absl::atomic_hook absl::base_internal @@ -312,6 +333,7 @@ absl_cc_library( absl::core_headers absl::dynamic_annotations absl::log_severity + absl::nullability absl::raw_logging_internal absl::spinlock_wait absl::type_traits @@ -357,6 +379,7 @@ absl_cc_library( absl::base absl::config absl::core_headers + absl::nullability PUBLIC ) @@ -467,10 +490,11 @@ absl_cc_library( LINKOPTS ${ABSL_DEFAULT_LINKOPTS} DEPS - absl::container_common absl::common_policy_traits absl::compare absl::compressed_tuple + absl::config + absl::container_common absl::container_memory absl::cord absl::core_headers @@ -480,7 +504,6 @@ absl_cc_library( absl::strings absl::throw_delegate absl::type_traits - absl::utility ) # Internal-only target, do not depend on directly. @@ -523,7 +546,9 @@ absl_cc_library( COPTS ${ABSL_DEFAULT_COPTS} DEPS + absl::base_internal absl::compressed_tuple + absl::config absl::core_headers absl::memory absl::span @@ -548,18 +573,6 @@ absl_cc_library( PUBLIC ) -# Internal-only target, do not depend on directly. -absl_cc_library( - NAME - counting_allocator - HDRS - "${DIR}/internal/counting_allocator.h" - COPTS - ${ABSL_DEFAULT_COPTS} - DEPS - absl::config -) - absl_cc_library( NAME flat_hash_map @@ -570,7 +583,7 @@ absl_cc_library( DEPS absl::container_memory absl::core_headers - absl::hash_function_defaults + absl::hash_container_defaults absl::raw_hash_map absl::algorithm_container absl::memory @@ -586,7 +599,7 @@ absl_cc_library( ${ABSL_DEFAULT_COPTS} DEPS absl::container_memory - absl::hash_function_defaults + absl::hash_container_defaults absl::raw_hash_set absl::algorithm_container absl::core_headers @@ -604,7 +617,7 @@ absl_cc_library( DEPS absl::container_memory absl::core_headers - absl::hash_function_defaults + absl::hash_container_defaults absl::node_slot_policy absl::raw_hash_map absl::algorithm_container @@ -620,8 +633,9 @@ absl_cc_library( COPTS ${ABSL_DEFAULT_COPTS} DEPS + absl::container_memory absl::core_headers - absl::hash_function_defaults + absl::hash_container_defaults absl::node_slot_policy absl::raw_hash_set absl::algorithm_container @@ -629,6 +643,19 @@ absl_cc_library( PUBLIC ) +absl_cc_library( + NAME + hash_container_defaults + HDRS + "${DIR}/hash_container_defaults.h" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::config + absl::hash_function_defaults + PUBLIC +) + # Internal-only target, do not depend on directly. 
absl_cc_library( NAME @@ -655,9 +682,11 @@ absl_cc_library( ${ABSL_DEFAULT_COPTS} DEPS absl::config + absl::container_common absl::cord absl::hash absl::strings + absl::type_traits PUBLIC ) @@ -703,6 +732,7 @@ absl_cc_library( absl::base absl::config absl::exponential_biased + absl::no_destructor absl::raw_logging_internal absl::sample_recorder absl::synchronization @@ -756,7 +786,9 @@ absl_cc_library( COPTS ${ABSL_DEFAULT_COPTS} DEPS + absl::config absl::container_memory + absl::core_headers absl::raw_hash_set absl::throw_delegate PUBLIC @@ -817,6 +849,7 @@ absl_cc_library( DEPS absl::config absl::core_headers + absl::debugging_internal absl::meta absl::strings absl::span @@ -931,6 +964,7 @@ absl_cc_library( absl::crc32c absl::config absl::strings + absl::no_destructor ) set(DIR ${ABSL_ROOT_DIR}/absl/debugging) @@ -954,6 +988,8 @@ absl_cc_library( "${DIR}/stacktrace.cc" COPTS ${ABSL_DEFAULT_COPTS} + LINKOPTS + $<$:${EXECINFO_LIBRARY}> DEPS absl::debugging_internal absl::config @@ -980,6 +1016,7 @@ absl_cc_library( ${ABSL_DEFAULT_COPTS} LINKOPTS ${ABSL_DEFAULT_LINKOPTS} + $<$:-ldbghelp> DEPS absl::debugging_internal absl::demangle_internal @@ -1058,8 +1095,10 @@ absl_cc_library( demangle_internal HDRS "${DIR}/internal/demangle.h" + "${DIR}/internal/demangle_rust.h" SRCS "${DIR}/internal/demangle.cc" + "${DIR}/internal/demangle_rust.cc" COPTS ${ABSL_DEFAULT_COPTS} DEPS @@ -1252,6 +1291,7 @@ absl_cc_library( absl::strings absl::synchronization absl::flat_hash_map + absl::no_destructor ) # Internal-only target, do not depend on directly. @@ -1283,12 +1323,9 @@ absl_cc_library( absl_cc_library( NAME flags - SRCS - "${DIR}/flag.cc" HDRS "${DIR}/declare.h" "${DIR}/flag.h" - "${DIR}/internal/flag_msvc.inc" COPTS ${ABSL_DEFAULT_COPTS} LINKOPTS @@ -1299,7 +1336,6 @@ absl_cc_library( absl::flags_config absl::flags_internal absl::flags_reflection - absl::base absl::core_headers absl::strings ) @@ -1379,6 +1415,9 @@ absl_cc_library( absl::synchronization ) +############################################################################ +# Unit tests in alphabetical order. + set(DIR ${ABSL_ROOT_DIR}/absl/functional) absl_cc_library( @@ -1431,6 +1470,18 @@ absl_cc_library( PUBLIC ) +absl_cc_library( + NAME + overload + HDRS + "${DIR}/overload.h" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::meta + PUBLIC +) + set(DIR ${ABSL_ROOT_DIR}/absl/hash) absl_cc_library( @@ -1640,6 +1691,7 @@ absl_cc_library( absl::log_internal_conditions absl::log_internal_message absl::log_internal_strip + absl::absl_vlog_is_on ) absl_cc_library( @@ -1721,6 +1773,7 @@ absl_cc_library( absl::log_entry absl::log_severity absl::log_sink + absl::no_destructor absl::raw_logging_internal absl::synchronization absl::span @@ -1771,6 +1824,7 @@ absl_cc_library( LINKOPTS ${ABSL_DEFAULT_LINKOPTS} DEPS + absl::core_headers absl::log_internal_message absl::log_internal_nullstream absl::log_severity @@ -1876,6 +1930,11 @@ absl_cc_library( PUBLIC ) +# Warning: Many linkers will strip the contents of this library because its +# symbols are only used in a global constructor. A workaround is for clients +# to link this using $ instead of +# the plain absl::log_flags. +# TODO(b/320467376): Implement the equivalent of Bazel's alwayslink=True. 
absl_cc_library( NAME log_flags @@ -1897,6 +1956,7 @@ absl_cc_library( absl::flags absl::flags_marshalling absl::strings + absl::vlog_config_internal PUBLIC ) @@ -1919,6 +1979,7 @@ absl_cc_library( absl::log_severity absl::raw_logging_internal absl::strings + absl::vlog_config_internal ) absl_cc_library( @@ -1952,6 +2013,7 @@ absl_cc_library( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::log_internal_log_impl + absl::vlog_is_on PUBLIC ) @@ -2064,21 +2126,75 @@ absl_cc_library( ) absl_cc_library( - NAME - log_internal_fnmatch - SRCS - "${DIR}/internal/fnmatch.cc" - HDRS - "${DIR}/internal/fnmatch.h" - COPTS - ${ABSL_DEFAULT_COPTS} - LINKOPTS - ${ABSL_DEFAULT_LINKOPTS} - DEPS - absl::config - absl::strings + NAME + vlog_config_internal + SRCS + "${DIR}/internal/vlog_config.cc" + HDRS + "${DIR}/internal/vlog_config.h" + COPTS + ${ABSL_DEFAULT_COPTS} + LINKOPTS + ${ABSL_DEFAULT_LINKOPTS} + DEPS + absl::base + absl::config + absl::core_headers + absl::log_internal_fnmatch + absl::memory + absl::no_destructor + absl::strings + absl::synchronization + absl::optional ) +absl_cc_library( + NAME + absl_vlog_is_on + COPTS + ${ABSL_DEFAULT_COPTS} + LINKOPTS + ${ABSL_DEFAULT_LINKOPTS} + HDRS + "${DIR}/absl_vlog_is_on.h" + DEPS + absl::vlog_config_internal + absl::config + absl::core_headers + absl::strings +) + +absl_cc_library( + NAME + vlog_is_on + COPTS + ${ABSL_DEFAULT_COPTS} + LINKOPTS + ${ABSL_DEFAULT_LINKOPTS} + HDRS + "${DIR}/vlog_is_on.h" + DEPS + absl::absl_vlog_is_on +) + +absl_cc_library( + NAME + log_internal_fnmatch + SRCS + "${DIR}/internal/fnmatch.cc" + HDRS + "${DIR}/internal/fnmatch.h" + COPTS + ${ABSL_DEFAULT_COPTS} + LINKOPTS + ${ABSL_DEFAULT_LINKOPTS} + DEPS + absl::config + absl::strings +) + +# Test targets + set(DIR ${ABSL_ROOT_DIR}/absl/memory) absl_cc_library( @@ -2147,6 +2263,7 @@ absl_cc_library( COPTS ${ABSL_DEFAULT_COPTS} DEPS + absl::compare absl::config absl::core_headers absl::bits @@ -2176,6 +2293,8 @@ absl_cc_library( PUBLIC ) +set(DIR ${ABSL_ROOT_DIR}/absl/profiling) + absl_cc_library( NAME sample_recorder @@ -2188,8 +2307,6 @@ absl_cc_library( absl::synchronization ) -set(DIR ${ABSL_ROOT_DIR}/absl/profiling) - absl_cc_library( NAME exponential_biased @@ -2265,6 +2382,7 @@ absl_cc_library( LINKOPTS ${ABSL_DEFAULT_LINKOPTS} DEPS + absl::config absl::fast_type_id absl::optional ) @@ -2336,11 +2454,13 @@ absl_cc_library( DEPS absl::config absl::inlined_vector + absl::nullability absl::random_internal_pool_urbg absl::random_internal_salted_seed_seq absl::random_internal_seed_material absl::random_seed_gen_exception absl::span + absl::string_view ) # Internal-only target, do not depend on directly. @@ -2399,6 +2519,7 @@ absl_cc_library( ${ABSL_DEFAULT_COPTS} LINKOPTS ${ABSL_DEFAULT_LINKOPTS} + $<$:-lbcrypt> DEPS absl::core_headers absl::optional @@ -2658,6 +2779,29 @@ absl_cc_library( absl::config ) +# Internal-only target, do not depend on directly. +absl_cc_library( + NAME + random_internal_distribution_test_util + SRCS + "${DIR}/internal/chi_square.cc" + "${DIR}/internal/distribution_test_util.cc" + HDRS + "${DIR}/internal/chi_square.h" + "${DIR}/internal/distribution_test_util.h" + COPTS + ${ABSL_DEFAULT_COPTS} + LINKOPTS + ${ABSL_DEFAULT_LINKOPTS} + DEPS + absl::config + absl::core_headers + absl::raw_logging_internal + absl::strings + absl::str_format + absl::span +) + # Internal-only target, do not depend on directly. 
absl_cc_library( NAME @@ -2699,6 +2843,8 @@ absl_cc_library( absl::function_ref absl::inlined_vector absl::memory + absl::no_destructor + absl::nullability absl::optional absl::raw_logging_internal absl::span @@ -2724,8 +2870,11 @@ absl_cc_library( absl::base absl::config absl::core_headers + absl::has_ostream_operator + absl::nullability absl::raw_logging_internal absl::status + absl::str_format absl::strings absl::type_traits absl::utility @@ -2748,6 +2897,7 @@ absl_cc_library( absl::base absl::config absl::core_headers + absl::nullability absl::throw_delegate PUBLIC ) @@ -2762,6 +2912,7 @@ absl_cc_library( "${DIR}/has_absl_stringify.h" "${DIR}/internal/damerau_levenshtein_distance.h" "${DIR}/internal/string_constant.h" + "${DIR}/internal/has_absl_stringify.h" "${DIR}/match.h" "${DIR}/numbers.h" "${DIR}/str_cat.h" @@ -2805,6 +2956,7 @@ absl_cc_library( absl::endian absl::int128 absl::memory + absl::nullability absl::raw_logging_internal absl::throw_delegate absl::type_traits @@ -2824,6 +2976,18 @@ absl_cc_library( PUBLIC ) +absl_cc_library( + NAME + has_ostream_operator + HDRS + "${DIR}/has_ostream_operator.h" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::config + PUBLIC +) + # Internal-only target, do not depend on directly. absl_cc_library( NAME @@ -2855,7 +3019,12 @@ absl_cc_library( COPTS ${ABSL_DEFAULT_COPTS} DEPS + absl::config + absl::core_headers + absl::nullability + absl::span absl::str_format_internal + absl::string_view PUBLIC ) @@ -2886,6 +3055,7 @@ absl_cc_library( absl::strings absl::config absl::core_headers + absl::fixed_array absl::inlined_vector absl::numeric_representation absl::type_traits @@ -2989,6 +3159,7 @@ absl_cc_library( DEPS absl::base absl::config + absl::no_destructor absl::raw_logging_internal absl::synchronization ) @@ -3079,6 +3250,7 @@ absl_cc_library( absl::endian absl::function_ref absl::inlined_vector + absl::nullability absl::optional absl::raw_logging_internal absl::span @@ -3246,6 +3418,8 @@ absl_cc_library( ${ABSL_DEFAULT_COPTS} DEPS Threads::Threads + # TODO(#1495): Use $ once our + # minimum CMake version >= 3.24 $<$:-Wl,-framework,CoreFoundation> ) @@ -3286,8 +3460,8 @@ absl_cc_library( NAME bad_any_cast_impl SRCS - "${DIR}/bad_any_cast.h" - "${DIR}/bad_any_cast.cc" + "${DIR}/bad_any_cast.h" + "${DIR}/bad_any_cast.cc" COPTS ${ABSL_DEFAULT_COPTS} DEPS @@ -3307,6 +3481,7 @@ absl_cc_library( DEPS absl::algorithm absl::core_headers + absl::nullability absl::throw_delegate absl::type_traits PUBLIC @@ -3327,6 +3502,7 @@ absl_cc_library( absl::config absl::core_headers absl::memory + absl::nullability absl::type_traits absl::utility PUBLIC @@ -3389,6 +3565,7 @@ absl_cc_library( COPTS ${ABSL_DEFAULT_COPTS} DEPS + absl::config absl::core_headers absl::type_traits PUBLIC diff --git a/contrib/cld2 b/contrib/cld2 index bc6d493a2f6..217ba8b8805 160000 --- a/contrib/cld2 +++ b/contrib/cld2 @@ -1 +1 @@ -Subproject commit bc6d493a2f64ed1fc1c4c4b4294a542a04e04217 +Subproject commit 217ba8b8805b41557faadaa47bb6e99f2242eea3 diff --git a/contrib/googletest b/contrib/googletest index e47544ad31c..a7f443b80b1 160000 --- a/contrib/googletest +++ b/contrib/googletest @@ -1 +1 @@ -Subproject commit e47544ad31cb3ceecd04cc13e8fe556f8df9fe0b +Subproject commit a7f443b80b105f940225332ed3c31f2790092f47 diff --git a/contrib/openssl b/contrib/openssl index f7b8721dfc6..67c0b63e578 160000 --- a/contrib/openssl +++ b/contrib/openssl @@ -1 +1 @@ -Subproject commit f7b8721dfc66abb147f24ca07b9c9d1d64f40f71 +Subproject commit 67c0b63e578e4c751ac9edf490f5a96124fff8dc diff --git 
a/docker/images.json b/docker/images.json index 7439517379b..716b76ee217 100644 --- a/docker/images.json +++ b/docker/images.json @@ -41,8 +41,7 @@ "docker/test/stateless": { "name": "clickhouse/stateless-test", "dependent": [ - "docker/test/stateful", - "docker/test/unit" + "docker/test/stateful" ] }, "docker/test/stateful": { @@ -122,15 +121,16 @@ "docker/test/base": { "name": "clickhouse/test-base", "dependent": [ + "docker/test/clickbench", "docker/test/fuzzer", - "docker/test/libfuzzer", "docker/test/integration/base", "docker/test/keeper-jepsen", + "docker/test/libfuzzer", "docker/test/server-jepsen", "docker/test/sqllogic", "docker/test/sqltest", - "docker/test/clickbench", - "docker/test/stateless" + "docker/test/stateless", + "docker/test/unit" ] }, "docker/test/integration/kerberized_hadoop": { diff --git a/docker/test/style/Dockerfile b/docker/test/style/Dockerfile index 91768c8328d..54fab849301 100644 --- a/docker/test/style/Dockerfile +++ b/docker/test/style/Dockerfile @@ -33,6 +33,7 @@ RUN pip3 install \ flake8==4.0.1 \ requests \ thefuzz \ + tqdm==4.66.4 \ types-requests \ unidiff \ && rm -rf /root/.cache/pip diff --git a/docker/test/unit/Dockerfile b/docker/test/unit/Dockerfile index cf5ba1eec7f..af44dc930b2 100644 --- a/docker/test/unit/Dockerfile +++ b/docker/test/unit/Dockerfile @@ -1,9 +1,7 @@ # rebuild in #33610 # docker build -t clickhouse/unit-test . ARG FROM_TAG=latest -FROM clickhouse/stateless-test:$FROM_TAG - -RUN apt-get install gdb +FROM clickhouse/test-base:$FROM_TAG COPY run.sh / CMD ["/bin/bash", "/run.sh"] diff --git a/docs/changelogs/v24.1.6.52-stable.md b/docs/changelogs/v24.1.6.52-stable.md new file mode 100644 index 00000000000..341561e9a64 --- /dev/null +++ b/docs/changelogs/v24.1.6.52-stable.md @@ -0,0 +1,45 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.1.6.52-stable (fa09f677bc9) FIXME as compared to v24.1.5.6-stable (7f67181ff31) + +#### Improvement +* Backported in [#60292](https://github.com/ClickHouse/ClickHouse/issues/60292): Copy S3 file GCP fallback to buffer copy in case GCP returned `Internal Error` with `GATEWAY_TIMEOUT` HTTP error code. [#60164](https://github.com/ClickHouse/ClickHouse/pull/60164) ([Maksim Kita](https://github.com/kitaisreal)). +* Backported in [#60832](https://github.com/ClickHouse/ClickHouse/issues/60832): Update tzdata to 2024a. [#60768](https://github.com/ClickHouse/ClickHouse/pull/60768) ([Raúl Marín](https://github.com/Algunenano)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Backported in [#60413](https://github.com/ClickHouse/ClickHouse/issues/60413): Fix segmentation fault in KQL parser when the input query exceeds the `max_query_size`. Also re-enable the KQL dialect. Fixes [#59036](https://github.com/ClickHouse/ClickHouse/issues/59036) and [#59037](https://github.com/ClickHouse/ClickHouse/issues/59037). [#59626](https://github.com/ClickHouse/ClickHouse/pull/59626) ([Yong Wang](https://github.com/kashwy)). +* Backported in [#60074](https://github.com/ClickHouse/ClickHouse/issues/60074): Fix error `Read beyond last offset` for `AsynchronousBoundedReadBuffer`. [#59630](https://github.com/ClickHouse/ClickHouse/pull/59630) ([Vitaly Baranov](https://github.com/vitlibar)). +* Backported in [#60299](https://github.com/ClickHouse/ClickHouse/issues/60299): Fix having neigher acked nor nacked messages. If exception happens during read-write phase, messages will be nacked. 
[#59775](https://github.com/ClickHouse/ClickHouse/pull/59775) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#60066](https://github.com/ClickHouse/ClickHouse/issues/60066): Fix optimize_uniq_to_count removing the column alias. [#60026](https://github.com/ClickHouse/ClickHouse/pull/60026) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#60638](https://github.com/ClickHouse/ClickHouse/issues/60638): Fixed a bug in parallel optimization for queries with `FINAL`, which could give an incorrect result in rare cases. [#60041](https://github.com/ClickHouse/ClickHouse/pull/60041) ([Maksim Kita](https://github.com/kitaisreal)). +* Backported in [#60177](https://github.com/ClickHouse/ClickHouse/issues/60177): Fix cosineDistance crash with Nullable. [#60150](https://github.com/ClickHouse/ClickHouse/pull/60150) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#60279](https://github.com/ClickHouse/ClickHouse/issues/60279): Hide sensitive info for `S3Queue` table engine. [#60233](https://github.com/ClickHouse/ClickHouse/pull/60233) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#61000](https://github.com/ClickHouse/ClickHouse/issues/61000): Reduce the number of read rows from `system.numbers`. Fixes [#59418](https://github.com/ClickHouse/ClickHouse/issues/59418). [#60546](https://github.com/ClickHouse/ClickHouse/pull/60546) ([JackyWoo](https://github.com/JackyWoo)). +* Backported in [#60791](https://github.com/ClickHouse/ClickHouse/issues/60791): Fix buffer overflow that can happen if the attacker asks the HTTP server to decompress data with a composition of codecs and size triggering numeric overflow. Fix buffer overflow that can happen inside codec NONE on wrong input data. This was submitted by TIANGONG research team through our [Bug Bounty program](https://github.com/ClickHouse/ClickHouse/issues/38986). [#60731](https://github.com/ClickHouse/ClickHouse/pull/60731) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Backported in [#60783](https://github.com/ClickHouse/ClickHouse/issues/60783): Functions for SQL/JSON were able to read uninitialized memory. This closes [#60017](https://github.com/ClickHouse/ClickHouse/issues/60017). Found by Fuzzer. [#60738](https://github.com/ClickHouse/ClickHouse/pull/60738) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Backported in [#60803](https://github.com/ClickHouse/ClickHouse/issues/60803): Do not set aws custom metadata `x-amz-meta-*` headers on UploadPart & CompleteMultipartUpload calls. [#60748](https://github.com/ClickHouse/ClickHouse/pull/60748) ([Francisco J. Jurado Moreno](https://github.com/Beetelbrox)). +* Backported in [#60820](https://github.com/ClickHouse/ClickHouse/issues/60820): Fix crash in arrayEnumerateRanked. [#60764](https://github.com/ClickHouse/ClickHouse/pull/60764) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#60841](https://github.com/ClickHouse/ClickHouse/issues/60841): Fix crash when using input() in INSERT SELECT JOIN. Closes [#60035](https://github.com/ClickHouse/ClickHouse/issues/60035). [#60765](https://github.com/ClickHouse/ClickHouse/pull/60765) ([Kruglov Pavel](https://github.com/Avogar)). +* Backported in [#60904](https://github.com/ClickHouse/ClickHouse/issues/60904): Avoid segfault if too many keys are skipped when reading from S3. [#60849](https://github.com/ClickHouse/ClickHouse/pull/60849) ([Antonio Andelic](https://github.com/antonio2368)). 
+ +#### NO CL CATEGORY + +* Backported in [#60186](https://github.com/ClickHouse/ClickHouse/issues/60186):. [#60181](https://github.com/ClickHouse/ClickHouse/pull/60181) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Backported in [#60333](https://github.com/ClickHouse/ClickHouse/issues/60333): CI: Fix job failures due to jepsen artifacts. [#59890](https://github.com/ClickHouse/ClickHouse/pull/59890) ([Max K.](https://github.com/maxknv)). +* Backported in [#60034](https://github.com/ClickHouse/ClickHouse/issues/60034): Fix mark release ready. [#59994](https://github.com/ClickHouse/ClickHouse/pull/59994) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#60326](https://github.com/ClickHouse/ClickHouse/issues/60326): Ability to detect undead ZooKeeper sessions. [#60044](https://github.com/ClickHouse/ClickHouse/pull/60044) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Backported in [#60363](https://github.com/ClickHouse/ClickHouse/issues/60363): CI: hot fix for gh statuses. [#60201](https://github.com/ClickHouse/ClickHouse/pull/60201) ([Max K.](https://github.com/maxknv)). +* Backported in [#60648](https://github.com/ClickHouse/ClickHouse/issues/60648): Detect io_uring in tests. [#60373](https://github.com/ClickHouse/ClickHouse/pull/60373) ([Azat Khuzhin](https://github.com/azat)). +* Backported in [#60569](https://github.com/ClickHouse/ClickHouse/issues/60569): Remove broken test while we fix it. [#60547](https://github.com/ClickHouse/ClickHouse/pull/60547) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#60756](https://github.com/ClickHouse/ClickHouse/issues/60756): Update shellcheck. [#60553](https://github.com/ClickHouse/ClickHouse/pull/60553) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#60584](https://github.com/ClickHouse/ClickHouse/issues/60584): CI: fix docker build job name. [#60554](https://github.com/ClickHouse/ClickHouse/pull/60554) ([Max K.](https://github.com/maxknv)). + diff --git a/docs/changelogs/v24.4.2.141-stable.md b/docs/changelogs/v24.4.2.141-stable.md new file mode 100644 index 00000000000..656d0854392 --- /dev/null +++ b/docs/changelogs/v24.4.2.141-stable.md @@ -0,0 +1,101 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.4.2.141-stable (9e23d27bd11) FIXME as compared to v24.4.1.2088-stable (6d4b31322d1) + +#### Improvement +* Backported in [#63467](https://github.com/ClickHouse/ClickHouse/issues/63467): Make rabbitmq nack broken messages. Closes [#45350](https://github.com/ClickHouse/ClickHouse/issues/45350). [#60312](https://github.com/ClickHouse/ClickHouse/pull/60312) ([Kseniia Sumarokova](https://github.com/kssenii)). + +#### Build/Testing/Packaging Improvement +* Backported in [#63612](https://github.com/ClickHouse/ClickHouse/issues/63612): The Dockerfile is reviewed by the docker official library in https://github.com/docker-library/official-images/pull/15846. [#63400](https://github.com/ClickHouse/ClickHouse/pull/63400) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Backported in [#64279](https://github.com/ClickHouse/ClickHouse/issues/64279): Fix queries with FINAL give wrong result when table does not use adaptive granularity. [#62432](https://github.com/ClickHouse/ClickHouse/pull/62432) ([Duc Canh Le](https://github.com/canhld94)). 
+* Backported in [#63295](https://github.com/ClickHouse/ClickHouse/issues/63295): Fix crash with untuple and unresolved lambda. [#63131](https://github.com/ClickHouse/ClickHouse/pull/63131) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#63978](https://github.com/ClickHouse/ClickHouse/issues/63978): Fix intersect parts when restart after drop range. [#63202](https://github.com/ClickHouse/ClickHouse/pull/63202) ([Han Fei](https://github.com/hanfei1991)). +* Backported in [#63413](https://github.com/ClickHouse/ClickHouse/issues/63413): Fix a misbehavior when SQL security defaults don't load for old tables during server startup. [#63209](https://github.com/ClickHouse/ClickHouse/pull/63209) ([pufit](https://github.com/pufit)). +* Backported in [#63388](https://github.com/ClickHouse/ClickHouse/issues/63388): JOIN filter push down filled join fix. Closes [#63228](https://github.com/ClickHouse/ClickHouse/issues/63228). [#63234](https://github.com/ClickHouse/ClickHouse/pull/63234) ([Maksim Kita](https://github.com/kitaisreal)). +* Backported in [#63618](https://github.com/ClickHouse/ClickHouse/issues/63618): Fix bug which could potentially lead to rare LOGICAL_ERROR during SELECT query with message: `Unexpected return type from materialize. Expected type_XXX. Got type_YYY.` Introduced in [#59379](https://github.com/ClickHouse/ClickHouse/issues/59379). [#63353](https://github.com/ClickHouse/ClickHouse/pull/63353) ([alesapin](https://github.com/alesapin)). +* Backported in [#63451](https://github.com/ClickHouse/ClickHouse/issues/63451): Fix `X-ClickHouse-Timezone` header returning wrong timezone when using `session_timezone` as query level setting. [#63377](https://github.com/ClickHouse/ClickHouse/pull/63377) ([Andrey Zvonov](https://github.com/zvonand)). +* Backported in [#63605](https://github.com/ClickHouse/ClickHouse/issues/63605): Fix backup of projection part in case projection was removed from table metadata, but part still has projection. [#63426](https://github.com/ClickHouse/ClickHouse/pull/63426) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#63510](https://github.com/ClickHouse/ClickHouse/issues/63510): Fix 'Every derived table must have its own alias' error for MYSQL dictionary source, close [#63341](https://github.com/ClickHouse/ClickHouse/issues/63341). [#63481](https://github.com/ClickHouse/ClickHouse/pull/63481) ([vdimir](https://github.com/vdimir)). +* Backported in [#63592](https://github.com/ClickHouse/ClickHouse/issues/63592): Avoid segafult in `MergeTreePrefetchedReadPool` while fetching projection parts. [#63513](https://github.com/ClickHouse/ClickHouse/pull/63513) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#63750](https://github.com/ClickHouse/ClickHouse/issues/63750): Read only the necessary columns from VIEW (new analyzer). Closes [#62594](https://github.com/ClickHouse/ClickHouse/issues/62594). [#63688](https://github.com/ClickHouse/ClickHouse/pull/63688) ([Maksim Kita](https://github.com/kitaisreal)). +* Backported in [#63772](https://github.com/ClickHouse/ClickHouse/issues/63772): Fix [#63539](https://github.com/ClickHouse/ClickHouse/issues/63539). Forbid WINDOW redefinition in new analyzer. [#63694](https://github.com/ClickHouse/ClickHouse/pull/63694) ([Dmitry Novik](https://github.com/novikd)). +* Backported in [#63872](https://github.com/ClickHouse/ClickHouse/issues/63872): Flatten_nested is broken with replicated database. 
[#63695](https://github.com/ClickHouse/ClickHouse/pull/63695) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#63854](https://github.com/ClickHouse/ClickHouse/issues/63854): Fix `Not found column` and `CAST AS Map from array requires nested tuple of 2 elements` exceptions for distributed queries which use `Map(Nothing, Nothing)` type. Fixes [#63637](https://github.com/ClickHouse/ClickHouse/issues/63637). [#63753](https://github.com/ClickHouse/ClickHouse/pull/63753) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#63847](https://github.com/ClickHouse/ClickHouse/issues/63847): Fix possible `ILLEGAL_COLUMN` error in `partial_merge` join, close [#37928](https://github.com/ClickHouse/ClickHouse/issues/37928). [#63755](https://github.com/ClickHouse/ClickHouse/pull/63755) ([vdimir](https://github.com/vdimir)). +* Backported in [#63908](https://github.com/ClickHouse/ClickHouse/issues/63908): `query_plan_remove_redundant_distinct` can break queries with WINDOW FUNCTIONS (with `allow_experimental_analyzer` is on). Fixes [#62820](https://github.com/ClickHouse/ClickHouse/issues/62820). [#63776](https://github.com/ClickHouse/ClickHouse/pull/63776) ([Igor Nikonov](https://github.com/devcrafter)). +* Backported in [#63955](https://github.com/ClickHouse/ClickHouse/issues/63955): Fix possible crash with SYSTEM UNLOAD PRIMARY KEY. [#63778](https://github.com/ClickHouse/ClickHouse/pull/63778) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#63938](https://github.com/ClickHouse/ClickHouse/issues/63938): Allow JOIN filter push down to both streams if only single equivalent column is used in query. Closes [#63799](https://github.com/ClickHouse/ClickHouse/issues/63799). [#63819](https://github.com/ClickHouse/ClickHouse/pull/63819) ([Maksim Kita](https://github.com/kitaisreal)). +* Backported in [#63991](https://github.com/ClickHouse/ClickHouse/issues/63991): Fix incorrect select query result when parallel replicas were used to read from a Materialized View. [#63861](https://github.com/ClickHouse/ClickHouse/pull/63861) ([Nikita Taranov](https://github.com/nickitat)). +* Backported in [#64033](https://github.com/ClickHouse/ClickHouse/issues/64033): Fix a error `Database name is empty` for remote queries with lambdas over the cluster with modified default database. Fixes [#63471](https://github.com/ClickHouse/ClickHouse/issues/63471). [#63864](https://github.com/ClickHouse/ClickHouse/pull/63864) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#64561](https://github.com/ClickHouse/ClickHouse/issues/64561): Fix SIGSEGV due to CPU/Real (`query_profiler_real_time_period_ns`/`query_profiler_cpu_time_period_ns`) profiler (has been an issue since 2022, that leads to periodic server crashes, especially if you were using distributed engine). [#63865](https://github.com/ClickHouse/ClickHouse/pull/63865) ([Azat Khuzhin](https://github.com/azat)). +* Backported in [#64011](https://github.com/ClickHouse/ClickHouse/issues/64011): Fix analyzer - IN function with arbitrary deep sub-selects in materialized view to use insertion block. [#63930](https://github.com/ClickHouse/ClickHouse/pull/63930) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Backported in [#64238](https://github.com/ClickHouse/ClickHouse/issues/64238): Fix resolve of unqualified COLUMNS matcher. Preserve the input columns order and forbid usage of unknown identifiers. 
[#63962](https://github.com/ClickHouse/ClickHouse/pull/63962) ([Dmitry Novik](https://github.com/novikd)). +* Backported in [#64103](https://github.com/ClickHouse/ClickHouse/issues/64103): Deserialize untrusted binary inputs in a safer way. [#64024](https://github.com/ClickHouse/ClickHouse/pull/64024) ([Robert Schulze](https://github.com/rschu1ze)). +* Backported in [#64170](https://github.com/ClickHouse/ClickHouse/issues/64170): Add missing settings to recoverLostReplica. [#64040](https://github.com/ClickHouse/ClickHouse/pull/64040) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#64322](https://github.com/ClickHouse/ClickHouse/issues/64322): This fix will use a proper redefined context with the correct definer for each individual view in the query pipeline Closes [#63777](https://github.com/ClickHouse/ClickHouse/issues/63777). [#64079](https://github.com/ClickHouse/ClickHouse/pull/64079) ([pufit](https://github.com/pufit)). +* Backported in [#64382](https://github.com/ClickHouse/ClickHouse/issues/64382): Fix analyzer: "Not found column" error is fixed when using INTERPOLATE. [#64096](https://github.com/ClickHouse/ClickHouse/pull/64096) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Backported in [#64568](https://github.com/ClickHouse/ClickHouse/issues/64568): Fix creating backups to S3 buckets with different credentials from the disk containing the file. [#64153](https://github.com/ClickHouse/ClickHouse/pull/64153) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#64272](https://github.com/ClickHouse/ClickHouse/issues/64272): Prevent LOGICAL_ERROR on CREATE TABLE as MaterializedView. [#64174](https://github.com/ClickHouse/ClickHouse/pull/64174) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#64330](https://github.com/ClickHouse/ClickHouse/issues/64330): The query cache now considers two identical queries against different databases as different. The previous behavior could be used to bypass missing privileges to read from a table. [#64199](https://github.com/ClickHouse/ClickHouse/pull/64199) ([Robert Schulze](https://github.com/rschu1ze)). +* Backported in [#64254](https://github.com/ClickHouse/ClickHouse/issues/64254): Ignore `text_log` config when using Keeper. [#64218](https://github.com/ClickHouse/ClickHouse/pull/64218) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#64690](https://github.com/ClickHouse/ClickHouse/issues/64690): Fix Query Tree size validation. Closes [#63701](https://github.com/ClickHouse/ClickHouse/issues/63701). [#64377](https://github.com/ClickHouse/ClickHouse/pull/64377) ([Dmitry Novik](https://github.com/novikd)). +* Backported in [#64409](https://github.com/ClickHouse/ClickHouse/issues/64409): Fix `Logical error: Bad cast` for `Buffer` table with `PREWHERE`. Fixes [#64172](https://github.com/ClickHouse/ClickHouse/issues/64172). [#64388](https://github.com/ClickHouse/ClickHouse/pull/64388) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#64727](https://github.com/ClickHouse/ClickHouse/issues/64727): Fixed `CREATE TABLE AS` queries for tables with default expressions. [#64455](https://github.com/ClickHouse/ClickHouse/pull/64455) ([Anton Popov](https://github.com/CurtizJ)). +* Backported in [#64623](https://github.com/ClickHouse/ClickHouse/issues/64623): Fix an error `Cannot find column` in distributed queries with constant CTE in the `GROUP BY` key. 
[#64519](https://github.com/ClickHouse/ClickHouse/pull/64519) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#64680](https://github.com/ClickHouse/ClickHouse/issues/64680): Fix [#64612](https://github.com/ClickHouse/ClickHouse/issues/64612). Do not rewrite aggregation if `-If` combinator is already used. [#64638](https://github.com/ClickHouse/ClickHouse/pull/64638) ([Dmitry Novik](https://github.com/novikd)). +* Backported in [#64942](https://github.com/ClickHouse/ClickHouse/issues/64942): Fix OrderByLimitByDuplicateEliminationVisitor across subqueries. [#64766](https://github.com/ClickHouse/ClickHouse/pull/64766) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#64871](https://github.com/ClickHouse/ClickHouse/issues/64871): Fixed memory possible incorrect memory tracking in several kinds of queries: queries that read any data from S3, queries via http protocol, asynchronous inserts. [#64844](https://github.com/ClickHouse/ClickHouse/pull/64844) ([Anton Popov](https://github.com/CurtizJ)). + +#### CI Fix or Improvement (changelog entry is not required) + +* Backported in [#63364](https://github.com/ClickHouse/ClickHouse/issues/63364): Implement cumulative A Sync status. [#61464](https://github.com/ClickHouse/ClickHouse/pull/61464) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#63338](https://github.com/ClickHouse/ClickHouse/issues/63338): Use `/commit/` to have the URLs in [reports](https://play.clickhouse.com/play?user=play#c2VsZWN0IGRpc3RpbmN0IGNvbW1pdF91cmwgZnJvbSBjaGVja3Mgd2hlcmUgY2hlY2tfc3RhcnRfdGltZSA+PSBub3coKSAtIGludGVydmFsIDEgbW9udGggYW5kIHB1bGxfcmVxdWVzdF9udW1iZXI9NjA1MzI=) like https://github.com/ClickHouse/ClickHouse/commit/44f8bc5308b53797bec8cccc3bd29fab8a00235d and not like https://github.com/ClickHouse/ClickHouse/commits/44f8bc5308b53797bec8cccc3bd29fab8a00235d. [#63331](https://github.com/ClickHouse/ClickHouse/pull/63331) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#63376](https://github.com/ClickHouse/ClickHouse/issues/63376):. [#63366](https://github.com/ClickHouse/ClickHouse/pull/63366) ([Aleksei Filatov](https://github.com/aalexfvk)). +* Backported in [#63571](https://github.com/ClickHouse/ClickHouse/issues/63571):. [#63551](https://github.com/ClickHouse/ClickHouse/pull/63551) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* Backported in [#63651](https://github.com/ClickHouse/ClickHouse/issues/63651): Fix 02362_part_log_merge_algorithm flaky test. [#63635](https://github.com/ClickHouse/ClickHouse/pull/63635) ([Miсhael Stetsyuk](https://github.com/mstetsyuk)). +* Backported in [#63828](https://github.com/ClickHouse/ClickHouse/issues/63828): Fix test_odbc_interaction from aarch64 [#61457](https://github.com/ClickHouse/ClickHouse/issues/61457). [#63787](https://github.com/ClickHouse/ClickHouse/pull/63787) ([alesapin](https://github.com/alesapin)). +* Backported in [#63897](https://github.com/ClickHouse/ClickHouse/issues/63897): Fix test `test_catboost_evaluate` for aarch64. [#61457](https://github.com/ClickHouse/ClickHouse/issues/61457). [#63789](https://github.com/ClickHouse/ClickHouse/pull/63789) ([alesapin](https://github.com/alesapin)). +* Backported in [#63889](https://github.com/ClickHouse/ClickHouse/issues/63889): Remove HDFS from disks config for one integration test for arm. [#61457](https://github.com/ClickHouse/ClickHouse/issues/61457). [#63832](https://github.com/ClickHouse/ClickHouse/pull/63832) ([alesapin](https://github.com/alesapin)). 
+* Backported in [#63881](https://github.com/ClickHouse/ClickHouse/issues/63881): Bump version for old image in test_short_strings_aggregation to make it work on arm. [#61457](https://github.com/ClickHouse/ClickHouse/issues/61457). [#63836](https://github.com/ClickHouse/ClickHouse/pull/63836) ([alesapin](https://github.com/alesapin)). +* Backported in [#63919](https://github.com/ClickHouse/ClickHouse/issues/63919): Disable test `test_non_default_compression/test.py::test_preconfigured_deflateqpl_codec` on arm. [#61457](https://github.com/ClickHouse/ClickHouse/issues/61457). [#63839](https://github.com/ClickHouse/ClickHouse/pull/63839) ([alesapin](https://github.com/alesapin)). +* Backported in [#63971](https://github.com/ClickHouse/ClickHouse/issues/63971): Fix 02124_insert_deduplication_token_multiple_blocks. [#63950](https://github.com/ClickHouse/ClickHouse/pull/63950) ([Han Fei](https://github.com/hanfei1991)). +* Backported in [#64049](https://github.com/ClickHouse/ClickHouse/issues/64049): Add `ClickHouseVersion.copy` method. Create a branch release in advance without spinning out the release to increase the stability. [#64039](https://github.com/ClickHouse/ClickHouse/pull/64039) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#64078](https://github.com/ClickHouse/ClickHouse/issues/64078): The mime type is not 100% reliable for Python and shell scripts without shebangs; add a check for file extension. [#64062](https://github.com/ClickHouse/ClickHouse/pull/64062) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#64161](https://github.com/ClickHouse/ClickHouse/issues/64161): Add retries in git submodule update. [#64125](https://github.com/ClickHouse/ClickHouse/pull/64125) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### Critical Bug Fix (crash, LOGICAL_ERROR, data loss, RBAC) + +* Backported in [#64589](https://github.com/ClickHouse/ClickHouse/issues/64589): Disabled `enable_vertical_final` setting by default. This feature should not be used because it has a bug: [#64543](https://github.com/ClickHouse/ClickHouse/issues/64543). [#64544](https://github.com/ClickHouse/ClickHouse/pull/64544) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Backported in [#64880](https://github.com/ClickHouse/ClickHouse/issues/64880): This PR fixes an error when a user in a specific situation can escalate their privileges on the default database without necessary grants. [#64769](https://github.com/ClickHouse/ClickHouse/pull/64769) ([pufit](https://github.com/pufit)). + +#### NO CL CATEGORY + +* Backported in [#63306](https://github.com/ClickHouse/ClickHouse/issues/63306):. [#63297](https://github.com/ClickHouse/ClickHouse/pull/63297) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Backported in [#63710](https://github.com/ClickHouse/ClickHouse/issues/63710):. [#63415](https://github.com/ClickHouse/ClickHouse/pull/63415) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). + +#### NO CL ENTRY + +* NO CL ENTRY: 'Revert "Backport [#64363](https://github.com/ClickHouse/ClickHouse/issues/64363) to 24.4: Split tests 03039_dynamic_all_merge_algorithms to avoid timeouts"'. [#64905](https://github.com/ClickHouse/ClickHouse/pull/64905) ([Raúl Marín](https://github.com/Algunenano)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* group_by_use_nulls strikes back [#62922](https://github.com/ClickHouse/ClickHouse/pull/62922) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). 
+* Add `FROM` keyword to `TRUNCATE ALL TABLES` [#63241](https://github.com/ClickHouse/ClickHouse/pull/63241) ([Yarik Briukhovetskyi](https://github.com/yariks5s)). +* More checks for concurrently deleted files and dirs in system.remote_data_paths [#63274](https://github.com/ClickHouse/ClickHouse/pull/63274) ([Alexander Gololobov](https://github.com/davenger)). +* Try fix segfault in `MergeTreeReadPoolBase::createTask` [#63323](https://github.com/ClickHouse/ClickHouse/pull/63323) ([Antonio Andelic](https://github.com/antonio2368)). +* Skip unaccessible table dirs in system.remote_data_paths [#63330](https://github.com/ClickHouse/ClickHouse/pull/63330) ([Alexander Gololobov](https://github.com/davenger)). +* Workaround for `oklch()` inside canvas bug for firefox [#63404](https://github.com/ClickHouse/ClickHouse/pull/63404) ([Sergei Trifonov](https://github.com/serxa)). +* Cancel S3 reads properly when parallel reads are used [#63687](https://github.com/ClickHouse/ClickHouse/pull/63687) ([Antonio Andelic](https://github.com/antonio2368)). +* Userspace page cache: don't collect stats if cache is unused [#63730](https://github.com/ClickHouse/ClickHouse/pull/63730) ([Michael Kolupaev](https://github.com/al13n321)). +* Fix sanitizers [#64090](https://github.com/ClickHouse/ClickHouse/pull/64090) ([Azat Khuzhin](https://github.com/azat)). +* Split tests 03039_dynamic_all_merge_algorithms to avoid timeouts [#64363](https://github.com/ClickHouse/ClickHouse/pull/64363) ([Kruglov Pavel](https://github.com/Avogar)). +* CI: Critical bugfix category in PR template [#64480](https://github.com/ClickHouse/ClickHouse/pull/64480) ([Max K.](https://github.com/maxknv)). + diff --git a/docs/en/engines/table-engines/integrations/azureBlobStorage.md b/docs/en/engines/table-engines/integrations/azureBlobStorage.md index 0843ff1ac47..dfc27d6b8cf 100644 --- a/docs/en/engines/table-engines/integrations/azureBlobStorage.md +++ b/docs/en/engines/table-engines/integrations/azureBlobStorage.md @@ -54,6 +54,7 @@ SELECT * FROM test_table; - `_path` — Path to the file. Type: `LowCardinalty(String)`. - `_file` — Name of the file. Type: `LowCardinalty(String)`. - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the size is unknown, the value is `NULL`. +- `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. ## See also diff --git a/docs/en/engines/table-engines/integrations/hdfs.md b/docs/en/engines/table-engines/integrations/hdfs.md index 2749fa7e479..c9df713231a 100644 --- a/docs/en/engines/table-engines/integrations/hdfs.md +++ b/docs/en/engines/table-engines/integrations/hdfs.md @@ -235,6 +235,7 @@ libhdfs3 support HDFS namenode HA. - `_path` — Path to the file. Type: `LowCardinalty(String)`. - `_file` — Name of the file. Type: `LowCardinalty(String)`. - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the size is unknown, the value is `NULL`. +- `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. 
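For a quick check of these virtual columns, one hedged sketch (the namenode address `hdfs://hdfs1:9000` and the file path are made up) is to select them explicitly through the `hdfs` table function:

```sql
-- List per-file metadata exposed as virtual columns; _size and _time are
-- Nullable and come back as NULL when the file system does not report them.
SELECT _path, _file, _size, _time
FROM hdfs('hdfs://hdfs1:9000/some_dir/*.csv', 'CSVWithNames')
LIMIT 5
```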
## Storage Settings {#storage-settings} diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md index cb1da1c8e68..93f4a187656 100644 --- a/docs/en/engines/table-engines/integrations/s3.md +++ b/docs/en/engines/table-engines/integrations/s3.md @@ -53,14 +53,14 @@ For partitioning by month, use the `toYYYYMM(date_column)` expression, where `da This example uses the [docker compose recipe](https://github.com/ClickHouse/examples/tree/5fdc6ff72f4e5137e23ea075c88d3f44b0202490/docker-compose-recipes/recipes/ch-and-minio-S3), which integrates ClickHouse and MinIO. You should be able to reproduce the same queries using S3 by replacing the endpoint and authentication values. -Notice that the S3 endpoint in the `ENGINE` configuration uses the parameter token `{_partition_id}` as part of the S3 object (filename), and that the SELECT queries select against those resulting object names (e.g., `test_3.csv`). +Notice that the S3 endpoint in the `ENGINE` configuration uses the parameter token `{_partition_id}` as part of the S3 object (filename), and that the SELECT queries select against those resulting object names (e.g., `test_3.csv`). :::note As shown in the example, querying from S3 tables that are partitioned is not directly supported at this time, but can be accomplished by querying the individual partitions using the S3 table function. -The primary use-case for writing +The primary use-case for writing partitioned data in S3 is to enable transferring that data into another ClickHouse system (for example, moving from on-prem systems to ClickHouse Cloud). Because ClickHouse datasets are often very large, and network @@ -78,9 +78,9 @@ CREATE TABLE p ) ENGINE = S3( # highlight-next-line - 'http://minio:10000/clickhouse//test_{_partition_id}.csv', - 'minioadmin', - 'minioadminpassword', + 'http://minio:10000/clickhouse//test_{_partition_id}.csv', + 'minioadmin', + 'minioadminpassword', 'CSV') PARTITION BY column3 ``` @@ -145,6 +145,7 @@ Code: 48. DB::Exception: Received from localhost:9000. DB::Exception: Reading fr - `_path` — Path to the file. Type: `LowCardinalty(String)`. - `_file` — Name of the file. Type: `LowCardinalty(String)`. - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the size is unknown, the value is `NULL`. +- `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. For more information about virtual columns see [here](../../../engines/table-engines/index.md#table_engines-virtual_columns). diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 0e53d7525b4..f0c4e1b0e34 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -102,7 +102,7 @@ Type of the rule `DELETE|TO DISK 'xxx'|TO VOLUME 'xxx'|GROUP BY` specifies an ac For more details, see [TTL for columns and tables](#table_engine-mergetree-ttl) -#### Settings +#### SETTINGS See [MergeTree Settings](../../../operations/settings/merge-tree-settings.md). diff --git a/docs/en/engines/table-engines/special/file.md b/docs/en/engines/table-engines/special/file.md index 0d422f64762..957b18b5305 100644 --- a/docs/en/engines/table-engines/special/file.md +++ b/docs/en/engines/table-engines/special/file.md @@ -102,6 +102,7 @@ For partitioning by month, use the `toYYYYMM(date_column)` expression, where `da - `_path` — Path to the file. 
Type: `LowCardinalty(String)`. - `_file` — Name of the file. Type: `LowCardinalty(String)`. - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the size is unknown, the value is `NULL`. +- `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. ## Settings {#settings} diff --git a/docs/en/engines/table-engines/special/url.md b/docs/en/engines/table-engines/special/url.md index f6183a779ae..c906830d0e9 100644 --- a/docs/en/engines/table-engines/special/url.md +++ b/docs/en/engines/table-engines/special/url.md @@ -108,6 +108,7 @@ For partitioning by month, use the `toYYYYMM(date_column)` expression, where `da - `_path` — Path to the `URL`. Type: `LowCardinalty(String)`. - `_file` — Resource name of the `URL`. Type: `LowCardinalty(String)`. - `_size` — Size of the resource in bytes. Type: `Nullable(UInt64)`. If the size is unknown, the value is `NULL`. +- `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. ## Storage Settings {#storage-settings} diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 66d5bd2e574..fdbfb742a10 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -480,7 +480,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe - [input_format_csv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_csv_detect_header) - automatically detect header with names and types in CSV format. Default value - `true`. - [input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`. - [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`. -- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/# input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`. +- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`. - [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - allow variable number of columns in CSV format, ignore extra columns and use default values on missing columns. Default value - `false`. - [input_format_csv_use_default_on_bad_values](/docs/en/operations/settings/settings-formats.md/#input_format_csv_use_default_on_bad_values) - Allow to set default value to column when CSV field deserialization failed on bad value. Default value - `false`. - [input_format_csv_try_infer_numbers_from_strings](/docs/en/operations/settings/settings-formats.md/#input_format_csv_try_infer_numbers_from_strings) - Try to infer numbers from string fields while schema inference. Default value - `false`. 
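As a hedged illustration of the whitespace-delimiter setting mentioned above (the inline data, column names, and the need to also set `format_csv_delimiter` are assumptions, not part of this patch):

```sql
-- Parse space-separated CSV inline; the allow_* setting permits using
-- a whitespace character as the CSV field delimiter.
SELECT *
FROM format(CSV, 'id UInt32, word String', '1 hello\n2 world')
SETTINGS format_csv_delimiter = ' ',
         input_format_csv_allow_whitespace_or_tab_as_delimiter = 1
```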
@@ -2165,6 +2165,8 @@ To exchange data with Hadoop, you can use [HDFS table engine](/docs/en/engines/t - [output_format_parquet_fixed_string_as_fixed_byte_array](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_fixed_string_as_fixed_byte_array) - use Parquet FIXED_LENGTH_BYTE_ARRAY type instead of Binary/String for FixedString columns. Default value - `true`. - [output_format_parquet_version](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_version) - The version of Parquet format used in output format. Default value - `2.latest`. - [output_format_parquet_compression_method](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_compression_method) - compression method used in output Parquet format. Default value - `lz4`. +- [input_format_parquet_max_block_size](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_max_block_size) - Max block row size for parquet reader. Default value - `65409`. +- [input_format_parquet_prefer_block_bytes](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_prefer_block_bytes) - Average block bytes output by parquet reader. Default value - `16744704`. ## ParquetMetadata {data-format-parquet-metadata} diff --git a/docs/en/operations/named-collections.md b/docs/en/operations/named-collections.md index 1d261d935af..91438cfb675 100644 --- a/docs/en/operations/named-collections.md +++ b/docs/en/operations/named-collections.md @@ -67,6 +67,23 @@ To manage named collections with DDL a user must have the `named_control_collect In the above example the `password_sha256_hex` value is the hexadecimal representation of the SHA256 hash of the password. This configuration for the user `default` has the attribute `replace=true` as in the default configuration has a plain text `password` set, and it is not possible to have both plain text and sha256 hex passwords set for a user. ::: +### Storage for named collections + +Named collections can either be stored on local disk or in zookeeper/keeper. By default local storage is used. + +To configure named collections storage in keeper, add a `type` (equal to either `keeper` or `zookeeper`) and a `path` (the path in keeper where named collections will be stored) to the `named_collections_storage` section of the configuration file:
+```
+<clickhouse>
+    <named_collections_storage>
+        <type>zookeeper</type>
+        <path>/named_collections_path/</path>
+        <update_timeout_ms>1000</update_timeout_ms>
+    </named_collections_storage>
+</clickhouse>
+```
+ +An optional configuration parameter `update_timeout_ms` by default is equal to `5000`. + ## Storing named collections in configuration files ### XML example diff --git a/docs/en/operations/settings/merge-tree-settings.md b/docs/en/operations/settings/merge-tree-settings.md index d791683ac2b..b45dc290797 100644 --- a/docs/en/operations/settings/merge-tree-settings.md +++ b/docs/en/operations/settings/merge-tree-settings.md @@ -974,10 +974,12 @@ Default value: false - [exclude_deleted_rows_for_part_size_in_merge](#exclude_deleted_rows_for_part_size_in_merge) setting -### allow_experimental_optimized_row_order +### optimize_row_order Controls if the row order should be optimized during inserts to improve the compressability of the newly inserted table part. +Only has an effect for ordinary MergeTree-engine tables. Does nothing for specialized MergeTree engine tables (e.g. CollapsingMergeTree). + MergeTree tables are (optionally) compressed using [compression codecs](../../sql-reference/statements/create/table.md#column_compression_codec). Generic compression codecs such as LZ4 and ZSTD achieve maximum compression rates if the data exposes patterns.
Long runs of the same value typically compress very well. diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index 1a27b350652..6aae1ea62e5 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -1417,6 +1417,17 @@ Compression method used in output Parquet format. Supported codecs: `snappy`, `l Default value: `lz4`. +### input_format_parquet_max_block_size {#input_format_parquet_max_block_size} +Maximum block size in rows for the Parquet reader. By controlling the number of rows in each block, you can control memory usage, +and for operators that cache blocks, you can improve the accuracy of their memory control. + +Default value: `65409`. + +### input_format_parquet_prefer_block_bytes {#input_format_parquet_prefer_block_bytes} +Average block size in bytes output by the Parquet reader. Lowering this value relieves memory pressure when reading highly compressed Parquet files. + +Default value: `65409 * 256 = 16744704`. + ## Hive format settings {#hive-format-settings} ### input_format_hive_text_fields_delimiter {#input_format_hive_text_fields_delimiter} diff --git a/docs/en/operations/utilities/odbc-bridge.md b/docs/en/operations/utilities/odbc-bridge.md index abb8860880e..eb849c6b6ae 100644 --- a/docs/en/operations/utilities/odbc-bridge.md +++ b/docs/en/operations/utilities/odbc-bridge.md @@ -18,7 +18,7 @@ This tool works via HTTP, not via pipes, shared memory, or TCP because: However it can be used as standalone tool from command line with the following parameters in POST-request URL: - `connection_string` -- ODBC connection string. -- `columns` -- columns in ClickHouse NamesAndTypesList format, name in backticks, +- `sample_block` -- columns description in ClickHouse NamesAndTypesList format, name in backticks, type as string. Name and type are space separated, rows separated with newline. - `max_block_size` -- optional parameter, sets maximum size of single block. diff --git a/docs/en/sql-reference/aggregate-functions/combinators.md b/docs/en/sql-reference/aggregate-functions/combinators.md index 8ccc5e292b5..e30aa66b3b3 100644 --- a/docs/en/sql-reference/aggregate-functions/combinators.md +++ b/docs/en/sql-reference/aggregate-functions/combinators.md @@ -106,8 +106,8 @@ To work with these states, use: - [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) table engine. - [finalizeAggregation](../../sql-reference/functions/other-functions.md#function-finalizeaggregation) function. - [runningAccumulate](../../sql-reference/functions/other-functions.md#runningaccumulate) function. -- [-Merge](#aggregate_functions_combinators-merge) combinator. -- [-MergeState](#aggregate_functions_combinators-mergestate) combinator. +- [-Merge](#-merge) combinator. +- [-MergeState](#-mergestate) combinator. ## -Merge diff --git a/docs/en/sql-reference/aggregate-functions/parametric-functions.md b/docs/en/sql-reference/aggregate-functions/parametric-functions.md index 1dc89b8dcf9..093d88f939f 100644 --- a/docs/en/sql-reference/aggregate-functions/parametric-functions.md +++ b/docs/en/sql-reference/aggregate-functions/parametric-functions.md @@ -82,10 +82,12 @@ FROM In this case, you should remember that you do not know the histogram bin borders. -## sequenceMatch(pattern)(timestamp, cond1, cond2, ...) +## sequenceMatch Checks whether the sequence contains an event chain that matches the pattern.
+**Syntax** + ``` sql sequenceMatch(pattern)(timestamp, cond1, cond2, ...) ``` @@ -102,7 +104,7 @@ Events that occur at the same second may lay in the sequence in an undefined ord **Parameters** -- `pattern` — Pattern string. See [Pattern syntax](#sequence-function-pattern-syntax). +- `pattern` — Pattern string. See [Pattern syntax](#sequencematch). **Returned values** @@ -170,9 +172,9 @@ SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM **See Also** -- [sequenceCount](#function-sequencecount) +- [sequenceCount](#sequencecount) -## sequenceCount(pattern)(time, cond1, cond2, ...) +## sequenceCount Counts the number of event chains that matched the pattern. The function searches event chains that do not overlap. It starts to search for the next chain after the current chain is matched. @@ -180,6 +182,8 @@ Counts the number of event chains that matched the pattern. The function searche Events that occur at the same second may lay in the sequence in an undefined order affecting the result. ::: +**Syntax** + ``` sql sequenceCount(pattern)(timestamp, cond1, cond2, ...) ``` @@ -192,7 +196,7 @@ sequenceCount(pattern)(timestamp, cond1, cond2, ...) **Parameters** -- `pattern` — Pattern string. See [Pattern syntax](#sequence-function-pattern-syntax). +- `pattern` — Pattern string. See [Pattern syntax](#sequencematch). **Returned values** @@ -229,7 +233,7 @@ SELECT sequenceCount('(?1).*(?2)')(time, number = 1, number = 2) FROM t **See Also** -- [sequenceMatch](#function-sequencematch) +- [sequenceMatch](#sequencematch) ## windowFunnel diff --git a/docs/en/sql-reference/aggregate-functions/reference/flame_graph.md b/docs/en/sql-reference/aggregate-functions/reference/flame_graph.md new file mode 100644 index 00000000000..ae17153085c --- /dev/null +++ b/docs/en/sql-reference/aggregate-functions/reference/flame_graph.md @@ -0,0 +1,95 @@ +--- +slug: /en/sql-reference/aggregate-functions/reference/flamegraph +sidebar_position: 110 +--- + +# flameGraph + +Aggregate function which builds a [flamegraph](https://www.brendangregg.com/flamegraphs.html) using the list of stacktraces. Outputs an array of strings which can be used by [flamegraph.pl utility](https://github.com/brendangregg/FlameGraph) to render an SVG of the flamegraph. + +## Syntax + +```sql +flameGraph(traces, [size], [ptr]) +``` + +## Parameters + +- `traces` — a stacktrace. [Array](../../data-types/array.md)([UInt64](../../data-types/int-uint.md)). +- `size` — an allocation size for memory profiling. (optional - default `1`). [UInt64](../../data-types/int-uint.md). +- `ptr` — an allocation address. (optional - default `0`). [UInt64](../../data-types/int-uint.md). + +:::note +In the case where `ptr != 0`, a flameGraph will map allocations (size > 0) and deallocations (size < 0) with the same size and ptr. +Only allocations which were not freed are shown. Non mapped deallocations are ignored. +::: + +## Returned value + +- An array of strings for use with [flamegraph.pl utility](https://github.com/brendangregg/FlameGraph). [Array](../../data-types/array.md)([String](../../data-types/string.md)). 
+ +## Examples + +### Building a flamegraph based on a CPU query profiler + +```sql +SET query_profiler_cpu_time_period_ns=10000000; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +``` + +```text +clickhouse client --allow_introspection_functions=1 -q "select arrayJoin(flameGraph(arrayReverse(trace))) from system.trace_log where trace_type = 'CPU' and query_id = 'xxx'" | ~/dev/FlameGraph/flamegraph.pl > flame_cpu.svg +``` + +### Building a flamegraph based on a memory query profiler, showing all allocations + +```sql +SET memory_profiler_sample_probability=1, max_untracked_memory=1; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +``` + +```text +clickhouse client --allow_introspection_functions=1 -q "select arrayJoin(flameGraph(trace, size)) from system.trace_log where trace_type = 'MemorySample' and query_id = 'xxx'" | ~/dev/FlameGraph/flamegraph.pl --countname=bytes --color=mem > flame_mem.svg +``` + +### Building a flamegraph based on a memory query profiler, showing allocations which were not deallocated in query context + +```sql +SET memory_profiler_sample_probability=1, max_untracked_memory=1, use_uncompressed_cache=1, merge_tree_max_rows_to_use_cache=100000000000, merge_tree_max_bytes_to_use_cache=1000000000000; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +``` + +```text +clickhouse client --allow_introspection_functions=1 -q "SELECT arrayJoin(flameGraph(trace, size, ptr)) FROM system.trace_log WHERE trace_type = 'MemorySample' AND query_id = 'xxx'" | ~/dev/FlameGraph/flamegraph.pl --countname=bytes --color=mem > flame_mem_untracked.svg +``` + +### Build a flamegraph based on memory query profiler, showing active allocations at the fixed point of time + +```sql +SET memory_profiler_sample_probability=1, max_untracked_memory=1; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +``` + +- 1 - Memory usage per second + +```sql +SELECT event_time, m, formatReadableSize(max(s) as m) FROM (SELECT event_time, sum(size) OVER (ORDER BY event_time) AS s FROM system.trace_log WHERE query_id = 'xxx' AND trace_type = 'MemorySample') GROUP BY event_time ORDER BY event_time; +``` + +- 2 - Find a time point with maximal memory usage + +```sql +SELECT argMax(event_time, s), max(s) FROM (SELECT event_time, sum(size) OVER (ORDER BY event_time) AS s FROM system.trace_log WHERE query_id = 'xxx' AND trace_type = 'MemorySample'); +``` + +- 3 - Fix active allocations at fixed point of time + +```text +clickhouse client --allow_introspection_functions=1 -q "SELECT arrayJoin(flameGraph(trace, size, ptr)) FROM (SELECT * FROM system.trace_log WHERE trace_type = 'MemorySample' AND query_id = 'xxx' AND event_time <= 'yyy' ORDER BY event_time)" | ~/dev/FlameGraph/flamegraph.pl --countname=bytes --color=mem > flame_mem_time_point_pos.svg +``` + +- 4 - Find deallocations at fixed point of time + +```text +clickhouse client --allow_introspection_functions=1 -q "SELECT arrayJoin(flameGraph(trace, -size, ptr)) FROM (SELECT * FROM system.trace_log WHERE trace_type = 'MemorySample' AND query_id = 'xxx' AND event_time > 'yyy' ORDER BY event_time desc)" | ~/dev/FlameGraph/flamegraph.pl --countname=bytes --color=mem > flame_mem_time_point_neg.svg +``` diff --git 
a/docs/en/sql-reference/aggregate-functions/reference/index.md b/docs/en/sql-reference/aggregate-functions/reference/index.md index a56b1c97681..e3725b6a430 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/index.md +++ b/docs/en/sql-reference/aggregate-functions/reference/index.md @@ -58,6 +58,7 @@ ClickHouse-specific aggregate functions: - [topKWeighted](../reference/topkweighted.md) - [deltaSum](../reference/deltasum.md) - [deltaSumTimestamp](../reference/deltasumtimestamp.md) +- [flameGraph](../reference/flame_graph.md) - [groupArray](../reference/grouparray.md) - [groupArrayLast](../reference/grouparraylast.md) - [groupUniqArray](../reference/groupuniqarray.md) diff --git a/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md b/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md index ddac82a0977..7ab9e1d3256 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md +++ b/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md @@ -3,7 +3,7 @@ slug: /en/sql-reference/aggregate-functions/reference/stochasticlinearregression sidebar_position: 221 --- -# stochasticLinearRegression +# stochasticLinearRegression {#agg_functions_stochasticlinearregression_parameters} This function implements stochastic linear regression. It supports custom parameters for learning rate, L2 regularization coefficient, mini-batch size, and has a few methods for updating weights ([Adam](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam) (used by default), [simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), and [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf)). @@ -72,5 +72,5 @@ The query will return a column of predicted values. Note that first argument of **See Also** -- [stochasticLogisticRegression](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md#agg_functions-stochasticlogisticregression) +- [stochasticLogisticRegression](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md#stochasticlogisticregression) - [Difference between linear and logistic regressions](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) diff --git a/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md b/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md index 0a040689681..4bf5529ddcb 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md +++ b/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md @@ -11,7 +11,7 @@ This function implements stochastic logistic regression. It can be used for bina Parameters are exactly the same as in stochasticLinearRegression: `learning rate`, `l2 regularization coefficient`, `mini-batch size`, `method for updating weights`. -For more information see [parameters](#agg_functions-stochasticlinearregression-parameters). +For more information see [parameters](../reference/stochasticlinearregression.md/#parameters). 
``` text stochasticLogisticRegression(1.0, 1.0, 10, 'SGD') diff --git a/docs/en/sql-reference/aggregate-functions/reference/varpop.md b/docs/en/sql-reference/aggregate-functions/reference/varpop.md index fcabeb4c6a8..4e010248f6e 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/varpop.md +++ b/docs/en/sql-reference/aggregate-functions/reference/varpop.md @@ -27,7 +27,7 @@ Returns an integer of type `Float64`. **Implementation details** -This function uses a numerically unstable algorithm. If you need numerical stability in calculations, use the slower but more stable [`varPopStable` function](#varPopStable). +This function uses a numerically unstable algorithm. If you need numerical stability in calculations, use the slower but more stable [`varPopStable`](#varpopstable) function. **Example** @@ -76,7 +76,7 @@ Returns an integer of type `Float64`. **Implementation details** -Unlike [`varPop()`](#varPop), this function uses a stable, numerically accurate algorithm to calculate the population variance to avoid issues like catastrophic cancellation or loss of precision. This function also handles `NaN` and `Inf` values correctly, excluding them from calculations. +Unlike [`varPop`](#varpop), this function uses a stable, numerically accurate algorithm to calculate the population variance to avoid issues like catastrophic cancellation or loss of precision. This function also handles `NaN` and `Inf` values correctly, excluding them from calculations. **Example** diff --git a/docs/en/sql-reference/aggregate-functions/reference/varsamp.md b/docs/en/sql-reference/aggregate-functions/reference/varsamp.md index be669a16ae8..bd1cfa5742a 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/varsamp.md +++ b/docs/en/sql-reference/aggregate-functions/reference/varsamp.md @@ -40,7 +40,7 @@ Where: The function assumes that the input data set represents a sample from a larger population. If you want to calculate the variance of the entire population (when you have the complete data set), you should use the [`varPop()` function](./varpop#varpop) instead. -This function uses a numerically unstable algorithm. If you need numerical stability in calculations, use the slower but more stable [`varSampStable` function](#varSampStable). +This function uses a numerically unstable algorithm. If you need numerical stability in calculations, use the slower but more stable [`varSampStable`](#varsampstable) function. **Example** @@ -82,11 +82,11 @@ varSampStable(expr) **Returned value** -The `varSampStable()` function returns a Float64 value representing the sample variance of the input data set. +The `varSampStable` function returns a Float64 value representing the sample variance of the input data set. **Implementation details** -The `varSampStable()` function calculates the sample variance using the same formula as the [`varSamp()`](#varSamp function): +The `varSampStable` function calculates the sample variance using the same formula as the [`varSamp`](#varsamp) function: ```plaintext ∑(x - mean(x))^2 / (n - 1) @@ -97,9 +97,9 @@ Where: - `mean(x)` is the arithmetic mean of the data set. - `n` is the number of data points in the data set. -The difference between `varSampStable()` and `varSamp()` is that `varSampStable()` is designed to provide a more deterministic and stable result when dealing with floating-point arithmetic. 
It uses an algorithm that minimizes the accumulation of rounding errors, which can be particularly important when dealing with large data sets or data with a wide range of values. +The difference between `varSampStable` and `varSamp` is that `varSampStable` is designed to provide a more deterministic and stable result when dealing with floating-point arithmetic. It uses an algorithm that minimizes the accumulation of rounding errors, which can be particularly important when dealing with large data sets or data with a wide range of values. -Like `varSamp()`, the `varSampStable()` function assumes that the input data set represents a sample from a larger population. If you want to calculate the variance of the entire population (when you have the complete data set), you should use the [`varPopStable()` function](./varpop#varpopstable) instead. +Like `varSamp`, the `varSampStable` function assumes that the input data set represents a sample from a larger population. If you want to calculate the variance of the entire population (when you have the complete data set), you should use the [`varPopStable`](./varpop#varpopstable) function instead. **Example** @@ -125,4 +125,4 @@ Response: 0.865 ``` -This query calculates the sample variance of the `value` column in the `example_table` using the `varSampStable()` function. The result shows that the sample variance of the values `[10.5, 12.3, 9.8, 11.2, 10.7]` is approximately 0.865, which may differ slightly from the result of `varSamp()` due to the more precise handling of floating-point arithmetic. +This query calculates the sample variance of the `value` column in the `example_table` using the `varSampStable()` function. The result shows that the sample variance of the values `[10.5, 12.3, 9.8, 11.2, 10.7]` is approximately 0.865, which may differ slightly from the result of `varSamp` due to the more precise handling of floating-point arithmetic. diff --git a/docs/en/sql-reference/data-types/geo.md b/docs/en/sql-reference/data-types/geo.md index 7e3c32b3451..7ffc7447d96 100644 --- a/docs/en/sql-reference/data-types/geo.md +++ b/docs/en/sql-reference/data-types/geo.md @@ -33,7 +33,7 @@ Result: ## Ring -`Ring` is a simple polygon without holes stored as an array of points: [Array](array.md)([Point](#point-data-type)). +`Ring` is a simple polygon without holes stored as an array of points: [Array](array.md)([Point](#point)). **Example** @@ -54,7 +54,7 @@ Result: ## Polygon -`Polygon` is a polygon with holes stored as an array of rings: [Array](array.md)([Ring](#ring-data-type)). First element of outer array is the outer shape of polygon and all the following elements are holes. +`Polygon` is a polygon with holes stored as an array of rings: [Array](array.md)([Ring](#ring)). First element of outer array is the outer shape of polygon and all the following elements are holes. **Example** @@ -76,7 +76,7 @@ Result: ## MultiPolygon -`MultiPolygon` consists of multiple polygons and is stored as an array of polygons: [Array](array.md)([Polygon](#polygon-data-type)). +`MultiPolygon` consists of multiple polygons and is stored as an array of polygons: [Array](array.md)([Polygon](#polygon)). 
**Example** diff --git a/docs/en/sql-reference/dictionaries/index.md b/docs/en/sql-reference/dictionaries/index.md index 080de94f8b7..4c7421d57c0 100644 --- a/docs/en/sql-reference/dictionaries/index.md +++ b/docs/en/sql-reference/dictionaries/index.md @@ -16,7 +16,7 @@ ClickHouse supports special functions for working with dictionaries that can be ClickHouse supports: - Dictionaries with a [set of functions](../../sql-reference/functions/ext-dict-functions.md). -- [Embedded dictionaries](#embedded_dictionaries) with a specific [set of functions](../../sql-reference/functions/ym-dict-functions.md). +- [Embedded dictionaries](#embedded-dictionaries) with a specific [set of functions](../../sql-reference/functions/ym-dict-functions.md). :::tip Tutorial @@ -82,7 +82,7 @@ You can [configure](#configuring-a-dictionary) any number of dictionaries in the You can convert values for a small dictionary by describing it in a `SELECT` query (see the [transform](../../sql-reference/functions/other-functions.md) function). This functionality is not related to dictionaries. ::: -## Configuring a Dictionary {#configuring-a-dictionary} +## Configuring a Dictionary @@ -123,7 +123,7 @@ LAYOUT(...) -- Memory layout configuration LIFETIME(...) -- Lifetime of dictionary in memory ``` -## Storing Dictionaries in Memory {#storing-dictionaries-in-memory} +## Storing Dictionaries in Memory There are a variety of ways to store dictionaries in memory. @@ -415,7 +415,7 @@ or LAYOUT(COMPLEX_KEY_HASHED_ARRAY([SHARDS 1])) ``` -### range_hashed {#range_hashed} +### range_hashed The dictionary is stored in memory in the form of a hash table with an ordered array of ranges and their corresponding values. @@ -679,7 +679,7 @@ When searching for a dictionary, the cache is searched first. For each block of If keys are not found in dictionary, then update cache task is created and added into update queue. Update queue properties can be controlled with settings `max_update_queue_size`, `update_queue_push_timeout_milliseconds`, `query_wait_timeout_milliseconds`, `max_threads_for_updates`. -For cache dictionaries, the expiration [lifetime](#dictionary-updates) of data in the cache can be set. If more time than `lifetime` has passed since loading the data in a cell, the cell’s value is not used and key becomes expired. The key is re-requested the next time it needs to be used. This behaviour can be configured with setting `allow_read_expired_keys`. +For cache dictionaries, the expiration [lifetime](#refreshing-dictionary-data-using-lifetime) of data in the cache can be set. If more time than `lifetime` has passed since loading the data in a cell, the cell’s value is not used and key becomes expired. The key is re-requested the next time it needs to be used. This behaviour can be configured with setting `allow_read_expired_keys`. This is the least effective of all the ways to store dictionaries. The speed of the cache depends strongly on correct settings and the usage scenario. A cache type dictionary performs well only when the hit rates are high enough (recommended 99% and higher). You can view the average hit rate in the [system.dictionaries](../../operations/system-tables/dictionaries.md) table. @@ -899,7 +899,7 @@ Other types are not supported yet. The function returns the attribute for the pr Data must completely fit into RAM. -## Refreshing dictionary data using LIFETIME {#lifetime} +## Refreshing dictionary data using LIFETIME ClickHouse periodically updates dictionaries based on the `LIFETIME` tag (defined in seconds). 
`LIFETIME` is the update interval for fully downloaded dictionaries and the invalidation interval for cached dictionaries. @@ -1031,7 +1031,7 @@ SOURCE(CLICKHOUSE(... update_field 'added_time' update_lag 15)) ... ``` -## Dictionary Sources {#dictionary-sources} +## Dictionary Sources @@ -1065,7 +1065,7 @@ SOURCE(SOURCE_TYPE(param1 val1 ... paramN valN)) -- Source configuration The source is configured in the `source` section. -For source types [Local file](#local_file), [Executable file](#executable), [HTTP(s)](#https), [ClickHouse](#clickhouse) +For source types [Local file](#local-file), [Executable file](#executable-file), [HTTP(s)](#https), [ClickHouse](#clickhouse) optional settings are available: ``` xml @@ -1089,10 +1089,10 @@ SETTINGS(format_csv_allow_single_quotes = 0) Types of sources (`source_type`): -- [Local file](#local_file) -- [Executable File](#executable) -- [Executable Pool](#executable_pool) -- [HTTP(S)](#http) +- [Local file](#local-file) +- [Executable File](#executable-file) +- [Executable Pool](#executable-pool) +- [HTTP(S)](#https) - DBMS - [ODBC](#odbc) - [MySQL](#mysql) @@ -1102,7 +1102,7 @@ Types of sources (`source_type`): - [Cassandra](#cassandra) - [PostgreSQL](#postgresql) -### Local File {#local_file} +### Local File Example of settings: @@ -1132,9 +1132,9 @@ When a dictionary with source `FILE` is created via DDL command (`CREATE DICTION - [Dictionary function](../../sql-reference/table-functions/dictionary.md#dictionary-function) -### Executable File {#executable} +### Executable File -Working with executable files depends on [how the dictionary is stored in memory](#storig-dictionaries-in-memory). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request to the executable file’s STDIN. Otherwise, ClickHouse starts the executable file and treats its output as dictionary data. +Working with executable files depends on [how the dictionary is stored in memory](#storing-dictionaries-in-memory). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request to the executable file’s STDIN. Otherwise, ClickHouse starts the executable file and treats its output as dictionary data. Example of settings: @@ -1161,7 +1161,7 @@ Setting fields: That dictionary source can be configured only via XML configuration. Creating dictionaries with executable source via DDL is disabled; otherwise, the DB user would be able to execute arbitrary binaries on the ClickHouse node. -### Executable Pool {#executable_pool} +### Executable Pool Executable pool allows loading data from pool of processes. This source does not work with dictionary layouts that need to load all data from source. Executable pool works if the dictionary [is stored](#ways-to-store-dictionaries-in-memory) using `cache`, `complex_key_cache`, `ssd_cache`, `complex_key_ssd_cache`, `direct`, or `complex_key_direct` layouts. @@ -1196,9 +1196,9 @@ Setting fields: That dictionary source can be configured only via XML configuration. Creating dictionaries with executable source via DDL is disabled, otherwise, the DB user would be able to execute arbitrary binary on ClickHouse node. -### HTTP(S) {#https} +### HTTP(S) -Working with an HTTP(S) server depends on [how the dictionary is stored in memory](#storig-dictionaries-in-memory). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request via the `POST` method. 
+Working with an HTTP(S) server depends on [how the dictionary is stored in memory](#storing-dictionaries-in-memory). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request via the `POST` method. Example of settings: @@ -1285,7 +1285,7 @@ Setting fields: - `db` – Name of the database. Omit it if the database name is set in the `` parameters. - `table` – Name of the table and schema if exists. - `connection_string` – Connection string. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](#dictionary-updates). +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#refreshing-dictionary-data-using-lifetime). - `query` – The custom query. Optional parameter. :::note @@ -1575,7 +1575,7 @@ Setting fields: - `where` – The selection criteria. The syntax for conditions is the same as for `WHERE` clause in MySQL, for example, `id > 10 AND id < 20`. Optional parameter. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](#dictionary-updates). +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#refreshing-dictionary-data-using-lifetime). - `fail_on_connection_loss` – The configuration parameter that controls behavior of the server on connection loss. If `true`, an exception is thrown immediately if the connection between client and server was lost. If `false`, the ClickHouse server retries to execute the query three times before throwing an exception. Note that retrying leads to increased response times. Default value: `false`. @@ -1672,7 +1672,7 @@ Setting fields: - `db` – Name of the database. - `table` – Name of the table. - `where` – The selection criteria. May be omitted. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](#dictionary-updates). +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#refreshing-dictionary-data-using-lifetime). - `secure` - Use ssl for connection. - `query` – The custom query. Optional parameter. @@ -1849,7 +1849,7 @@ Setting fields: - `db` – Name of the database. - `table` – Name of the table. - `where` – The selection criteria. The syntax for conditions is the same as for `WHERE` clause in PostgreSQL. For example, `id > 10 AND id < 20`. Optional parameter. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](#dictionary-updates). +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#refreshing-dictionary-data-using-lifetime). - `query` – The custom query. Optional parameter. :::note @@ -1873,7 +1873,7 @@ LAYOUT(FLAT()) LIFETIME(0); ``` -## Dictionary Key and Fields {#dictionary-key-and-fields} +## Dictionary Key and Fields @@ -1963,7 +1963,7 @@ PRIMARY KEY Id ### Composite Key -The key can be a `tuple` from any types of fields. The [layout](#storig-dictionaries-in-memory) in this case must be `complex_key_hashed` or `complex_key_cache`. 
+The key can be a `tuple` from any types of fields. The [layout](#storing-dictionaries-in-memory) in this case must be `complex_key_hashed` or `complex_key_cache`. :::tip A composite key can consist of a single element. This makes it possible to use a string as the key, for instance. @@ -2030,17 +2030,17 @@ CREATE DICTIONARY somename ( Configuration fields: -| Tag | Description | Required | -|------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------| -| `name` | Column name. | Yes | -| `type` | ClickHouse data type: [UInt8](../../sql-reference/data-types/int-uint.md), [UInt16](../../sql-reference/data-types/int-uint.md), [UInt32](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md), [Int8](../../sql-reference/data-types/int-uint.md), [Int16](../../sql-reference/data-types/int-uint.md), [Int32](../../sql-reference/data-types/int-uint.md), [Int64](../../sql-reference/data-types/int-uint.md), [Float32](../../sql-reference/data-types/float.md), [Float64](../../sql-reference/data-types/float.md), [UUID](../../sql-reference/data-types/uuid.md), [Decimal32](../../sql-reference/data-types/decimal.md), [Decimal64](../../sql-reference/data-types/decimal.md), [Decimal128](../../sql-reference/data-types/decimal.md), [Decimal256](../../sql-reference/data-types/decimal.md),[Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md), [DateTime64](../../sql-reference/data-types/datetime64.md), [String](../../sql-reference/data-types/string.md), [Array](../../sql-reference/data-types/array.md).
ClickHouse tries to cast value from dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.
[Nullable](../../sql-reference/data-types/nullable.md) is currently supported for [Flat](#flat), [Hashed](#hashed), [ComplexKeyHashed](#complex_key_hashed), [Direct](#direct), [ComplexKeyDirect](#complex_key_direct), [RangeHashed](#range_hashed), Polygon, [Cache](#cache), [ComplexKeyCache](#complex_key_cache), [SSDCache](#ssd_cache), [SSDComplexKeyCache](#complex_key_ssd_cache) dictionaries. In [IPTrie](#ip_trie) dictionaries `Nullable` types are not supported. | Yes | -| `null_value` | Default value for a non-existing element.
In the example, it is an empty string. [NULL](../syntax.md#null) value can be used only for the `Nullable` types (see the previous line with types description). | Yes | -| `expression` | [Expression](../../sql-reference/syntax.md#expressions) that ClickHouse executes on the value.
The expression can be a column name in the remote SQL database. Thus, you can use it to create an alias for the remote column.

Default value: no expression. | No | -| `hierarchical` | If `true`, the attribute contains the value of a parent key for the current key. See [Hierarchical Dictionaries](#hierarchical-dictionaries).

Default value: `false`. | No | -| `injective` | Flag that shows whether the `id -> attribute` image is [injective](https://en.wikipedia.org/wiki/Injective_function).
If `true`, ClickHouse can automatically place after the `GROUP BY` clause the requests to dictionaries with injection. Usually it significantly reduces the amount of such requests.

Default value: `false`. | No | -| `is_object_id` | Flag that shows whether the query is executed for a MongoDB document by `ObjectID`.

Default value: `false`. +| Tag | Description | Required | +|------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------| +| `name` | Column name. | Yes | +| `type` | ClickHouse data type: [UInt8](../../sql-reference/data-types/int-uint.md), [UInt16](../../sql-reference/data-types/int-uint.md), [UInt32](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md), [Int8](../../sql-reference/data-types/int-uint.md), [Int16](../../sql-reference/data-types/int-uint.md), [Int32](../../sql-reference/data-types/int-uint.md), [Int64](../../sql-reference/data-types/int-uint.md), [Float32](../../sql-reference/data-types/float.md), [Float64](../../sql-reference/data-types/float.md), [UUID](../../sql-reference/data-types/uuid.md), [Decimal32](../../sql-reference/data-types/decimal.md), [Decimal64](../../sql-reference/data-types/decimal.md), [Decimal128](../../sql-reference/data-types/decimal.md), [Decimal256](../../sql-reference/data-types/decimal.md),[Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md), [DateTime64](../../sql-reference/data-types/datetime64.md), [String](../../sql-reference/data-types/string.md), [Array](../../sql-reference/data-types/array.md).
ClickHouse tries to cast value from dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.
[Nullable](../../sql-reference/data-types/nullable.md) is currently supported for [Flat](#flat), [Hashed](#hashed), [ComplexKeyHashed](#complex_key_hashed), [Direct](#direct), [ComplexKeyDirect](#complex_key_direct), [RangeHashed](#range_hashed), Polygon, [Cache](#cache), [ComplexKeyCache](#complex_key_cache), [SSDCache](#ssd_cache), [SSDComplexKeyCache](#complex_key_ssd_cache) dictionaries. In [IPTrie](#ip_trie) dictionaries `Nullable` types are not supported. | Yes | +| `null_value` | Default value for a non-existing element.
In the example, it is an empty string. [NULL](../syntax.md#null) value can be used only for the `Nullable` types (see the previous line with types description). | Yes | +| `expression` | [Expression](../../sql-reference/syntax.md#expressions) that ClickHouse executes on the value.
The expression can be a column name in the remote SQL database. Thus, you can use it to create an alias for the remote column.

Default value: no expression. | No | +| `hierarchical` | If `true`, the attribute contains the value of a parent key for the current key. See [Hierarchical Dictionaries](#hierarchical-dictionaries).

Default value: `false`. | No | +| `injective` | Flag that shows whether the `id -> attribute` image is [injective](https://en.wikipedia.org/wiki/Injective_function).
If `true`, ClickHouse can automatically place after the `GROUP BY` clause the requests to dictionaries with injection. Usually it significantly reduces the amount of such requests.

Default value: `false`. | No | +| `is_object_id` | Flag that shows whether the query is executed for a MongoDB document by `ObjectID`.

Default value: `false`. -## Hierarchical Dictionaries {#hierarchical-dictionaries} +## Hierarchical Dictionaries ClickHouse supports hierarchical dictionaries with a [numeric key](#numeric-key). @@ -2165,7 +2165,7 @@ Points can be specified as an array or a tuple of their coordinates. In the curr The user can upload their own data in all formats supported by ClickHouse. -There are 3 types of [in-memory storage](#storig-dictionaries-in-memory) available: +There are 3 types of [in-memory storage](#storing-dictionaries-in-memory) available: - `POLYGON_SIMPLE`. This is a naive implementation, where a linear pass through all polygons is made for each query, and membership is checked for each one without using additional indexes. @@ -2435,7 +2435,7 @@ LIFETIME(0) LAYOUT(regexp_tree); ``` -## Embedded Dictionaries {#embedded-dictionaries} +## Embedded Dictionaries diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 7b52fbff714..d87ca4a0fe7 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1261,7 +1261,7 @@ SELECT arraySort((x) -> -x, [1, 2, 3]) as res; └─────────┘ ``` -For each element of the source array, the lambda function returns the sorting key, that is, \[1 –\> -1, 2 –\> -2, 3 –\> -3\]. Since the `arraySort` function sorts the keys in ascending order, the result is \[3, 2, 1\]. Thus, the `(x) –> -x` lambda function sets the [descending order](#reverse-sort) in a sorting. +For each element of the source array, the lambda function returns the sorting key, that is, \[1 –\> -1, 2 –\> -2, 3 –\> -3\]. Since the `arraySort` function sorts the keys in ascending order, the result is \[3, 2, 1\]. Thus, the `(x) –> -x` lambda function sets the [descending order](#arrayreversesort) in a sorting. The lambda function can accept multiple arguments. In this case, you need to pass the `arraySort` function several arrays of identical length that the arguments of lambda function will correspond to. The resulting array will consist of elements from the first input array; elements from the next input array(s) specify the sorting keys. For example: @@ -1307,10 +1307,15 @@ To improve sorting efficiency, the [Schwartzian transform](https://en.wikipedia. Same as `arraySort` with additional `limit` argument allowing partial sorting. Returns an array of the same size as the original array where elements in range `[1..limit]` are sorted in ascending order. Remaining elements `(limit..N]` shall contain elements in unspecified order. -## arrayReverseSort(\[func,\] arr, ...) {#reverse-sort} +## arrayReverseSort Sorts the elements of the `arr` array in descending order. If the `func` function is specified, `arr` is sorted according to the result of the `func` function applied to the elements of the array, and then the sorted array is reversed. If `func` accepts multiple arguments, the `arrayReverseSort` function is passed several arrays that the arguments of `func` will correspond to. Detailed examples are shown at the end of `arrayReverseSort` description. +**Syntax** + +```sql +arrayReverseSort([func,] arr, ...) +``` Example of integer values sorting: ``` sql @@ -1907,10 +1912,16 @@ FROM numbers(1,10); - [arrayReduce](#arrayreduce) -## arrayReverse(arr) +## arrayReverse Returns an array of the same size as the original array containing the elements in reverse order. 
+**Syntax** + +```sql +arrayReverse(arr) +``` + Example: ``` sql diff --git a/docs/en/sql-reference/functions/bitmap-functions.md b/docs/en/sql-reference/functions/bitmap-functions.md index a5c8a663b71..d30c0f4dde4 100644 --- a/docs/en/sql-reference/functions/bitmap-functions.md +++ b/docs/en/sql-reference/functions/bitmap-functions.md @@ -74,7 +74,7 @@ bitmapSubsetInRange(bitmap, range_start, range_end) **Arguments** -- `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). +- `bitmap` – [Bitmap object](#bitmapbuild). - `range_start` – Start of the range (inclusive). [UInt32](../data-types/int-uint.md). - `range_end` – End of the range (exclusive). [UInt32](../data-types/int-uint.md). @@ -104,7 +104,7 @@ bitmapSubsetLimit(bitmap, range_start, cardinality_limit) **Arguments** -- `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). +- `bitmap` – [Bitmap object](#bitmapbuild). - `range_start` – Start of the range (inclusive). [UInt32](../data-types/int-uint.md). - `cardinality_limit` – Maximum cardinality of the subset. [UInt32](../data-types/int-uint.md). @@ -134,7 +134,7 @@ subBitmap(bitmap, offset, cardinality_limit) **Arguments** -- `bitmap` – The bitmap. [Bitmap object](#bitmap_functions-bitmapbuild). +- `bitmap` – The bitmap. [Bitmap object](#bitmapbuild). - `offset` – The position of the first element of the subset. [UInt32](../data-types/int-uint.md). - `cardinality_limit` – The maximum number of elements in the subset. [UInt32](../data-types/int-uint.md). @@ -162,7 +162,7 @@ bitmapContains(bitmap, needle) **Arguments** -- `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). +- `bitmap` – [Bitmap object](#bitmapbuild). - `needle` – Searched bit value. [UInt32](../data-types/int-uint.md). **Returned values** @@ -188,7 +188,7 @@ Result: Checks whether two bitmaps intersect. -If `bitmap2` contains exactly one element, consider using [bitmapContains](#bitmap_functions-bitmapcontains) instead as it works more efficiently. +If `bitmap2` contains exactly one element, consider using [bitmapContains](#bitmapcontains) instead as it works more efficiently. **Syntax** diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 4092c83954a..b532e0de8f0 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -83,7 +83,7 @@ Result: ``` ## makeDate32 -Like [makeDate](#makeDate) but produces a [Date32](../data-types/date32.md). +Like [makeDate](#makedate) but produces a [Date32](../data-types/date32.md). ## makeDateTime @@ -214,7 +214,7 @@ Result: **See also** -- [serverTimeZone](#serverTimeZone) +- [serverTimeZone](#servertimezone) ## serverTimeZone @@ -249,7 +249,7 @@ Result: **See also** -- [timeZone](#timeZone) +- [timeZone](#timezone) ## toTimeZone @@ -305,7 +305,7 @@ int32samoa: 1546300800 **See Also** -- [formatDateTime](#formatDateTime) - supports non-constant timezone. +- [formatDateTime](#formatdatetime) - supports non-constant timezone. - [toString](type-conversion-functions.md#tostring) - supports non-constant timezone. 
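For the toTimeZone section touched above, a small hedged example of converting the same instant between time zones (the timestamp and zone names are arbitrary):

```sql
-- Converts a UTC DateTime to another time zone; output values are illustrative.
SELECT
    toDateTime('2024-06-15 12:00:00', 'UTC') AS utc_time,
    toTimeZone(toDateTime('2024-06-15 12:00:00', 'UTC'), 'America/New_York') AS ny_time;
```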
## timeZoneOf @@ -1006,7 +1006,7 @@ toStartOfWeek(t[, mode[, timezone]]) **Arguments** - `t` - a [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md) -- `mode` - determines the first day of the week as described in the [toWeek()](date-time-functions#toweek) function +- `mode` - determines the first day of the week as described in the [toWeek()](#toweek) function - `timezone` - Optional parameter, it behaves like any other conversion function **Returned value** @@ -1049,7 +1049,7 @@ toLastDayOfWeek(t[, mode[, timezone]]) **Arguments** - `t` - a [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md) -- `mode` - determines the last day of the week as described in the [toWeek()](date-time-functions#toweek) function +- `mode` - determines the last day of the week as described in the [toWeek](#toweek) function - `timezone` - Optional parameter, it behaves like any other conversion function **Returned value** @@ -1719,7 +1719,7 @@ Result: **See Also** -- [fromDaysSinceYearZero](#fromDaysSinceYearZero) +- [fromDaysSinceYearZero](#fromdayssinceyearzero) ## fromDaysSinceYearZero @@ -1759,11 +1759,11 @@ Result: **See Also** -- [toDaysSinceYearZero](#toDaysSinceYearZero) +- [toDaysSinceYearZero](#todayssinceyearzero) ## fromDaysSinceYearZero32 -Like [fromDaysSinceYearZero](#fromDaysSinceYearZero) but returns a [Date32](../data-types/date32.md). +Like [fromDaysSinceYearZero](#fromdayssinceyearzero) but returns a [Date32](../data-types/date32.md). ## age @@ -1982,7 +1982,7 @@ Result: **See Also** -- [toStartOfInterval](#tostartofintervaldate_or_date_with_time-interval-x-unit--time_zone) +- [toStartOfInterval](#tostartofinterval) ## date\_add @@ -2055,7 +2055,7 @@ Result: **See Also** -- [addDate](#addDate) +- [addDate](#adddate) ## date\_sub @@ -2129,7 +2129,7 @@ Result: **See Also** -- [subDate](#subDate) +- [subDate](#subdate) ## timestamp\_add @@ -2310,7 +2310,7 @@ Alias: `SUBDATE` - [date_sub](#date_sub) -## now {#now} +## now Returns the current date and time at the moment of query analysis. The function is a constant expression. @@ -3609,7 +3609,7 @@ SELECT timeSlots(toDateTime64('1980-12-12 21:01:02.1234', 4, 'UTC'), toDecimal64 └───────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` -## formatDateTime {#formatDateTime} +## formatDateTime Formats a Time according to the given Format string. Format is a constant expression, so you cannot have multiple formats for a single result column. @@ -3734,10 +3734,9 @@ LIMIT 10 **See Also** -- [formatDateTimeInJodaSyntax](##formatDateTimeInJodaSyntax) +- [formatDateTimeInJodaSyntax](#formatdatetimeinjodasyntax) - -## formatDateTimeInJodaSyntax {#formatDateTimeInJodaSyntax} +## formatDateTimeInJodaSyntax Similar to formatDateTime, except that it formats datetime in Joda style instead of MySQL style. Refer to https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html. 
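As a hedged illustration of the Joda-style formatting described above (the timestamp is arbitrary; the pattern letters follow the linked Joda documentation):

```sql
-- Joda-style pattern letters (yyyy, MM, dd, HH, mm, ss) instead of MySQL-style specifiers.
SELECT formatDateTimeInJodaSyntax(toDateTime('2010-01-04 12:34:56'), 'yyyy-MM-dd HH:mm:ss');
```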
@@ -3902,11 +3901,11 @@ Result: **See Also** -- [fromUnixTimestampInJodaSyntax](##fromUnixTimestampInJodaSyntax) +- [fromUnixTimestampInJodaSyntax](#fromunixtimestampinjodasyntax) ## fromUnixTimestampInJodaSyntax -Same as [fromUnixTimestamp](#fromUnixTimestamp) but when called in the second way (two or three arguments), the formatting is performed using [Joda style](https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html) instead of MySQL style. +Same as [fromUnixTimestamp](#fromunixtimestamp) but when called in the second way (two or three arguments), the formatting is performed using [Joda style](https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html) instead of MySQL style. **Example:** @@ -4121,7 +4120,7 @@ Result: Returns the current date and time at the moment of query analysis. The function is a constant expression. :::note -This function gives the same result that `now('UTC')` would. It was added only for MySQL support and [`now`](#now-now) is the preferred usage. +This function gives the same result that `now('UTC')` would. It was added only for MySQL support and [`now`](#now) is the preferred usage. ::: **Syntax** diff --git a/docs/en/sql-reference/functions/ext-dict-functions.md b/docs/en/sql-reference/functions/ext-dict-functions.md index 82c21ce40c8..093ee690d47 100644 --- a/docs/en/sql-reference/functions/ext-dict-functions.md +++ b/docs/en/sql-reference/functions/ext-dict-functions.md @@ -12,7 +12,7 @@ For dictionaries created with [DDL queries](../../sql-reference/statements/creat For information on connecting and configuring dictionaries, see [Dictionaries](../../sql-reference/dictionaries/index.md). -## dictGet, dictGetOrDefault, dictGetOrNull {#dictGet} +## dictGet, dictGetOrDefault, dictGetOrNull Retrieves values from a dictionary. diff --git a/docs/en/sql-reference/functions/geo/geohash.md b/docs/en/sql-reference/functions/geo/geohash.md index 8abc8006e5d..b6ac7a74092 100644 --- a/docs/en/sql-reference/functions/geo/geohash.md +++ b/docs/en/sql-reference/functions/geo/geohash.md @@ -4,6 +4,8 @@ sidebar_label: Geohash title: "Functions for Working with Geohash" --- +## Geohash + [Geohash](https://en.wikipedia.org/wiki/Geohash) is the geocode system, which subdivides Earth’s surface into buckets of grid shape and encodes each cell into a short string of letters and digits. It is a hierarchical data structure, so the longer is the geohash string, the more precise is the geographic location. If you need to manually convert geographic coordinates to geohash strings, you can use [geohash.org](http://geohash.org/). diff --git a/docs/en/sql-reference/functions/geo/h3.md b/docs/en/sql-reference/functions/geo/h3.md index bcdd457964a..5fbc2adf2fa 100644 --- a/docs/en/sql-reference/functions/geo/h3.md +++ b/docs/en/sql-reference/functions/geo/h3.md @@ -4,6 +4,8 @@ sidebar_label: H3 Indexes title: "Functions for Working with H3 Indexes" --- +## H3 Index + [H3](https://eng.uber.com/h3/) is a geographical indexing system where Earth’s surface divided into a grid of even hexagonal cells. This system is hierarchical, i. e. each hexagon on the top level ("parent") can be split into seven even but smaller ones ("children"), and so on. The level of the hierarchy is called `resolution` and can receive a value from `0` till `15`, where `0` is the `base` level with the largest and coarsest cells. 
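To make the resolution concept above concrete, a hedged sketch that reuses the coordinates from the examples later in this file; only the query shape matters here:

```sql
-- The same point indexed at a coarse H3 resolution and at the finest resolution (15).
SELECT
    geoToH3(37.79506683, 55.71290588, 3)  AS coarse_cell,
    geoToH3(37.79506683, 55.71290588, 15) AS finest_cell;
```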
@@ -16,7 +18,7 @@ The full description of the H3 system is available at [the Uber Engineering site ## h3IsValid -Verifies whether the number is a valid [H3](#h3index) index. +Verifies whether the number is a valid [H3](#h3-index) index. **Syntax** @@ -51,7 +53,7 @@ Result: ## h3GetResolution -Defines the resolution of the given [H3](#h3index) index. +Defines the resolution of the given [H3](#h3-index) index. **Syntax** @@ -86,7 +88,7 @@ Result: ## h3EdgeAngle -Calculates the average length of the [H3](#h3index) hexagon edge in grades. +Calculates the average length of the [H3](#h3-index) hexagon edge in grades. **Syntax** @@ -100,7 +102,7 @@ h3EdgeAngle(resolution) **Returned values** -- The average length of the [H3](#h3index) hexagon edge in grades. [Float64](../../data-types/float.md). +- The average length of the [H3](#h3-index) hexagon edge in grades. [Float64](../../data-types/float.md). **Example** @@ -120,7 +122,7 @@ Result: ## h3EdgeLengthM -Calculates the average length of the [H3](#h3index) hexagon edge in meters. +Calculates the average length of the [H3](#h3-index) hexagon edge in meters. **Syntax** @@ -134,7 +136,7 @@ h3EdgeLengthM(resolution) **Returned values** -- The average length of the [H3](#h3index) hexagon edge in meters. [Float64](../../data-types/float.md). +- The average length of the [H3](#h3-index) hexagon edge in meters. [Float64](../../data-types/float.md). **Example** @@ -154,7 +156,7 @@ Result: ## h3EdgeLengthKm -Calculates the average length of the [H3](#h3index) hexagon edge in kilometers. +Calculates the average length of the [H3](#h3-index) hexagon edge in kilometers. **Syntax** @@ -168,7 +170,7 @@ h3EdgeLengthKm(resolution) **Returned values** -- The average length of the [H3](#h3index) hexagon edge in kilometers. [Float64](../../data-types/float.md). +- The average length of the [H3](#h3-index) hexagon edge in kilometers. [Float64](../../data-types/float.md). **Example** @@ -188,7 +190,7 @@ Result: ## geoToH3 -Returns [H3](#h3index) point index `(lon, lat)` with specified resolution. +Returns [H3](#h3-index) point index `(lon, lat)` with specified resolution. **Syntax** @@ -225,7 +227,7 @@ Result: ## h3ToGeo -Returns the centroid longitude and latitude corresponding to the provided [H3](#h3index) index. +Returns the centroid longitude and latitude corresponding to the provided [H3](#h3-index) index. **Syntax** @@ -294,7 +296,7 @@ Result: ## h3kRing - Lists all the [H3](#h3index) hexagons in the raduis of `k` from the given hexagon in random order. + Lists all the [H3](#h3-index) hexagons in the raduis of `k` from the given hexagon in random order. **Syntax** @@ -335,7 +337,7 @@ Result: ## h3GetBaseCell -Returns the base cell number of the [H3](#h3index) index. +Returns the base cell number of the [H3](#h3-index) index. **Syntax** @@ -437,7 +439,7 @@ Result: ## h3IndexesAreNeighbors -Returns whether or not the provided [H3](#h3index) indexes are neighbors. +Returns whether or not the provided [H3](#h3-index) indexes are neighbors. **Syntax** @@ -473,7 +475,7 @@ Result: ## h3ToChildren -Returns an array of child indexes for the given [H3](#h3index) index. +Returns an array of child indexes for the given [H3](#h3-index) index. **Syntax** @@ -508,7 +510,7 @@ Result: ## h3ToParent -Returns the parent (coarser) index containing the given [H3](#h3index) index. +Returns the parent (coarser) index containing the given [H3](#h3-index) index. **Syntax** @@ -609,7 +611,7 @@ Result: ## h3GetResolution -Returns the resolution of the [H3](#h3index) index. 
+Returns the resolution of the [H3](#h3-index) index. **Syntax** @@ -643,7 +645,7 @@ Result: ## h3IsResClassIII -Returns whether [H3](#h3index) index has a resolution with Class III orientation. +Returns whether [H3](#h3-index) index has a resolution with Class III orientation. **Syntax** @@ -678,7 +680,7 @@ Result: ## h3IsPentagon -Returns whether this [H3](#h3index) index represents a pentagonal cell. +Returns whether this [H3](#h3-index) index represents a pentagonal cell. **Syntax** @@ -713,7 +715,7 @@ Result: ## h3GetFaces -Returns icosahedron faces intersected by a given [H3](#h3index) index. +Returns icosahedron faces intersected by a given [H3](#h3-index) index. **Syntax** @@ -815,7 +817,7 @@ Result: ## h3ToCenterChild -Returns the center child (finer) [H3](#h3index) index contained by given [H3](#h3index) at the given resolution. +Returns the center child (finer) [H3](#h3-index) index contained by given [H3](#h3-index) at the given resolution. **Syntax** @@ -830,7 +832,7 @@ h3ToCenterChild(index, resolution) **Returned values** -- [H3](#h3index) index of the center child contained by given [H3](#h3index) at the given resolution. [UInt64](../../data-types/int-uint.md). +- [H3](#h3-index) index of the center child contained by given [H3](#h3-index) at the given resolution. [UInt64](../../data-types/int-uint.md). **Example** diff --git a/docs/en/sql-reference/functions/geo/s2.md b/docs/en/sql-reference/functions/geo/s2.md index 3165b21318b..e022ce870b0 100644 --- a/docs/en/sql-reference/functions/geo/s2.md +++ b/docs/en/sql-reference/functions/geo/s2.md @@ -5,6 +5,8 @@ sidebar_label: S2 Geometry # Functions for Working with S2 Index +## S2Index + [S2](https://s2geometry.io/) is a geographical indexing system where all geographical data is represented on a three-dimensional sphere (similar to a globe). In the S2 library points are represented as the S2 Index - a specific number which encodes internally a point on the surface of a unit sphere, unlike traditional (latitude, longitude) pairs. To get the S2 point index for a given point specified in the format (latitude, longitude) use the [geoToS2](#geotos2) function. Also, you can use the [s2ToGeo](#s2togeo) function for getting geographical coordinates corresponding to the specified S2 point index. diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index 506114038f7..e431ed75465 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -45,13 +45,13 @@ SELECT halfMD5(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00') Calculates the MD4 from a string and returns the resulting set of bytes as FixedString(16). -## MD5 {#md5} +## MD5 Calculates the MD5 from a string and returns the resulting set of bytes as FixedString(16). If you do not need MD5 in particular, but you need a decent cryptographic 128-bit hash, use the ‘sipHash128’ function instead. If you want to get the same result as output by the md5sum utility, use lower(hex(MD5(s))). -## sipHash64 {#siphash64} +## sipHash64 Produces a 64-bit [SipHash](https://en.wikipedia.org/wiki/SipHash) hash value. 
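A short hedged example tying together the two hash functions mentioned at the end of this hunk (the input string is arbitrary):

```sql
-- md5sum-compatible output via lower(hex(MD5(...))) alongside a 64-bit SipHash value.
SELECT
    lower(hex(MD5('ClickHouse'))) AS md5sum_style,
    sipHash64('ClickHouse') AS sip_hash_64;
```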
diff --git a/docs/en/sql-reference/functions/ip-address-functions.md b/docs/en/sql-reference/functions/ip-address-functions.md index 5b6a3aef2c8..11a7749b33d 100644 --- a/docs/en/sql-reference/functions/ip-address-functions.md +++ b/docs/en/sql-reference/functions/ip-address-functions.md @@ -295,7 +295,7 @@ Same as `toIPv6`, but if the IPv6 address has an invalid format, it returns null ## toIPv6 Converts a string form of IPv6 address to [IPv6](../data-types/ipv6.md) type. If the IPv6 address has an invalid format, returns an empty value. -Similar to [IPv6StringToNum](#ipv6stringtonums) function, which converts IPv6 address to binary format. +Similar to [IPv6StringToNum](#ipv6stringtonum) function, which converts IPv6 address to binary format. If the input string contains a valid IPv4 address, then the IPv6 equivalent of the IPv4 address is returned. diff --git a/docs/en/sql-reference/functions/json-functions.md b/docs/en/sql-reference/functions/json-functions.md index 5d73c9a83b3..7bff6a6cba5 100644 --- a/docs/en/sql-reference/functions/json-functions.md +++ b/docs/en/sql-reference/functions/json-functions.md @@ -5,10 +5,10 @@ sidebar_label: JSON --- There are two sets of functions to parse JSON: - - [`simpleJSON*` (`visitParam*`)](#simplejson--visitparam-functions) which is made for parsing a limited subset of JSON extremely fast. + - [`simpleJSON*` (`visitParam*`)](#simplejson-visitparam-functions) which is made for parsing a limited subset of JSON extremely fast. - [`JSONExtract*`](#jsonextract-functions) which is made for parsing ordinary JSON. -## simpleJSON / visitParam functions +## simpleJSON (visitParam) functions ClickHouse has special functions for working with simplified JSON. All these JSON functions are based on strong assumptions about what the JSON can be. They try to do as little as possible to get the job done as quickly as possible. diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 5e63d9824b4..e22dd5d827c 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -762,7 +762,7 @@ LIMIT 10 Given a size (number of bytes), this function returns a readable, rounded size with suffix (KB, MB, etc.) as string. -The opposite operations of this function are [parseReadableSize](#parseReadableSize), [parseReadableSizeOrZero](#parseReadableSizeOrZero), and [parseReadableSizeOrNull](#parseReadableSizeOrNull). +The opposite operations of this function are [parseReadableSize](#parsereadablesize), [parseReadableSizeOrZero](#parsereadablesizeorzero), and [parseReadableSizeOrNull](#parsereadablesizeornull). **Syntax** @@ -795,7 +795,7 @@ Result: Given a size (number of bytes), this function returns a readable, rounded size with suffix (KiB, MiB, etc.) as string. -The opposite operations of this function are [parseReadableSize](#parseReadableSize), [parseReadableSizeOrZero](#parseReadableSizeOrZero), and [parseReadableSizeOrNull](#parseReadableSizeOrNull). +The opposite operations of this function are [parseReadableSize](#parsereadablesize), [parseReadableSizeOrZero](#parsereadablesizeorzero), and [parseReadableSizeOrNull](#parsereadablesizeornull). **Syntax** @@ -926,7 +926,7 @@ SELECT Given a string containing a byte size and `B`, `KiB`, `KB`, `MiB`, `MB`, etc. as a unit (i.e. [ISO/IEC 80000-13](https://en.wikipedia.org/wiki/ISO/IEC_80000) or decimal byte unit), this function returns the corresponding number of bytes. 
If the function is unable to parse the input value, it throws an exception. -The inverse operations of this function are [formatReadableSize](#formatReadableSize) and [formatReadableDecimalSize](#formatReadableDecimalSize). +The inverse operations of this function are [formatReadableSize](#formatreadablesize) and [formatReadableDecimalSize](#formatreadabledecimalsize). **Syntax** @@ -964,7 +964,7 @@ SELECT Given a string containing a byte size and `B`, `KiB`, `KB`, `MiB`, `MB`, etc. as a unit (i.e. [ISO/IEC 80000-13](https://en.wikipedia.org/wiki/ISO/IEC_80000) or decimal byte unit), this function returns the corresponding number of bytes. If the function is unable to parse the input value, it returns `NULL`. -The inverse operations of this function are [formatReadableSize](#formatReadableSize) and [formatReadableDecimalSize](#formatReadableDecimalSize). +The inverse operations of this function are [formatReadableSize](#formatreadablesize) and [formatReadableDecimalSize](#formatreadabledecimalsize). **Syntax** @@ -1002,7 +1002,7 @@ SELECT Given a string containing a byte size and `B`, `KiB`, `KB`, `MiB`, `MB`, etc. as a unit (i.e. [ISO/IEC 80000-13](https://en.wikipedia.org/wiki/ISO/IEC_80000) or decimal byte unit), this function returns the corresponding number of bytes. If the function is unable to parse the input value, it returns `0`. -The inverse operations of this function are [formatReadableSize](#formatReadableSize) and [formatReadableDecimalSize](#formatReadableDecimalSize). +The inverse operations of this function are [formatReadableSize](#formatreadablesize) and [formatReadableDecimalSize](#formatreadabledecimalsize). **Syntax** @@ -2711,7 +2711,7 @@ countDigits(x) - Number of digits. [UInt8](../data-types/int-uint.md#uint-ranges). :::note -For `Decimal` values takes into account their scales: calculates result over underlying integer type which is `(value * scale)`. For example: `countDigits(42) = 2`, `countDigits(42.000) = 5`, `countDigits(0.04200) = 4`. I.e. you may check decimal overflow for `Decimal64` with `countDecimal(x) > 18`. It's a slow variant of [isDecimalOverflow](#is-decimal-overflow). +For `Decimal` values takes into account their scales: calculates result over underlying integer type which is `(value * scale)`. For example: `countDigits(42) = 2`, `countDigits(42.000) = 5`, `countDigits(0.04200) = 4`. I.e. you may check decimal overflow for `Decimal64` with `countDecimal(x) > 18`. It's a slow variant of [isDecimalOverflow](#isdecimaloverflow). ::: **Example** @@ -2803,7 +2803,7 @@ currentProfiles() ## enabledProfiles -Returns settings profiles, assigned to the current user both explicitly and implicitly. Explicitly assigned profiles are the same as returned by the [currentProfiles](#current-profiles) function. Implicitly assigned profiles include parent profiles of other assigned profiles, profiles assigned via granted roles, profiles assigned via their own settings, and the main default profile (see the `default_profile` section in the main server configuration file). +Returns settings profiles, assigned to the current user both explicitly and implicitly. Explicitly assigned profiles are the same as returned by the [currentProfiles](#currentprofiles) function. Implicitly assigned profiles include parent profiles of other assigned profiles, profiles assigned via granted roles, profiles assigned via their own settings, and the main default profile (see the `default_profile` section in the main server configuration file). 
**Syntax** @@ -2916,11 +2916,11 @@ Result: └───────────────────────────┘ ``` -## queryID {#queryID} +## queryID Returns the ID of the current query. Other parameters of a query can be extracted from the [system.query_log](../../operations/system-tables/query_log.md) table via `query_id`. -In contrast to [initialQueryID](#initial-query-id) function, `queryID` can return different results on different shards (see the example). +In contrast to [initialQueryID](#initialqueryid) function, `queryID` can return different results on different shards (see the example). **Syntax** @@ -2954,7 +2954,7 @@ Result: Returns the ID of the initial current query. Other parameters of a query can be extracted from the [system.query_log](../../operations/system-tables/query_log.md) table via `initial_query_id`. -In contrast to [queryID](#query-id) function, `initialQueryID` returns the same results on different shards (see example). +In contrast to [queryID](#queryid) function, `initialQueryID` returns the same results on different shards (see example). **Syntax** @@ -3041,7 +3041,7 @@ shardCount() **See Also** -- [shardNum()](#shard-num) function example also contains `shardCount()` function call. +- [shardNum()](#shardnum) function example also contains `shardCount()` function call. ## getOSKernelVersion diff --git a/docs/en/sql-reference/functions/rounding-functions.md b/docs/en/sql-reference/functions/rounding-functions.md index 6495a43fc85..e2f471d47eb 100644 --- a/docs/en/sql-reference/functions/rounding-functions.md +++ b/docs/en/sql-reference/functions/rounding-functions.md @@ -200,7 +200,7 @@ Banker's rounding is a method of rounding fractional numbers When the rounding number is halfway between two numbers, it's rounded to the nearest even digit at the specified decimal position. For example: 3.5 rounds up to 4, 2.5 rounds down to 2. It's the default rounding method for floating point numbers defined in [IEEE 754](https://en.wikipedia.org/wiki/IEEE_754#Roundings_to_nearest). -The [round](#rounding_functions-round) function performs the same rounding for floating point numbers. +The [round](#round) function performs the same rounding for floating point numbers. The `roundBankers` function also rounds integers the same way, for example, `roundBankers(45, -1) = 40`. In other cases, the function rounds numbers to the nearest integer. @@ -274,7 +274,7 @@ roundBankers(10.755, 2) = 10.76 **See Also** -- [round](#rounding_functions-round) +- [round](#round) ## roundToExp2 diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 342ca2b9f03..c2d19f58422 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -1994,7 +1994,7 @@ Result: ## stringJaccardIndexUTF8 -Like [stringJaccardIndex](#stringJaccardIndex) but for UTF8-encoded strings. +Like [stringJaccardIndex](#stringjaccardindex) but for UTF8-encoded strings. ## editDistance diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index d261cff3580..b7ba1d4feb7 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -262,7 +262,7 @@ Result: ## multiSearchAllPositionsUTF8 -Like [multiSearchAllPositions](#multiSearchAllPositions) but assumes `haystack` and the `needle` substrings are UTF-8 encoded strings. 
+Like [multiSearchAllPositions](#multisearchallpositions) but assumes `haystack` and the `needle` substrings are UTF-8 encoded strings. **Syntax** @@ -336,7 +336,7 @@ Result: Like [`position`](#position) but returns the leftmost offset in a `haystack` string which matches any of multiple `needle` strings. -Functions [`multiSearchFirstPositionCaseInsensitive`](#multiSearchFirstPositionCaseInsensitive), [`multiSearchFirstPositionUTF8`](#multiSearchFirstPositionUTF8) and [`multiSearchFirstPositionCaseInsensitiveUTF8`](#multiSearchFirstPositionCaseInsensitiveUTF8) provide case-insensitive and/or UTF-8 variants of this function. +Functions [`multiSearchFirstPositionCaseInsensitive`](#multisearchfirstpositioncaseinsensitive), [`multiSearchFirstPositionUTF8`](#multisearchfirstpositionutf8) and [`multiSearchFirstPositionCaseInsensitiveUTF8`](#multisearchfirstpositioncaseinsensitiveutf8) provide case-insensitive and/or UTF-8 variants of this function. **Syntax** @@ -370,7 +370,7 @@ Result: ## multiSearchFirstPositionCaseInsensitive -Like [`multiSearchFirstPosition`](#multiSearchFirstPosition) but ignores case. +Like [`multiSearchFirstPosition`](#multisearchfirstposition) but ignores case. **Syntax** @@ -404,7 +404,7 @@ Result: ## multiSearchFirstPositionUTF8 -Like [`multiSearchFirstPosition`](#multiSearchFirstPosition) but assumes `haystack` and `needle` to be UTF-8 strings. +Like [`multiSearchFirstPosition`](#multisearchfirstposition) but assumes `haystack` and `needle` to be UTF-8 strings. **Syntax** @@ -440,7 +440,7 @@ Result: ## multiSearchFirstPositionCaseInsensitiveUTF8 -Like [`multiSearchFirstPosition`](#multiSearchFirstPosition) but assumes `haystack` and `needle` to be UTF-8 strings and ignores case. +Like [`multiSearchFirstPosition`](#multisearchfirstposition) but assumes `haystack` and `needle` to be UTF-8 strings and ignores case. **Syntax** @@ -478,7 +478,7 @@ Result: Returns the index `i` (starting from 1) of the leftmost found needlei in the string `haystack` and 0 otherwise. -Functions [`multiSearchFirstIndexCaseInsensitive`](#multiSearchFirstIndexCaseInsensitive), [`multiSearchFirstIndexUTF8`](#multiSearchFirstIndexUTF8) and [`multiSearchFirstIndexCaseInsensitiveUTF8`](#multiSearchFirstIndexCaseInsensitiveUTF8) provide case-insensitive and/or UTF-8 variants of this function. +Functions [`multiSearchFirstIndexCaseInsensitive`](#multisearchfirstindexcaseinsensitive), [`multiSearchFirstIndexUTF8`](#multisearchfirstindexutf8) and [`multiSearchFirstIndexCaseInsensitiveUTF8`](#multisearchfirstindexcaseinsensitiveutf8) provide case-insensitive and/or UTF-8 variants of this function. **Syntax** @@ -615,7 +615,7 @@ Result: Returns 1, if at least one string needlei matches the string `haystack` and 0 otherwise. -Functions [`multiSearchAnyCaseInsensitive`](#multiSearchAnyCaseInsensitive), [`multiSearchAnyUTF8`](#multiSearchAnyUTF8) and []`multiSearchAnyCaseInsensitiveUTF8`](#multiSearchAnyCaseInsensitiveUTF8) provide case-insensitive and/or UTF-8 variants of this function. +Functions [`multiSearchAnyCaseInsensitive`](#multisearchanycaseinsensitive), [`multiSearchAnyUTF8`](#multisearchanyutf8) and [`multiSearchAnyCaseInsensitiveUTF8`](#multisearchanycaseinsensitiveutf8) provide case-insensitive and/or UTF-8 variants of this function. **Syntax** @@ -719,7 +719,7 @@ Result: ## multiSearchAnyCaseInsensitiveUTF8 -Like [multiSearchAnyUTF8](#multiSearchAnyUTF8) but ignores case. +Like [multiSearchAnyUTF8](#multisearchanyutf8) but ignores case. 
*Syntax** @@ -880,7 +880,7 @@ extractAll(haystack, pattern) Matches all groups of the `haystack` string using the `pattern` regular expression. Returns an array of arrays, where the first array includes all fragments matching the first group, the second array - matching the second group, etc. -This function is slower than [extractAllGroupsVertical](#extractallgroups-vertical). +This function is slower than [extractAllGroupsVertical](#extractallgroupsvertical). **Syntax** @@ -952,7 +952,7 @@ Result: └────────────────────────────────────────────────────────────────────────────────────────┘ ``` -## like {#like} +## like Returns whether string `haystack` matches the LIKE expression `pattern`. @@ -1215,7 +1215,7 @@ Result: ## ngramSearchCaseInsensitive -Provides a case-insensitive variant of [ngramSearch](#ngramSearch). +Provides a case-insensitive variant of [ngramSearch](#ngramsearch). **Syntax** @@ -1630,7 +1630,7 @@ Result: ## hasSubsequenceCaseInsensitive -Like [hasSubsequence](#hasSubsequence) but searches case-insensitively. +Like [hasSubsequence](#hassubsequence) but searches case-insensitively. **Syntax** @@ -1665,7 +1665,7 @@ Result: ## hasSubsequenceUTF8 -Like [hasSubsequence](#hasSubsequence) but assumes `haystack` and `needle` are UTF-8 encoded strings. +Like [hasSubsequence](#hassubsequence) but assumes `haystack` and `needle` are UTF-8 encoded strings. **Syntax** @@ -1700,7 +1700,7 @@ Result: ## hasSubsequenceCaseInsensitiveUTF8 -Like [hasSubsequenceUTF8](#hasSubsequenceUTF8) but searches case-insensitively. +Like [hasSubsequenceUTF8](#hassubsequenceutf8) but searches case-insensitively. **Syntax** diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 2ec51d43c59..61e84ca72d1 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -10,7 +10,7 @@ sidebar_label: Type Conversion ClickHouse generally uses the [same behavior as C++ programs](https://en.cppreference.com/w/cpp/language/implicit_conversion). -`to` functions and [cast](#castx-t) behave differently in some cases, for example in case of [LowCardinality](../data-types/lowcardinality.md): [cast](#castx-t) removes [LowCardinality](../data-types/lowcardinality.md) trait `to` functions don't. The same with [Nullable](../data-types/nullable.md), this behaviour is not compatible with SQL standard, and it can be changed using [cast_keep_nullable](../../operations/settings/settings.md/#cast_keep_nullable) setting. +`to` functions and [cast](#cast) behave differently in some cases, for example in case of [LowCardinality](../data-types/lowcardinality.md): [cast](#cast) removes [LowCardinality](../data-types/lowcardinality.md) trait `to` functions don't. The same with [Nullable](../data-types/nullable.md), this behaviour is not compatible with SQL standard, and it can be changed using [cast_keep_nullable](../../operations/settings/settings.md/#cast_keep_nullable) setting. :::note Be aware of potential data loss if values of a datatype are converted to a smaller datatype (for example from `Int64` to `Int32`) or between @@ -70,7 +70,7 @@ Integer value in the `Int8`, `Int16`, `Int32`, `Int64`, `Int128` or `Int256` dat Functions use [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), meaning they truncate fractional digits of numbers. 
-The behavior of functions for the [NaN and Inf](../data-types/float.md/#data_type-float-nan-inf) arguments is undefined. Remember about [numeric conversions issues](#numeric-conversion-issues), when using the functions. +The behavior of functions for the [NaN and Inf](../data-types/float.md/#data_type-float-nan-inf) arguments is undefined. Remember about [numeric conversions issues](#common-issues-with-data-conversion), when using the functions. **Example** @@ -169,7 +169,7 @@ Converts an input value to the [UInt](../data-types/int-uint.md) data type. This Functions use [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), meaning they truncate fractional digits of numbers. -The behavior of functions for negative arguments and for the [NaN and Inf](../data-types/float.md/#data_type-float-nan-inf) arguments is undefined. If you pass a string with a negative number, for example `'-32'`, ClickHouse raises an exception. Remember about [numeric conversions issues](#numeric-conversion-issues), when using the functions. +The behavior of functions for negative arguments and for the [NaN and Inf](../data-types/float.md/#data_type-float-nan-inf) arguments is undefined. If you pass a string with a negative number, for example `'-32'`, ClickHouse raises an exception. Remember about [numeric conversions issues](#common-issues-with-data-conversion), when using the functions. **Example** @@ -996,7 +996,7 @@ Result: ## reinterpretAsUInt8 -Performs byte reinterpretation by treating the input value as a value of type UInt8. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type UInt8. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1034,7 +1034,7 @@ Result: ## reinterpretAsUInt16 -Performs byte reinterpretation by treating the input value as a value of type UInt16. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type UInt16. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1072,7 +1072,7 @@ Result: ## reinterpretAsUInt32 -Performs byte reinterpretation by treating the input value as a value of type UInt32. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type UInt32. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1110,7 +1110,7 @@ Result: ## reinterpretAsUInt64 -Performs byte reinterpretation by treating the input value as a value of type UInt64. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. 
+Performs byte reinterpretation by treating the input value as a value of type UInt64. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1148,7 +1148,7 @@ Result: ## reinterpretAsUInt128 -Performs byte reinterpretation by treating the input value as a value of type UInt128. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type UInt128. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1186,7 +1186,7 @@ Result: ## reinterpretAsUInt256 -Performs byte reinterpretation by treating the input value as a value of type UInt256. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type UInt256. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1224,7 +1224,7 @@ Result: ## reinterpretAsInt8 -Performs byte reinterpretation by treating the input value as a value of type Int8. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Int8. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1262,7 +1262,7 @@ Result: ## reinterpretAsInt16 -Performs byte reinterpretation by treating the input value as a value of type Int16. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Int16. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1300,7 +1300,7 @@ Result: ## reinterpretAsInt32 -Performs byte reinterpretation by treating the input value as a value of type Int32. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Int32. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1338,7 +1338,7 @@ Result: ## reinterpretAsInt64 -Performs byte reinterpretation by treating the input value as a value of type Int64. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. 
+Performs byte reinterpretation by treating the input value as a value of type Int64. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1376,7 +1376,7 @@ Result: ## reinterpretAsInt128 -Performs byte reinterpretation by treating the input value as a value of type Int128. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Int128. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1414,7 +1414,7 @@ Result: ## reinterpretAsInt256 -Performs byte reinterpretation by treating the input value as a value of type Int256. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Int256. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1452,7 +1452,7 @@ Result: ## reinterpretAsFloat32 -Performs byte reinterpretation by treating the input value as a value of type Float32. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Float32. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1486,7 +1486,7 @@ Result: ## reinterpretAsFloat64 -Performs byte reinterpretation by treating the input value as a value of type Float64. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Float64. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1730,7 +1730,7 @@ Result: └─────────────────────┘ ``` -## reinterpret(x, T) +## reinterpret Uses the same source in-memory bytes sequence for `x` value and reinterprets it to destination type. @@ -1766,9 +1766,9 @@ Result: └─────────────┴──────────────┴───────────────┘ ``` -## CAST(x, T) +## CAST -Converts an input value to the specified data type. Unlike the [reinterpret](#type_conversion_function-reinterpret) function, `CAST` tries to present the same value using the new data type. If the conversion can not be done then an exception is raised. +Converts an input value to the specified data type. Unlike the [reinterpret](#reinterpret) function, `CAST` tries to present the same value using the new data type. If the conversion can not be done then an exception is raised. Several syntax variants are supported. **Syntax** @@ -1875,7 +1875,7 @@ Result: Converts `x` to the `T` data type. 
-The difference from [cast(x, T)](#type_conversion_function-cast) is that `accurateCast` does not allow overflow of numeric types during cast if type value `x` does not fit the bounds of type `T`. For example, `accurateCast(-1, 'UInt8')` throws an exception. +The difference from [cast](#cast) is that `accurateCast` does not allow overflow of numeric types during cast if type value `x` does not fit the bounds of type `T`. For example, `accurateCast(-1, 'UInt8')` throws an exception. **Example** @@ -2061,7 +2061,7 @@ Result: └───────────────────────────┴──────────────────────────────┘ ``` -## parseDateTime {#type_conversion_functions-parseDateTime} +## parseDateTime Converts a [String](../data-types/string.md) to [DateTime](../data-types/datetime.md) according to a [MySQL format string](https://dev.mysql.com/doc/refman/8.0/en/date-and-time-functions.html#function_date-format). @@ -2102,15 +2102,15 @@ Alias: `TO_TIMESTAMP`. ## parseDateTimeOrZero -Same as for [parseDateTime](#type_conversion_functions-parseDateTime) except that it returns zero date when it encounters a date format that cannot be processed. +Same as for [parseDateTime](#parsedatetime) except that it returns zero date when it encounters a date format that cannot be processed. ## parseDateTimeOrNull -Same as for [parseDateTime](#type_conversion_functions-parseDateTime) except that it returns `NULL` when it encounters a date format that cannot be processed. +Same as for [parseDateTime](#parsedatetime) except that it returns `NULL` when it encounters a date format that cannot be processed. Alias: `str_to_date`. -## parseDateTimeInJodaSyntax {#type_conversion_functions-parseDateTimeInJodaSyntax} +## parseDateTimeInJodaSyntax Similar to [parseDateTime](#parsedatetime), except that the format string is in [Joda](https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html) instead of MySQL syntax. @@ -2151,11 +2151,11 @@ SELECT parseDateTimeInJodaSyntax('2023-02-24 14:53:31', 'yyyy-MM-dd HH:mm:ss', ' ## parseDateTimeInJodaSyntaxOrZero -Same as for [parseDateTimeInJodaSyntax](#type_conversion_functions-parseDateTimeInJodaSyntax) except that it returns zero date when it encounters a date format that cannot be processed. +Same as for [parseDateTimeInJodaSyntax](#parsedatetimeinjodasyntax) except that it returns zero date when it encounters a date format that cannot be processed. ## parseDateTimeInJodaSyntaxOrNull -Same as for [parseDateTimeInJodaSyntax](#type_conversion_functions-parseDateTimeInJodaSyntax) except that it returns `NULL` when it encounters a date format that cannot be processed. +Same as for [parseDateTimeInJodaSyntax](#parsedatetimeinjodasyntax) except that it returns `NULL` when it encounters a date format that cannot be processed. ## parseDateTimeBestEffort ## parseDateTime32BestEffort @@ -2313,11 +2313,11 @@ Same as for [parseDateTimeBestEffort](#parsedatetimebesteffort) except that it r ## parseDateTimeBestEffortUSOrNull -Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except that it returns `NULL` when it encounters a date format that cannot be processed. +Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortus) function except that it returns `NULL` when it encounters a date format that cannot be processed. 
## parseDateTimeBestEffortUSOrZero -Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except that it returns zero date (`1970-01-01`) or zero date with time (`1970-01-01 00:00:00`) when it encounters a date format that cannot be processed. +Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortus) function except that it returns zero date (`1970-01-01`) or zero date with time (`1970-01-01 00:00:00`) when it encounters a date format that cannot be processed. ## parseDateTime64BestEffort @@ -2389,7 +2389,7 @@ Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort), except that Converts input parameter to the [LowCardinality](../data-types/lowcardinality.md) version of same data type. -To convert data from the `LowCardinality` data type use the [CAST](#type_conversion_function-cast) function. For example, `CAST(x as String)`. +To convert data from the `LowCardinality` data type use the [CAST](#cast) function. For example, `CAST(x as String)`. **Syntax** diff --git a/docs/en/sql-reference/functions/uuid-functions.md b/docs/en/sql-reference/functions/uuid-functions.md index 0323ae728a9..5f15907d029 100644 --- a/docs/en/sql-reference/functions/uuid-functions.md +++ b/docs/en/sql-reference/functions/uuid-functions.md @@ -150,7 +150,7 @@ The function also works for [Arrays](array-functions.md#function-empty) and [Str **Example** -To generate the UUID value, ClickHouse provides the [generateUUIDv4](#uuid-function-generate) function. +To generate the UUID value, ClickHouse provides the [generateUUIDv4](#generateuuidv4) function. Query: @@ -190,7 +190,7 @@ The function also works for [Arrays](array-functions.md#function-notempty) or [S **Example** -To generate the UUID value, ClickHouse provides the [generateUUIDv4](#uuid-function-generate) function. +To generate the UUID value, ClickHouse provides the [generateUUIDv4](#generateuuidv4) function. Query: diff --git a/docs/en/sql-reference/operators/in.md b/docs/en/sql-reference/operators/in.md index 0257d21b30f..ed75b1802d8 100644 --- a/docs/en/sql-reference/operators/in.md +++ b/docs/en/sql-reference/operators/in.md @@ -235,7 +235,7 @@ If `some_predicate` is not selective enough, it will return a large amount of da ### Distributed Subqueries and max_parallel_replicas -When [max_parallel_replicas](#settings-max_parallel_replicas) is greater than 1, distributed queries are further transformed. +When [max_parallel_replicas](#distributed-subqueries-and-max_parallel_replicas) is greater than 1, distributed queries are further transformed. For example, the following: @@ -255,7 +255,7 @@ where `M` is between `1` and `3` depending on which replica the local query is e These settings affect every MergeTree-family table in the query and have the same effect as applying `SAMPLE 1/3 OFFSET (M-1)/3` on each table. -Therefore adding the [max_parallel_replicas](#settings-max_parallel_replicas) setting will only produce correct results if both tables have the same replication scheme and are sampled by UserID or a subkey of it. In particular, if `local_table_2` does not have a sampling key, incorrect results will be produced. The same rule applies to `JOIN`. +Therefore adding the [max_parallel_replicas](#distributed-subqueries-and-max_parallel_replicas) setting will only produce correct results if both tables have the same replication scheme and are sampled by UserID or a subkey of it. In particular, if `local_table_2` does not have a sampling key, incorrect results will be produced. The same rule applies to `JOIN`. 
One workaround if `local_table_2` does not meet the requirements, is to use `GLOBAL IN` or `GLOBAL JOIN`. diff --git a/docs/en/sql-reference/statements/alter/column.md b/docs/en/sql-reference/statements/alter/column.md index a23710b12bd..aa6f132e08e 100644 --- a/docs/en/sql-reference/statements/alter/column.md +++ b/docs/en/sql-reference/statements/alter/column.md @@ -108,7 +108,7 @@ ALTER TABLE visits RENAME COLUMN webBrowser TO browser CLEAR COLUMN [IF EXISTS] name IN PARTITION partition_name ``` -Resets all data in a column for a specified partition. Read more about setting the partition name in the section [How to set the partition expression](partition.md/#how-to-set-partition-expression). +Resets all data in a column for a specified partition. Read more about setting the partition name in the section [How to set the partition expression](../alter/partition.md/#how-to-set-partition-expression). If the `IF EXISTS` clause is specified, the query won’t return an error if the column does not exist. @@ -173,7 +173,7 @@ ALTER TABLE visits MODIFY COLUMN browser Array(String) Changing the column type is the only complex action – it changes the contents of files with data. For large tables, this may take a long time. -The query also can change the order of the columns using `FIRST | AFTER` clause, see [ADD COLUMN](#alter_add-column) description, but column type is mandatory in this case. +The query also can change the order of the columns using `FIRST | AFTER` clause, see [ADD COLUMN](#add-column) description, but column type is mandatory in this case. Example: diff --git a/docs/en/sql-reference/statements/alter/partition.md b/docs/en/sql-reference/statements/alter/partition.md index 0ed1e523669..778816f8934 100644 --- a/docs/en/sql-reference/statements/alter/partition.md +++ b/docs/en/sql-reference/statements/alter/partition.md @@ -31,7 +31,7 @@ The following operations with [partitions](/docs/en/engines/table-engines/merget ALTER TABLE table_name [ON CLUSTER cluster] DETACH PARTITION|PART partition_expr ``` -Moves all data for the specified partition to the `detached` directory. The server forgets about the detached data partition as if it does not exist. The server will not know about this data until you make the [ATTACH](#alter_attach-partition) query. +Moves all data for the specified partition to the `detached` directory. The server forgets about the detached data partition as if it does not exist. The server will not know about this data until you make the [ATTACH](#attach-partitionpart) query. Example: @@ -252,7 +252,7 @@ Downloads a partition from another server. This query only works for the replica The query does the following: 1. Downloads the partition|part from the specified shard. In ‘path-in-zookeeper’ you must specify a path to the shard in ZooKeeper. -2. Then the query puts the downloaded data to the `detached` directory of the `table_name` table. Use the [ATTACH PARTITION\|PART](#alter_attach-partition) query to add the data to the table. +2. Then the query puts the downloaded data to the `detached` directory of the `table_name` table. Use the [ATTACH PARTITION\|PART](#attach-partitionpart) query to add the data to the table. For example: @@ -353,7 +353,7 @@ You can specify the partition expression in `ALTER ... PARTITION` queries in dif - Using the keyword `ALL`. It can be used only with DROP/DETACH/ATTACH. For example, `ALTER TABLE visits ATTACH PARTITION ALL`. - As a tuple of expressions or constants that matches (in types) the table partitioning keys tuple. 
In the case of a single element partitioning key, the expression should be wrapped in the `tuple (...)` function. For example, `ALTER TABLE visits DETACH PARTITION tuple(toYYYYMM(toDate('2019-01-25')))`. - Using the partition ID. Partition ID is a string identifier of the partition (human-readable, if possible) that is used as the names of partitions in the file system and in ZooKeeper. The partition ID must be specified in the `PARTITION ID` clause, in a single quotes. For example, `ALTER TABLE visits DETACH PARTITION ID '201901'`. -- In the [ALTER ATTACH PART](#alter_attach-partition) and [DROP DETACHED PART](#alter_drop-detached) query, to specify the name of a part, use string literal with a value from the `name` column of the [system.detached_parts](/docs/en/operations/system-tables/detached_parts.md/#system_tables-detached_parts) table. For example, `ALTER TABLE visits ATTACH PART '201901_1_1_0'`. +- In the [ALTER ATTACH PART](#attach-partitionpart) and [DROP DETACHED PART](#drop-detached-partitionpart) query, to specify the name of a part, use string literal with a value from the `name` column of the [system.detached_parts](/docs/en/operations/system-tables/detached_parts.md/#system_tables-detached_parts) table. For example, `ALTER TABLE visits ATTACH PART '201901_1_1_0'`. Usage of quotes when specifying the partition depends on the type of partition expression. For example, for the `String` type, you have to specify its name in quotes (`'`). For the `Date` and `Int*` types no quotes are needed. diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index 628fe1d2875..0253bc647e6 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -17,8 +17,8 @@ By default, tables are created only on the current server. Distributed DDL queri ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( - name1 [type1] [NULL|NOT NULL] [DEFAULT|MATERIALIZED|EPHEMERAL|ALIAS expr1] [compression_codec] [TTL expr1] [COMMENT 'comment for column'], - name2 [type2] [NULL|NOT NULL] [DEFAULT|MATERIALIZED|EPHEMERAL|ALIAS expr2] [compression_codec] [TTL expr2] [COMMENT 'comment for column'], + name1 [type1] [NULL|NOT NULL] [DEFAULT|MATERIALIZED|EPHEMERAL|ALIAS expr1] [COMMENT 'comment for column'] [compression_codec] [TTL expr1], + name2 [type2] [NULL|NOT NULL] [DEFAULT|MATERIALIZED|EPHEMERAL|ALIAS expr2] [COMMENT 'comment for column'] [compression_codec] [TTL expr2], ... ) ENGINE = engine COMMENT 'comment for table' diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 1bdf22b35b0..1fabb6d8cc7 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -6,7 +6,7 @@ sidebar_label: VIEW # CREATE VIEW -Creates a new view. Views can be [normal](#normal-view), [materialized](#materialized-view), [live](#live-view-experimental), and [window](#window-view-experimental) (live view and window view are experimental features). +Creates a new view. Views can be [normal](#normal-view), [materialized](#materialized-view), [live](#live-view-deprecated), and [window](#window-view-experimental) (live view and window view are experimental features). 
## Normal View diff --git a/docs/en/sql-reference/statements/grant.md b/docs/en/sql-reference/statements/grant.md index 2850ce71781..43fa344a16d 100644 --- a/docs/en/sql-reference/statements/grant.md +++ b/docs/en/sql-reference/statements/grant.md @@ -33,7 +33,7 @@ GRANT [ON CLUSTER cluster_name] role [,...] TO {user | another_role | CURRENT_US - `role` — ClickHouse user role. - `user` — ClickHouse user account. -The `WITH ADMIN OPTION` clause grants [ADMIN OPTION](#admin-option-privilege) privilege to `user` or `role`. +The `WITH ADMIN OPTION` clause grants [ADMIN OPTION](#admin-option) privilege to `user` or `role`. The `WITH REPLACE OPTION` clause replace old roles by new role for the `user` or `role`, if is not specified it appends roles. ## Grant Current Grants Syntax @@ -201,7 +201,7 @@ Hierarchy of privileges: - `HDFS` - `S3` - [dictGet](#dictget) -- [displaySecretsInShowAndSelect](#display-secrets) +- [displaySecretsInShowAndSelect](#displaysecretsinshowandselect) - [NAMED COLLECTION ADMIN](#named-collection-admin) - `CREATE NAMED COLLECTION` - `DROP NAMED COLLECTION` @@ -498,7 +498,7 @@ Privilege level: `DICTIONARY`. - `GRANT dictGet ON mydictionary TO john` -### displaySecretsInShowAndSelect {#display-secrets} +### displaySecretsInShowAndSelect Allows a user to view secrets in `SHOW` and `SELECT` queries if both [`display_secrets_in_show_and_select` server setting](../../operations/server-configuration-parameters/settings#display_secrets_in_show_and_select) diff --git a/docs/en/sql-reference/statements/select/sample.md b/docs/en/sql-reference/statements/select/sample.md index 137f86cc8b9..78e05b19bd1 100644 --- a/docs/en/sql-reference/statements/select/sample.md +++ b/docs/en/sql-reference/statements/select/sample.md @@ -27,14 +27,14 @@ The features of data sampling are listed below: For the `SAMPLE` clause the following syntax is supported: -| SAMPLE Clause Syntax | Description | -|----------------------|------------------------------| -| `SAMPLE k` | Here `k` is the number from 0 to 1. The query is executed on `k` fraction of data. For example, `SAMPLE 0.1` runs the query on 10% of data. [Read more](#select-sample-k) | -| `SAMPLE n` | Here `n` is a sufficiently large integer. The query is executed on a sample of at least `n` rows (but not significantly more than this). For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows. [Read more](#select-sample-n) | -| `SAMPLE k OFFSET m` | Here `k` and `m` are the numbers from 0 to 1. The query is executed on a sample of `k` fraction of the data. The data used for the sample is offset by `m` fraction. [Read more](#select-sample-offset) | +| SAMPLE Clause Syntax | Description | +|----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `SAMPLE k` | Here `k` is the number from 0 to 1. The query is executed on `k` fraction of data. For example, `SAMPLE 0.1` runs the query on 10% of data. [Read more](#sample-k) | +| `SAMPLE n` | Here `n` is a sufficiently large integer. The query is executed on a sample of at least `n` rows (but not significantly more than this). For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows. [Read more](#sample-n) | +| `SAMPLE k OFFSET m` | Here `k` and `m` are the numbers from 0 to 1. The query is executed on a sample of `k` fraction of the data. 
The data used for the sample is offset by `m` fraction. [Read more](#sample-k-offset-m) | -## SAMPLE K {#select-sample-k} +## SAMPLE K Here `k` is the number from 0 to 1 (both fractional and decimal notations are supported). For example, `SAMPLE 1/2` or `SAMPLE 0.5`. @@ -54,7 +54,7 @@ ORDER BY PageViews DESC LIMIT 1000 In this example, the query is executed on a sample from 0.1 (10%) of data. Values of aggregate functions are not corrected automatically, so to get an approximate result, the value `count()` is manually multiplied by 10. -## SAMPLE N {#select-sample-n} +## SAMPLE N Here `n` is a sufficiently large integer. For example, `SAMPLE 10000000`. @@ -90,7 +90,7 @@ FROM visits SAMPLE 10000000 ``` -## SAMPLE K OFFSET M {#select-sample-offset} +## SAMPLE K OFFSET M Here `k` and `m` are numbers from 0 to 1. Examples are shown below. diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index 7efbff1b42b..e6d3439d2b9 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -174,7 +174,7 @@ Aborts ClickHouse process (like `kill -9 {$ pid_clickhouse-server}`) ## Managing Distributed Tables -ClickHouse can manage [distributed](../../engines/table-engines/special/distributed.md) tables. When a user inserts data into these tables, ClickHouse first creates a queue of the data that should be sent to cluster nodes, then asynchronously sends it. You can manage queue processing with the [STOP DISTRIBUTED SENDS](#query_language-system-stop-distributed-sends), [FLUSH DISTRIBUTED](#query_language-system-flush-distributed), and [START DISTRIBUTED SENDS](#query_language-system-start-distributed-sends) queries. You can also synchronously insert distributed data with the [distributed_foreground_insert](../../operations/settings/settings.md#distributed_foreground_insert) setting. +ClickHouse can manage [distributed](../../engines/table-engines/special/distributed.md) tables. When a user inserts data into these tables, ClickHouse first creates a queue of the data that should be sent to cluster nodes, then asynchronously sends it. You can manage queue processing with the [STOP DISTRIBUTED SENDS](#stop-distributed-sends), [FLUSH DISTRIBUTED](#flush-distributed), and [START DISTRIBUTED SENDS](#start-distributed-sends) queries. You can also synchronously insert distributed data with the [distributed_foreground_insert](../../operations/settings/settings.md#distributed_foreground_insert) setting. ### STOP DISTRIBUTED SENDS diff --git a/docs/en/sql-reference/syntax.md b/docs/en/sql-reference/syntax.md index fc0286e76ad..6a4afb63db8 100644 --- a/docs/en/sql-reference/syntax.md +++ b/docs/en/sql-reference/syntax.md @@ -54,11 +54,11 @@ Identifiers are: - Cluster, database, table, partition, and column names. - Functions. - Data types. -- [Expression aliases](#expression_aliases). +- [Expression aliases](#expression-aliases). Identifiers can be quoted or non-quoted. The latter is preferred. -Non-quoted identifiers must match the regex `^[a-zA-Z_][0-9a-zA-Z_]*$` and can not be equal to [keywords](#syntax-keywords). Examples: `x`, `_1`, `X_y__Z123_`. +Non-quoted identifiers must match the regex `^[a-zA-Z_][0-9a-zA-Z_]*$` and can not be equal to [keywords](#keywords). Examples: `x`, `_1`, `X_y__Z123_`. If you want to use identifiers the same as keywords or you want to use other symbols in identifiers, quote it using double quotes or backticks, for example, `"id"`, `` `id` ``. 
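The syntax.md hunk above ends with the identifier rules: non-quoted identifiers must match `^[a-zA-Z_][0-9a-zA-Z_]*$` and cannot equal keywords, while anything else must be quoted with double quotes or backticks. A minimal sketch of that rule (illustrative only; the table name `quoting_demo` is hypothetical and not part of the patch):

```sql
-- `from` is a keyword, so it must be backtick- or double-quote-quoted to serve as a column name;
-- x_1 matches the non-quoted identifier regex and needs no quoting.
CREATE TABLE quoting_demo (`from` String, x_1 UInt8) ENGINE = Memory;
SELECT `from`, x_1 FROM quoting_demo;
```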
diff --git a/docs/en/sql-reference/table-functions/azureBlobStorage.md b/docs/en/sql-reference/table-functions/azureBlobStorage.md index 1510489ce83..f59fedeb3a2 100644 --- a/docs/en/sql-reference/table-functions/azureBlobStorage.md +++ b/docs/en/sql-reference/table-functions/azureBlobStorage.md @@ -72,6 +72,7 @@ SELECT count(*) FROM azureBlobStorage('DefaultEndpointsProtocol=https;AccountNam - `_path` — Path to the file. Type: `LowCardinalty(String)`. - `_file` — Name of the file. Type: `LowCardinalty(String)`. - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`. +- `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. **See Also** diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index f66178afbb2..3a3162dad9a 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -18,7 +18,7 @@ file([path_to_archive ::] path [,format] [,structure] [,compression]) **Parameters** -- `path` — The relative path to the file from [user_files_path](/docs/en/operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Supports in read-only mode the following [globs](#globs_in_path): `*`, `?`, `{abc,def}` (with `'abc'` and `'def'` being strings) and `{N..M}` (with `N` and `M` being numbers). +- `path` — The relative path to the file from [user_files_path](/docs/en/operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Supports in read-only mode the following [globs](#globs-in-path): `*`, `?`, `{abc,def}` (with `'abc'` and `'def'` being strings) and `{N..M}` (with `N` and `M` being numbers). - `path_to_archive` - The relative path to a zip/tar/7z archive. Supports the same globs as `path`. - `format` — The [format](/docs/en/interfaces/formats.md#formats) of the file. - `structure` — Structure of the table. Format: `'column1_name column1_type, column2_name column2_type, ...'`. @@ -128,7 +128,7 @@ Reading data from `table.csv`, located in `archive1.zip` or/and `archive2.zip`: SELECT * FROM file('user_files/archives/archive{1..2}.zip :: table.csv'); ``` -## Globs in path {#globs_in_path} +## Globs in path Paths may use globbing. Files must match the whole path pattern, not only the suffix or prefix. @@ -196,6 +196,7 @@ SELECT count(*) FROM file('big_dir/**/file002', 'CSV', 'name String, value UInt3 - `_path` — Path to the file. Type: `LowCardinalty(String)`. - `_file` — Name of the file. Type: `LowCardinalty(String)`. - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`. +- `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. ## Settings {#settings} diff --git a/docs/en/sql-reference/table-functions/fileCluster.md b/docs/en/sql-reference/table-functions/fileCluster.md index 4677d2883a7..62b00fadd62 100644 --- a/docs/en/sql-reference/table-functions/fileCluster.md +++ b/docs/en/sql-reference/table-functions/fileCluster.md @@ -22,7 +22,7 @@ fileCluster(cluster_name, path[, format, structure, compression_method]) **Arguments** - `cluster_name` — Name of a cluster that is used to build a set of addresses and connection parameters to remote and local servers. 
-- `path` — The relative path to the file from [user_files_path](/docs/en/operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Path to file also supports [globs](#globs_in_path). +- `path` — The relative path to the file from [user_files_path](/docs/en/operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Path to file also supports [globs](#globs-in-path). - `format` — [Format](../../interfaces/formats.md#formats) of the files. Type: [String](../../sql-reference/data-types/string.md). - `structure` — Table structure in `'UserID UInt64, Name String'` format. Determines column names and types. Type: [String](../../sql-reference/data-types/string.md). - `compression_method` — Compression method. Supported compression types are `gz`, `br`, `xz`, `zst`, `lz4`, and `bz2`. @@ -74,7 +74,7 @@ SELECT * FROM fileCluster('my_cluster', 'file{1,2}.csv', 'CSV', 'i UInt32, s Str ``` -## Globs in Path {#globs_in_path} +## Globs in Path All patterns supported by [File](../../sql-reference/table-functions/file.md#globs-in-path) table function are supported by FileCluster. diff --git a/docs/en/sql-reference/table-functions/hdfs.md b/docs/en/sql-reference/table-functions/hdfs.md index d65615e7588..28cba5ccc6a 100644 --- a/docs/en/sql-reference/table-functions/hdfs.md +++ b/docs/en/sql-reference/table-functions/hdfs.md @@ -97,6 +97,7 @@ FROM hdfs('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name Strin - `_path` — Path to the file. Type: `LowCardinalty(String)`. - `_file` — Name of the file. Type: `LowCardinalty(String)`. - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the size is unknown, the value is `NULL`. +- `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. ## Storage Settings {#storage-settings} diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index cbef80371a3..1a7e2b8d66a 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -272,6 +272,7 @@ FROM s3( - `_path` — Path to the file. Type: `LowCardinalty(String)`. - `_file` — Name of the file. Type: `LowCardinalty(String)`. - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`. +- `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. ## Storage Settings {#storage-settings} diff --git a/docs/en/sql-reference/table-functions/url.md b/docs/en/sql-reference/table-functions/url.md index 4dc6e435b50..3bb7aff53a7 100644 --- a/docs/en/sql-reference/table-functions/url.md +++ b/docs/en/sql-reference/table-functions/url.md @@ -53,6 +53,7 @@ Character `|` inside patterns is used to specify failover addresses. They are it - `_path` — Path to the `URL`. Type: `LowCardinalty(String)`. - `_file` — Resource name of the `URL`. Type: `LowCardinalty(String)`. - `_size` — Size of the resource in bytes. Type: `Nullable(UInt64)`. If the size is unknown, the value is `NULL`. +- `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. 
## Storage Settings {#storage-settings} diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 8fcb9d87a93..00994b39a40 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -70,7 +71,6 @@ #include #include #include -#include #include #include #include @@ -773,7 +773,27 @@ try LOG_INFO(log, "Available CPU instruction sets: {}", cpu_info); #endif - bool will_have_trace_collector = hasPHDRCache() && config().has("trace_log"); + bool has_trace_collector = false; + /// Disable it if we collect test coverage information, because it will work extremely slow. +#if !WITH_COVERAGE + /// Profilers cannot work reliably with any other libunwind or without PHDR cache. + has_trace_collector = hasPHDRCache() && config().has("trace_log"); +#endif + + /// Describe multiple reasons when query profiler cannot work. + +#if WITH_COVERAGE + LOG_INFO(log, "Query Profiler and TraceCollector are disabled because they work extremely slow with test coverage."); +#endif + +#if defined(SANITIZER) + LOG_INFO(log, "Query Profiler disabled because they cannot work under sanitizers" + " when two different stack unwinding methods will interfere with each other."); +#endif + + if (!hasPHDRCache()) + LOG_INFO(log, "Query Profiler and TraceCollector are disabled because they require PHDR cache to be created" + " (otherwise the function 'dl_iterate_phdr' is not lock free and not async-signal safe)."); // Initialize global thread pool. Do it before we fetch configs from zookeeper // nodes (`from_zk`), because ZooKeeper interface uses the pool. We will @@ -782,8 +802,27 @@ try server_settings.max_thread_pool_size, server_settings.max_thread_pool_free_size, server_settings.thread_pool_queue_size, - will_have_trace_collector ? server_settings.global_profiler_real_time_period_ns : 0, - will_have_trace_collector ? server_settings.global_profiler_cpu_time_period_ns : 0); + has_trace_collector ? server_settings.global_profiler_real_time_period_ns : 0, + has_trace_collector ? server_settings.global_profiler_cpu_time_period_ns : 0); + + if (has_trace_collector) + { + global_context->createTraceCollector(); + + /// Set up server-wide memory profiler (for total memory tracker). + if (server_settings.total_memory_profiler_step) + total_memory_tracker.setProfilerStep(server_settings.total_memory_profiler_step); + + if (server_settings.total_memory_tracker_sample_probability > 0.0) + total_memory_tracker.setSampleProbability(server_settings.total_memory_tracker_sample_probability); + + if (server_settings.total_memory_profiler_sample_min_allocation_size) + total_memory_tracker.setSampleMinAllocationSize(server_settings.total_memory_profiler_sample_min_allocation_size); + + if (server_settings.total_memory_profiler_sample_max_allocation_size) + total_memory_tracker.setSampleMaxAllocationSize(server_settings.total_memory_profiler_sample_max_allocation_size); + } + /// Wait for all threads to avoid possible use-after-free (for example logging objects can be already destroyed). SCOPE_EXIT({ Stopwatch watch; @@ -1339,7 +1378,7 @@ try CompiledExpressionCacheFactory::instance().init(compiled_expression_cache_max_size_in_bytes, compiled_expression_cache_max_elements); #endif - NamedCollectionUtils::loadIfNot(); + NamedCollectionFactory::instance().loadIfNot(); /// Initialize main config reloader. 
std::string include_from_path = config().getString("include_from", "/etc/metrika.xml"); @@ -1608,7 +1647,7 @@ try #if USE_SSL CertificateReloader::instance().tryLoad(*config); #endif - NamedCollectionUtils::reloadFromConfig(*config); + NamedCollectionFactory::instance().reloadFromConfig(*config); FileCacheFactory::instance().updateSettingsFromConfig(*config); @@ -1950,52 +1989,9 @@ try LOG_DEBUG(log, "Loaded metadata."); - /// Init trace collector only after trace_log system table was created - /// Disable it if we collect test coverage information, because it will work extremely slow. -#if !WITH_COVERAGE - /// Profilers cannot work reliably with any other libunwind or without PHDR cache. - if (hasPHDRCache()) - { + if (has_trace_collector) global_context->initializeTraceCollector(); - /// Set up server-wide memory profiler (for total memory tracker). - if (server_settings.total_memory_profiler_step) - { - total_memory_tracker.setProfilerStep(server_settings.total_memory_profiler_step); - } - - if (server_settings.total_memory_tracker_sample_probability > 0.0) - { - total_memory_tracker.setSampleProbability(server_settings.total_memory_tracker_sample_probability); - } - - if (server_settings.total_memory_profiler_sample_min_allocation_size) - { - total_memory_tracker.setSampleMinAllocationSize(server_settings.total_memory_profiler_sample_min_allocation_size); - } - - if (server_settings.total_memory_profiler_sample_max_allocation_size) - { - total_memory_tracker.setSampleMaxAllocationSize(server_settings.total_memory_profiler_sample_max_allocation_size); - } - } -#endif - - /// Describe multiple reasons when query profiler cannot work. - -#if WITH_COVERAGE - LOG_INFO(log, "Query Profiler and TraceCollector are disabled because they work extremely slow with test coverage."); -#endif - -#if defined(SANITIZER) - LOG_INFO(log, "Query Profiler disabled because they cannot work under sanitizers" - " when two different stack unwinding methods will interfere with each other."); -#endif - - if (!hasPHDRCache()) - LOG_INFO(log, "Query Profiler and TraceCollector are disabled because they require PHDR cache to be created" - " (otherwise the function 'dl_iterate_phdr' is not lock free and not async-signal safe)."); - #if defined(OS_LINUX) auto tasks_stats_provider = TasksStatsCounters::findBestAvailableProvider(); if (tasks_stats_provider == TasksStatsCounters::MetricsProvider::None) diff --git a/src/Analyzer/FunctionNode.cpp b/src/Analyzer/FunctionNode.cpp index f13842cf67c..e98b04fe9a9 100644 --- a/src/Analyzer/FunctionNode.cpp +++ b/src/Analyzer/FunctionNode.cpp @@ -1,5 +1,7 @@ #include +#include + #include #include @@ -58,12 +60,20 @@ ColumnsWithTypeAndName FunctionNode::getArgumentColumns() const ColumnWithTypeAndName argument_column; + auto * constant = argument->as(); if (isNameOfInFunction(function_name) && i == 1) + { argument_column.type = std::make_shared(); + if (constant) + { + /// Created but not filled for the analysis during function resolution. 
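+            /// Only the argument's type matters at this point: resolution just needs to see a
+            /// constant Set column; the actual set is presumably filled in later, once it is built.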
+ FutureSetPtr empty_set; + argument_column.column = ColumnConst::create(ColumnSet::create(1, empty_set), 1); + } + } else argument_column.type = argument->getResultType(); - auto * constant = argument->as(); if (constant && !isNotCreatable(argument_column.type)) argument_column.column = argument_column.type->createColumnConst(1, constant->getValue()); diff --git a/src/Analyzer/InterpolateNode.cpp b/src/Analyzer/InterpolateNode.cpp index e4f7e22b803..97dc79f565b 100644 --- a/src/Analyzer/InterpolateNode.cpp +++ b/src/Analyzer/InterpolateNode.cpp @@ -10,9 +10,12 @@ namespace DB { -InterpolateNode::InterpolateNode(QueryTreeNodePtr expression_, QueryTreeNodePtr interpolate_expression_) +InterpolateNode::InterpolateNode(std::shared_ptr expression_, QueryTreeNodePtr interpolate_expression_) : IQueryTreeNode(children_size) { + if (expression_) + expression_name = expression_->getIdentifier().getFullName(); + children[expression_child_index] = std::move(expression_); children[interpolate_expression_child_index] = std::move(interpolate_expression_); } @@ -41,13 +44,23 @@ void InterpolateNode::updateTreeHashImpl(HashState &, CompareOptions) const QueryTreeNodePtr InterpolateNode::cloneImpl() const { - return std::make_shared(nullptr /*expression*/, nullptr /*interpolate_expression*/); + auto cloned = std::make_shared(nullptr /*expression*/, nullptr /*interpolate_expression*/); + cloned->expression_name = expression_name; + return cloned; } ASTPtr InterpolateNode::toASTImpl(const ConvertToASTOptions & options) const { auto result = std::make_shared(); - result->column = getExpression()->toAST(options)->getColumnName(); + + /// Interpolate parser supports only identifier node. + /// In case of alias, identifier is replaced to expression, which can't be parsed. + /// In this case, keep original alias name. + if (const auto * identifier = getExpression()->as()) + result->column = identifier->toAST(options)->getColumnName(); + else + result->column = expression_name; + result->children.push_back(getInterpolateExpression()->toAST(options)); result->expr = result->children.back(); diff --git a/src/Analyzer/InterpolateNode.h b/src/Analyzer/InterpolateNode.h index 9269d3924f5..ec493ed8bdd 100644 --- a/src/Analyzer/InterpolateNode.h +++ b/src/Analyzer/InterpolateNode.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include namespace DB @@ -19,7 +19,7 @@ class InterpolateNode final : public IQueryTreeNode { public: /// Initialize interpolate node with expression and interpolate expression - explicit InterpolateNode(QueryTreeNodePtr expression_, QueryTreeNodePtr interpolate_expression_); + explicit InterpolateNode(std::shared_ptr expression_, QueryTreeNodePtr interpolate_expression_); /// Get expression to interpolate const QueryTreeNodePtr & getExpression() const @@ -61,6 +61,9 @@ protected: ASTPtr toASTImpl(const ConvertToASTOptions & options) const override; + /// Initial name from column identifier. 
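+    /// Saved at construction time so that toASTImpl() can still produce the original column name
+    /// after the identifier child has been replaced with a resolved, non-identifier expression.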
+ std::string expression_name; + private: static constexpr size_t expression_child_index = 0; static constexpr size_t interpolate_expression_child_index = 1; diff --git a/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp b/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp index 9153bc4eca2..e6798a792dd 100644 --- a/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp +++ b/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp @@ -51,7 +51,7 @@ public: using Base = InDepthQueryTreeVisitorWithContext; using Base::Base; - void leaveImpl(QueryTreeNodePtr & node) + void enterImpl(QueryTreeNodePtr & node) { if (!getSettings().optimize_arithmetic_operations_in_aggregate_functions) return; diff --git a/src/Analyzer/Passes/LogicalExpressionOptimizerPass.cpp b/src/Analyzer/Passes/LogicalExpressionOptimizerPass.cpp index 11811ae4f2d..ac221bd66e7 100644 --- a/src/Analyzer/Passes/LogicalExpressionOptimizerPass.cpp +++ b/src/Analyzer/Passes/LogicalExpressionOptimizerPass.cpp @@ -551,14 +551,25 @@ private: in_function->getArguments().getNodes() = std::move(in_arguments); in_function->resolveAsFunction(in_function_resolver); + + DataTypePtr result_type = in_function->getResultType(); + const auto * type_low_cardinality = typeid_cast(result_type.get()); + if (type_low_cardinality) + result_type = type_low_cardinality->getDictionaryType(); /** For `k :: UInt8`, expression `k = 1 OR k = NULL` with result type Nullable(UInt8) * is replaced with `k IN (1, NULL)` with result type UInt8. * Convert it back to Nullable(UInt8). + * And for `k :: LowCardinality(UInt8)`, the transformation of `k IN (1, NULL)` results in type LowCardinality(UInt8). + * Convert it to LowCardinality(Nullable(UInt8)). */ - if (is_any_nullable && !in_function->getResultType()->isNullable()) + if (is_any_nullable && !result_type->isNullable()) { - auto nullable_result_type = std::make_shared(in_function->getResultType()); - auto in_function_nullable = createCastFunction(std::move(in_function), std::move(nullable_result_type), getContext()); + DataTypePtr new_result_type = std::make_shared(result_type); + if (type_low_cardinality) + { + new_result_type = std::make_shared(new_result_type); + } + auto in_function_nullable = createCastFunction(std::move(in_function), std::move(new_result_type), getContext()); or_operands.push_back(std::move(in_function_nullable)); } else diff --git a/src/Analyzer/Resolve/ScopeAliases.h b/src/Analyzer/Resolve/ScopeAliases.h index baab843988b..830ae72144b 100644 --- a/src/Analyzer/Resolve/ScopeAliases.h +++ b/src/Analyzer/Resolve/ScopeAliases.h @@ -75,7 +75,12 @@ struct ScopeAliases if (jt == transitive_aliases.end()) return {}; - key = &(getKey(jt->second, find_option)); + const auto & new_key = getKey(jt->second, find_option); + /// Ignore potential cyclic aliases. 
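+        /// If the transitive alias resolves back to the same key, report the alias as not found
+        /// instead of looping forever over the same entry.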
+ if (new_key == *key) + return {}; + + key = &new_key; it = alias_map.find(*key); } diff --git a/src/Columns/MaskOperations.cpp b/src/Columns/MaskOperations.cpp index 1f5f94beee9..873a4060872 100644 --- a/src/Columns/MaskOperations.cpp +++ b/src/Columns/MaskOperations.cpp @@ -289,10 +289,14 @@ void executeColumnIfNeeded(ColumnWithTypeAndName & column, bool empty) if (!column_function) return; + size_t original_size = column.column->size(); + if (!empty) column = column_function->reduce(); else - column.column = column_function->getResultType()->createColumn(); + column.column = column_function->getResultType()->createColumnConstWithDefaultValue(original_size)->convertToFullColumnIfConst(); + + chassert(column.column->size() == original_size); } int checkShortCircuitArguments(const ColumnsWithTypeAndName & arguments) diff --git a/src/Common/AsynchronousMetrics.h b/src/Common/AsynchronousMetrics.h index b62529a08e7..10a972d2458 100644 --- a/src/Common/AsynchronousMetrics.h +++ b/src/Common/AsynchronousMetrics.h @@ -45,14 +45,17 @@ struct ProtocolServerMetrics }; /** Periodically (by default, each second) - * calculates and updates some metrics, - * that are not updated automatically (so, need to be asynchronously calculated). + * calculates and updates some metrics, + * that are not updated automatically (so, need to be asynchronously calculated). * - * This includes both ClickHouse-related metrics (like memory usage of ClickHouse process) - * and common OS-related metrics (like total memory usage on the server). + * This includes both general process metrics (like memory usage) + * and common OS-related metrics (like total memory usage on the server). * * All the values are either gauge type (like the total number of tables, the current memory usage). * Or delta-counters representing some accumulation during the interval of time. + * + * Server and Keeper specific metrics are contained inside + * ServerAsynchronousMetrics and KeeperAsynchronousMetrics respectively. 
*/ class AsynchronousMetrics { diff --git a/src/Common/NamedCollections/NamedCollectionUtils.cpp b/src/Common/NamedCollections/NamedCollectionUtils.cpp deleted file mode 100644 index 5dbdeb10795..00000000000 --- a/src/Common/NamedCollections/NamedCollectionUtils.cpp +++ /dev/null @@ -1,484 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace fs = std::filesystem; - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NAMED_COLLECTION_ALREADY_EXISTS; - extern const int NAMED_COLLECTION_DOESNT_EXIST; - extern const int BAD_ARGUMENTS; -} - -namespace NamedCollectionUtils -{ - -static std::atomic is_loaded_from_config = false; -static std::atomic is_loaded_from_sql = false; - -class LoadFromConfig -{ -private: - const Poco::Util::AbstractConfiguration & config; - -public: - explicit LoadFromConfig(const Poco::Util::AbstractConfiguration & config_) - : config(config_) {} - - std::vector listCollections() const - { - Poco::Util::AbstractConfiguration::Keys collections_names; - config.keys(NAMED_COLLECTIONS_CONFIG_PREFIX, collections_names); - return collections_names; - } - - NamedCollectionsMap getAll() const - { - NamedCollectionsMap result; - for (const auto & collection_name : listCollections()) - { - if (result.contains(collection_name)) - { - throw Exception( - ErrorCodes::NAMED_COLLECTION_ALREADY_EXISTS, - "Found duplicate named collection `{}`", - collection_name); - } - result.emplace(collection_name, get(collection_name)); - } - return result; - } - - MutableNamedCollectionPtr get(const std::string & collection_name) const - { - const auto collection_prefix = getCollectionPrefix(collection_name); - std::queue enumerate_input; - std::set> enumerate_result; - - enumerate_input.push(collection_prefix); - NamedCollectionConfiguration::listKeys(config, std::move(enumerate_input), enumerate_result, -1); - - /// Collection does not have any keys. - /// (`enumerate_result` == ). - const bool collection_is_empty = enumerate_result.size() == 1 - && *enumerate_result.begin() == collection_prefix; - std::set> keys; - if (!collection_is_empty) - { - /// Skip collection prefix and add +1 to avoid '.' in the beginning. 
- for (const auto & path : enumerate_result) - keys.emplace(path.substr(collection_prefix.size() + 1)); - } - - return NamedCollection::create( - config, collection_name, collection_prefix, keys, SourceId::CONFIG, /* is_mutable */false); - } - -private: - static constexpr auto NAMED_COLLECTIONS_CONFIG_PREFIX = "named_collections"; - - static std::string getCollectionPrefix(const std::string & collection_name) - { - return fmt::format("{}.{}", NAMED_COLLECTIONS_CONFIG_PREFIX, collection_name); - } -}; - - -class LoadFromSQL : private WithContext -{ -private: - const std::string metadata_path; - -public: - explicit LoadFromSQL(ContextPtr context_) - : WithContext(context_) - , metadata_path(fs::weakly_canonical(context_->getPath()) / NAMED_COLLECTIONS_METADATA_DIRECTORY) - { - if (fs::exists(metadata_path)) - cleanup(); - } - - std::vector listCollections() const - { - if (!fs::exists(metadata_path)) - return {}; - - std::vector collection_names; - fs::directory_iterator it{metadata_path}; - for (; it != fs::directory_iterator{}; ++it) - { - const auto & current_path = it->path(); - if (current_path.extension() == ".sql") - { - collection_names.push_back(it->path().stem()); - } - else - { - LOG_WARNING( - getLogger("NamedCollectionsLoadFromSQL"), - "Unexpected file {} in named collections directory", - current_path.filename().string()); - } - } - return collection_names; - } - - NamedCollectionsMap getAll() const - { - NamedCollectionsMap result; - for (const auto & collection_name : listCollections()) - { - if (result.contains(collection_name)) - { - throw Exception( - ErrorCodes::NAMED_COLLECTION_ALREADY_EXISTS, - "Found duplicate named collection `{}`", - collection_name); - } - result.emplace(collection_name, get(collection_name)); - } - return result; - } - - MutableNamedCollectionPtr get(const std::string & collection_name) const - { - const auto query = readCreateQueryFromMetadata( - getMetadataPath(collection_name), - getContext()->getSettingsRef()); - return createNamedCollectionFromAST(query); - } - - MutableNamedCollectionPtr create(const ASTCreateNamedCollectionQuery & query) - { - writeCreateQueryToMetadata( - query, - getMetadataPath(query.collection_name), - getContext()->getSettingsRef()); - - return createNamedCollectionFromAST(query); - } - - void update(const ASTAlterNamedCollectionQuery & query) - { - const auto path = getMetadataPath(query.collection_name); - auto create_query = readCreateQueryFromMetadata(path, getContext()->getSettings()); - - std::unordered_map result_changes_map; - for (const auto & [name, value] : query.changes) - { - auto [it, inserted] = result_changes_map.emplace(name, value); - if (!inserted) - { - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Value with key `{}` is used twice in the SET query (collection name: {})", - name, query.collection_name); - } - } - - for (const auto & [name, value] : create_query.changes) - result_changes_map.emplace(name, value); - - std::unordered_map result_overridability_map; - for (const auto & [name, value] : query.overridability) - result_overridability_map.emplace(name, value); - for (const auto & [name, value] : create_query.overridability) - result_overridability_map.emplace(name, value); - - for (const auto & delete_key : query.delete_keys) - { - auto it = result_changes_map.find(delete_key); - if (it == result_changes_map.end()) - { - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Cannot delete key `{}` because it does not exist in collection", - delete_key); - } - else - { - 
result_changes_map.erase(it); - auto it_override = result_overridability_map.find(delete_key); - if (it_override != result_overridability_map.end()) - result_overridability_map.erase(it_override); - } - } - - create_query.changes.clear(); - for (const auto & [name, value] : result_changes_map) - create_query.changes.emplace_back(name, value); - create_query.overridability = std::move(result_overridability_map); - - if (create_query.changes.empty()) - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Named collection cannot be empty (collection name: {})", - query.collection_name); - - writeCreateQueryToMetadata( - create_query, - getMetadataPath(query.collection_name), - getContext()->getSettingsRef(), - true); - } - - void remove(const std::string & collection_name) - { - auto collection_path = getMetadataPath(collection_name); - if (!fs::exists(collection_path)) - { - throw Exception( - ErrorCodes::NAMED_COLLECTION_DOESNT_EXIST, - "Cannot remove collection `{}`, because it doesn't exist", - collection_name); - } - (void)fs::remove(collection_path); - } - -private: - static constexpr auto NAMED_COLLECTIONS_METADATA_DIRECTORY = "named_collections"; - - static MutableNamedCollectionPtr createNamedCollectionFromAST( - const ASTCreateNamedCollectionQuery & query) - { - const auto & collection_name = query.collection_name; - const auto config = NamedCollectionConfiguration::createConfiguration(collection_name, query.changes, query.overridability); - - std::set> keys; - for (const auto & [name, _] : query.changes) - keys.insert(name); - - return NamedCollection::create( - *config, collection_name, "", keys, SourceId::SQL, /* is_mutable */true); - } - - std::string getMetadataPath(const std::string & collection_name) const - { - return fs::path(metadata_path) / (escapeForFileName(collection_name) + ".sql"); - } - - /// Delete .tmp files. They could be left undeleted in case of - /// some exception or abrupt server restart. 
- void cleanup() - { - fs::directory_iterator it{metadata_path}; - std::vector files_to_remove; - for (; it != fs::directory_iterator{}; ++it) - { - const auto & current_path = it->path(); - if (current_path.extension() == ".tmp") - files_to_remove.push_back(current_path); - } - for (const auto & file : files_to_remove) - (void)fs::remove(file); - } - - static ASTCreateNamedCollectionQuery readCreateQueryFromMetadata( - const std::string & path, - const Settings & settings) - { - ReadBufferFromFile in(path); - std::string query; - readStringUntilEOF(query, in); - - ParserCreateNamedCollectionQuery parser; - auto ast = parseQuery(parser, query, "in file " + path, 0, settings.max_parser_depth, settings.max_parser_backtracks); - const auto & create_query = ast->as(); - return create_query; - } - - void writeCreateQueryToMetadata( - const ASTCreateNamedCollectionQuery & query, - const std::string & path, - const Settings & settings, - bool replace = false) const - { - if (!replace && fs::exists(path)) - { - throw Exception( - ErrorCodes::NAMED_COLLECTION_ALREADY_EXISTS, - "Metadata file {} for named collection already exists", - path); - } - - fs::create_directories(metadata_path); - - auto tmp_path = path + ".tmp"; - String formatted_query = serializeAST(query); - WriteBufferFromFile out(tmp_path, formatted_query.size(), O_WRONLY | O_CREAT | O_EXCL); - writeString(formatted_query, out); - - out.next(); - if (settings.fsync_metadata) - out.sync(); - out.close(); - - fs::rename(tmp_path, path); - } -}; - -std::unique_lock lockNamedCollectionsTransaction() -{ - static std::mutex transaction_lock; - return std::unique_lock(transaction_lock); -} - -void loadFromConfigUnlocked(const Poco::Util::AbstractConfiguration & config, std::unique_lock &) -{ - auto named_collections = LoadFromConfig(config).getAll(); - LOG_TRACE( - getLogger("NamedCollectionsUtils"), - "Loaded {} collections from config", named_collections.size()); - - NamedCollectionFactory::instance().add(std::move(named_collections)); - is_loaded_from_config = true; -} - -void loadFromConfig(const Poco::Util::AbstractConfiguration & config) -{ - auto lock = lockNamedCollectionsTransaction(); - loadFromConfigUnlocked(config, lock); -} - -void reloadFromConfig(const Poco::Util::AbstractConfiguration & config) -{ - auto lock = lockNamedCollectionsTransaction(); - auto collections = LoadFromConfig(config).getAll(); - auto & instance = NamedCollectionFactory::instance(); - instance.removeById(SourceId::CONFIG); - instance.add(collections); - is_loaded_from_config = true; -} - -void loadFromSQLUnlocked(ContextPtr context, std::unique_lock &) -{ - auto named_collections = LoadFromSQL(context).getAll(); - LOG_TRACE( - getLogger("NamedCollectionsUtils"), - "Loaded {} collections from SQL", named_collections.size()); - - NamedCollectionFactory::instance().add(std::move(named_collections)); - is_loaded_from_sql = true; -} - -void loadFromSQL(ContextPtr context) -{ - auto lock = lockNamedCollectionsTransaction(); - loadFromSQLUnlocked(context, lock); -} - -void loadIfNotUnlocked(std::unique_lock & lock) -{ - auto global_context = Context::getGlobalContextInstance(); - if (!is_loaded_from_config) - loadFromConfigUnlocked(global_context->getConfigRef(), lock); - if (!is_loaded_from_sql) - loadFromSQLUnlocked(global_context, lock); -} - -void loadIfNot() -{ - if (is_loaded_from_sql && is_loaded_from_config) - return; - auto lock = lockNamedCollectionsTransaction(); - loadIfNotUnlocked(lock); -} - -void removeFromSQL(const ASTDropNamedCollectionQuery & 
query, ContextPtr context) -{ - auto lock = lockNamedCollectionsTransaction(); - loadIfNotUnlocked(lock); - auto & instance = NamedCollectionFactory::instance(); - if (!instance.exists(query.collection_name)) - { - if (!query.if_exists) - { - throw Exception( - ErrorCodes::NAMED_COLLECTION_DOESNT_EXIST, - "Cannot remove collection `{}`, because it doesn't exist", - query.collection_name); - } - return; - } - LoadFromSQL(context).remove(query.collection_name); - instance.remove(query.collection_name); -} - -void createFromSQL(const ASTCreateNamedCollectionQuery & query, ContextPtr context) -{ - auto lock = lockNamedCollectionsTransaction(); - loadIfNotUnlocked(lock); - auto & instance = NamedCollectionFactory::instance(); - if (instance.exists(query.collection_name)) - { - if (!query.if_not_exists) - { - throw Exception( - ErrorCodes::NAMED_COLLECTION_ALREADY_EXISTS, - "A named collection `{}` already exists", - query.collection_name); - } - return; - } - instance.add(query.collection_name, LoadFromSQL(context).create(query)); -} - -void updateFromSQL(const ASTAlterNamedCollectionQuery & query, ContextPtr context) -{ - auto lock = lockNamedCollectionsTransaction(); - loadIfNotUnlocked(lock); - auto & instance = NamedCollectionFactory::instance(); - if (!instance.exists(query.collection_name)) - { - if (!query.if_exists) - { - throw Exception( - ErrorCodes::NAMED_COLLECTION_DOESNT_EXIST, - "Cannot remove collection `{}`, because it doesn't exist", - query.collection_name); - } - return; - } - LoadFromSQL(context).update(query); - - auto collection = instance.getMutable(query.collection_name); - auto collection_lock = collection->lock(); - - for (const auto & [name, value] : query.changes) - { - auto it_override = query.overridability.find(name); - if (it_override != query.overridability.end()) - collection->setOrUpdate(name, convertFieldToString(value), it_override->second); - else - collection->setOrUpdate(name, convertFieldToString(value), {}); - } - - for (const auto & key : query.delete_keys) - collection->remove(key); -} - -} - -} diff --git a/src/Common/NamedCollections/NamedCollectionUtils.h b/src/Common/NamedCollections/NamedCollectionUtils.h deleted file mode 100644 index 293b3ea659d..00000000000 --- a/src/Common/NamedCollections/NamedCollectionUtils.h +++ /dev/null @@ -1,42 +0,0 @@ -#pragma once -#include - -namespace Poco { namespace Util { class AbstractConfiguration; } } - -namespace DB -{ - -class ASTCreateNamedCollectionQuery; -class ASTAlterNamedCollectionQuery; -class ASTDropNamedCollectionQuery; - -namespace NamedCollectionUtils -{ - -enum class SourceId : uint8_t -{ - NONE = 0, - CONFIG = 1, - SQL = 2, -}; - -void loadFromConfig(const Poco::Util::AbstractConfiguration & config); -void reloadFromConfig(const Poco::Util::AbstractConfiguration & config); - -/// Load named collections from `context->getPath() / named_collections /`. -void loadFromSQL(ContextPtr context); - -/// Remove collection as well as its metadata from `context->getPath() / named_collections /`. -void removeFromSQL(const ASTDropNamedCollectionQuery & query, ContextPtr context); - -/// Create a new collection from AST and put it to `context->getPath() / named_collections /`. -void createFromSQL(const ASTCreateNamedCollectionQuery & query, ContextPtr context); - -/// Update definition of already existing collection from AST and update result in `context->getPath() / named_collections /`. 
-void updateFromSQL(const ASTAlterNamedCollectionQuery & query, ContextPtr context); - -void loadIfNot(); - -} - -} diff --git a/src/Common/NamedCollections/NamedCollections.cpp b/src/Common/NamedCollections/NamedCollections.cpp index 04d2099f4df..74ce405f71d 100644 --- a/src/Common/NamedCollections/NamedCollections.cpp +++ b/src/Common/NamedCollections/NamedCollections.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include @@ -297,7 +296,7 @@ MutableNamedCollectionPtr NamedCollection::duplicate() const auto impl = pimpl->createCopy(collection_name); return std::unique_ptr( new NamedCollection( - std::move(impl), collection_name, NamedCollectionUtils::SourceId::NONE, true)); + std::move(impl), collection_name, SourceId::NONE, true)); } NamedCollection::Keys NamedCollection::getKeys(ssize_t depth, const std::string & prefix) const diff --git a/src/Common/NamedCollections/NamedCollections.h b/src/Common/NamedCollections/NamedCollections.h index c253c56594f..23862c4515a 100644 --- a/src/Common/NamedCollections/NamedCollections.h +++ b/src/Common/NamedCollections/NamedCollections.h @@ -1,7 +1,6 @@ #pragma once #include #include -#include namespace Poco { namespace Util { class AbstractConfiguration; } } @@ -23,7 +22,12 @@ class NamedCollection public: using Key = std::string; using Keys = std::set>; - using SourceId = NamedCollectionUtils::SourceId; + enum class SourceId : uint8_t + { + NONE = 0, + CONFIG = 1, + SQL = 2, + }; static MutableNamedCollectionPtr create( const Poco::Util::AbstractConfiguration & config, diff --git a/src/Common/NamedCollections/NamedCollectionsFactory.cpp b/src/Common/NamedCollections/NamedCollectionsFactory.cpp index dd69952429f..14105a8651d 100644 --- a/src/Common/NamedCollections/NamedCollectionsFactory.cpp +++ b/src/Common/NamedCollections/NamedCollectionsFactory.cpp @@ -1,5 +1,7 @@ #include -#include +#include +#include +#include namespace DB { @@ -17,23 +19,29 @@ NamedCollectionFactory & NamedCollectionFactory::instance() return instance; } +NamedCollectionFactory::~NamedCollectionFactory() +{ + shutdown(); +} + +void NamedCollectionFactory::shutdown() +{ + shutdown_called = true; + if (update_task) + update_task->deactivate(); + metadata_storage.reset(); +} + bool NamedCollectionFactory::exists(const std::string & collection_name) const { std::lock_guard lock(mutex); - return existsUnlocked(collection_name, lock); -} - -bool NamedCollectionFactory::existsUnlocked( - const std::string & collection_name, - std::lock_guard & /* lock */) const -{ - return loaded_named_collections.contains(collection_name); + return exists(collection_name, lock); } NamedCollectionPtr NamedCollectionFactory::get(const std::string & collection_name) const { std::lock_guard lock(mutex); - auto collection = tryGetUnlocked(collection_name, lock); + auto collection = tryGet(collection_name, lock); if (!collection) { throw Exception( @@ -47,14 +55,35 @@ NamedCollectionPtr NamedCollectionFactory::get(const std::string & collection_na NamedCollectionPtr NamedCollectionFactory::tryGet(const std::string & collection_name) const { std::lock_guard lock(mutex); - return tryGetUnlocked(collection_name, lock); + return tryGet(collection_name, lock); +} + +NamedCollectionsMap NamedCollectionFactory::getAll() const +{ + std::lock_guard lock(mutex); + return loaded_named_collections; +} + +bool NamedCollectionFactory::exists(const std::string & collection_name, std::lock_guard &) const +{ + return loaded_named_collections.contains(collection_name); +} + +MutableNamedCollectionPtr 
NamedCollectionFactory::tryGet( + const std::string & collection_name, + std::lock_guard &) const +{ + auto it = loaded_named_collections.find(collection_name); + if (it == loaded_named_collections.end()) + return nullptr; + return it->second; } MutableNamedCollectionPtr NamedCollectionFactory::getMutable( - const std::string & collection_name) const + const std::string & collection_name, + std::lock_guard & lock) const { - std::lock_guard lock(mutex); - auto collection = tryGetUnlocked(collection_name, lock); + auto collection = tryGet(collection_name, lock); if (!collection) { throw Exception( @@ -73,35 +102,10 @@ MutableNamedCollectionPtr NamedCollectionFactory::getMutable( return collection; } -MutableNamedCollectionPtr NamedCollectionFactory::tryGetUnlocked( - const std::string & collection_name, - std::lock_guard & /* lock */) const -{ - auto it = loaded_named_collections.find(collection_name); - if (it == loaded_named_collections.end()) - return nullptr; - return it->second; -} - void NamedCollectionFactory::add( - const std::string & collection_name, - MutableNamedCollectionPtr collection) -{ - std::lock_guard lock(mutex); - addUnlocked(collection_name, collection, lock); -} - -void NamedCollectionFactory::add(NamedCollectionsMap collections) -{ - std::lock_guard lock(mutex); - for (const auto & [collection_name, collection] : collections) - addUnlocked(collection_name, collection, lock); -} - -void NamedCollectionFactory::addUnlocked( const std::string & collection_name, MutableNamedCollectionPtr collection, - std::lock_guard & /* lock */) + std::lock_guard &) { auto [it, inserted] = loaded_named_collections.emplace(collection_name, collection); if (!inserted) @@ -113,10 +117,15 @@ void NamedCollectionFactory::addUnlocked( } } -void NamedCollectionFactory::remove(const std::string & collection_name) +void NamedCollectionFactory::add(NamedCollectionsMap collections, std::lock_guard & lock) { - std::lock_guard lock(mutex); - bool removed = removeIfExistsUnlocked(collection_name, lock); + for (const auto & [collection_name, collection] : collections) + add(collection_name, collection, lock); +} + +void NamedCollectionFactory::remove(const std::string & collection_name, std::lock_guard & lock) +{ + bool removed = removeIfExists(collection_name, lock); if (!removed) { throw Exception( @@ -126,17 +135,11 @@ void NamedCollectionFactory::remove(const std::string & collection_name) } } -void NamedCollectionFactory::removeIfExists(const std::string & collection_name) -{ - std::lock_guard lock(mutex); - removeIfExistsUnlocked(collection_name, lock); // NOLINT -} - -bool NamedCollectionFactory::removeIfExistsUnlocked( +bool NamedCollectionFactory::removeIfExists( const std::string & collection_name, std::lock_guard & lock) { - auto collection = tryGetUnlocked(collection_name, lock); + auto collection = tryGet(collection_name, lock); if (!collection) return false; @@ -152,18 +155,246 @@ bool NamedCollectionFactory::removeIfExistsUnlocked( return true; } -void NamedCollectionFactory::removeById(NamedCollectionUtils::SourceId id) +void NamedCollectionFactory::removeById(NamedCollection::SourceId id, std::lock_guard &) { - std::lock_guard lock(mutex); std::erase_if( loaded_named_collections, [&](const auto & value) { return value.second->getSourceId() == id; }); } -NamedCollectionsMap NamedCollectionFactory::getAll() const +namespace +{ + constexpr auto NAMED_COLLECTIONS_CONFIG_PREFIX = "named_collections"; + + std::vector listCollections(const Poco::Util::AbstractConfiguration & config) + { + 
Poco::Util::AbstractConfiguration::Keys collections_names; + config.keys(NAMED_COLLECTIONS_CONFIG_PREFIX, collections_names); + return collections_names; + } + + MutableNamedCollectionPtr getCollection( + const Poco::Util::AbstractConfiguration & config, + const std::string & collection_name) + { + const auto collection_prefix = fmt::format("{}.{}", NAMED_COLLECTIONS_CONFIG_PREFIX, collection_name); + std::queue enumerate_input; + std::set> enumerate_result; + + enumerate_input.push(collection_prefix); + NamedCollectionConfiguration::listKeys(config, std::move(enumerate_input), enumerate_result, -1); + + /// Collection does not have any keys. (`enumerate_result` == ). + const bool collection_is_empty = enumerate_result.size() == 1 + && *enumerate_result.begin() == collection_prefix; + + std::set> keys; + if (!collection_is_empty) + { + /// Skip collection prefix and add +1 to avoid '.' in the beginning. + for (const auto & path : enumerate_result) + keys.emplace(path.substr(collection_prefix.size() + 1)); + } + + return NamedCollection::create( + config, collection_name, collection_prefix, keys, NamedCollection::SourceId::CONFIG, /* is_mutable */false); + } + + NamedCollectionsMap getNamedCollections(const Poco::Util::AbstractConfiguration & config) + { + NamedCollectionsMap result; + for (const auto & collection_name : listCollections(config)) + { + if (result.contains(collection_name)) + { + throw Exception( + ErrorCodes::NAMED_COLLECTION_ALREADY_EXISTS, + "Found duplicate named collection `{}`", + collection_name); + } + result.emplace(collection_name, getCollection(config, collection_name)); + } + return result; + } +} + +void NamedCollectionFactory::loadIfNot() { std::lock_guard lock(mutex); - return loaded_named_collections; + loadIfNot(lock); +} + +bool NamedCollectionFactory::loadIfNot(std::lock_guard & lock) +{ + if (loaded) + return false; + + auto context = Context::getGlobalContextInstance(); + metadata_storage = NamedCollectionsMetadataStorage::create(context); + + loadFromConfig(context->getConfigRef(), lock); + loadFromSQL(lock); + + if (metadata_storage->supportsPeriodicUpdate()) + { + update_task = context->getSchedulePool().createTask("NamedCollectionsMetadataStorage", [this]{ updateFunc(); }); + update_task->activate(); + update_task->schedule(); + } + + loaded = true; + return true; +} + +void NamedCollectionFactory::loadFromConfig(const Poco::Util::AbstractConfiguration & config, std::lock_guard & lock) +{ + auto collections = getNamedCollections(config); + LOG_TEST(log, "Loaded {} collections from config", collections.size()); + add(std::move(collections), lock); +} + +void NamedCollectionFactory::reloadFromConfig(const Poco::Util::AbstractConfiguration & config) +{ + std::lock_guard lock(mutex); + if (loadIfNot(lock)) + return; + + auto collections = getNamedCollections(config); + LOG_TEST(log, "Loaded {} collections from config", collections.size()); + + removeById(NamedCollection::SourceId::CONFIG, lock); + add(std::move(collections), lock); +} + +void NamedCollectionFactory::loadFromSQL(std::lock_guard & lock) +{ + auto collections = metadata_storage->getAll(); + LOG_TEST(log, "Loaded {} collections from sql", collections.size()); + add(std::move(collections), lock); +} + +void NamedCollectionFactory::createFromSQL(const ASTCreateNamedCollectionQuery & query) +{ + std::lock_guard lock(mutex); + loadIfNot(lock); + + if (exists(query.collection_name, lock)) + { + if (query.if_not_exists) + return; + + throw Exception( + ErrorCodes::NAMED_COLLECTION_ALREADY_EXISTS, + 
"A named collection `{}` already exists", + query.collection_name); + } + + add(query.collection_name, metadata_storage->create(query), lock); +} + +void NamedCollectionFactory::removeFromSQL(const ASTDropNamedCollectionQuery & query) +{ + std::lock_guard lock(mutex); + loadIfNot(lock); + + if (!exists(query.collection_name, lock)) + { + if (query.if_exists) + return; + + throw Exception( + ErrorCodes::NAMED_COLLECTION_DOESNT_EXIST, + "Cannot remove collection `{}`, because it doesn't exist", + query.collection_name); + } + + metadata_storage->remove(query.collection_name); + remove(query.collection_name, lock); +} + +void NamedCollectionFactory::updateFromSQL(const ASTAlterNamedCollectionQuery & query) +{ + std::lock_guard lock(mutex); + loadIfNot(lock); + + if (!exists(query.collection_name, lock)) + { + if (query.if_exists) + return; + + throw Exception( + ErrorCodes::NAMED_COLLECTION_DOESNT_EXIST, + "Cannot remove collection `{}`, because it doesn't exist", + query.collection_name); + } + + metadata_storage->update(query); + + auto collection = getMutable(query.collection_name, lock); + auto collection_lock = collection->lock(); + + for (const auto & [name, value] : query.changes) + { + auto it_override = query.overridability.find(name); + if (it_override != query.overridability.end()) + collection->setOrUpdate(name, convertFieldToString(value), it_override->second); + else + collection->setOrUpdate(name, convertFieldToString(value), {}); + } + + for (const auto & key : query.delete_keys) + collection->remove(key); +} + +void NamedCollectionFactory::reloadFromSQL() +{ + std::lock_guard lock(mutex); + if (loadIfNot(lock)) + return; + + auto collections = metadata_storage->getAll(); + removeById(NamedCollection::SourceId::SQL, lock); + add(std::move(collections), lock); +} + +void NamedCollectionFactory::updateFunc() +{ + LOG_TRACE(log, "Named collections background updating thread started"); + + while (!shutdown_called.load()) + { + if (metadata_storage->waitUpdate()) + { + try + { + reloadFromSQL(); + } + catch (const Coordination::Exception & e) + { + if (Coordination::isHardwareError(e.code)) + { + LOG_INFO(log, "Lost ZooKeeper connection, will try to connect again: {}", + DB::getCurrentExceptionMessage(true)); + + sleepForSeconds(1); + } + else + { + tryLogCurrentException(__PRETTY_FUNCTION__); + chassert(false); + } + continue; + } + catch (...) 
+ { + DB::tryLogCurrentException(__PRETTY_FUNCTION__); + chassert(false); + continue; + } + } + } + + LOG_TRACE(log, "Named collections background updating thread finished"); } } diff --git a/src/Common/NamedCollections/NamedCollectionsFactory.h b/src/Common/NamedCollections/NamedCollectionsFactory.h index 2d64a03bde3..6ee5940e686 100644 --- a/src/Common/NamedCollections/NamedCollectionsFactory.h +++ b/src/Common/NamedCollections/NamedCollectionsFactory.h @@ -1,58 +1,83 @@ #pragma once #include +#include +#include namespace DB { +class ASTCreateNamedCollectionQuery; +class ASTDropNamedCollectionQuery; +class ASTAlterNamedCollectionQuery; class NamedCollectionFactory : boost::noncopyable { public: static NamedCollectionFactory & instance(); + ~NamedCollectionFactory(); + bool exists(const std::string & collection_name) const; NamedCollectionPtr get(const std::string & collection_name) const; NamedCollectionPtr tryGet(const std::string & collection_name) const; - MutableNamedCollectionPtr getMutable(const std::string & collection_name) const; - - void add(const std::string & collection_name, MutableNamedCollectionPtr collection); - - void add(NamedCollectionsMap collections); - - void update(NamedCollectionsMap collections); - - void remove(const std::string & collection_name); - - void removeIfExists(const std::string & collection_name); - - void removeById(NamedCollectionUtils::SourceId id); - NamedCollectionsMap getAll() const; -private: - bool existsUnlocked( - const std::string & collection_name, - std::lock_guard & lock) const; + void reloadFromConfig(const Poco::Util::AbstractConfiguration & config); - MutableNamedCollectionPtr tryGetUnlocked( - const std::string & collection_name, - std::lock_guard & lock) const; + void reloadFromSQL(); - void addUnlocked( - const std::string & collection_name, - MutableNamedCollectionPtr collection, - std::lock_guard & lock); + void createFromSQL(const ASTCreateNamedCollectionQuery & query); - bool removeIfExistsUnlocked( - const std::string & collection_name, - std::lock_guard & lock); + void removeFromSQL(const ASTDropNamedCollectionQuery & query); + void updateFromSQL(const ASTAlterNamedCollectionQuery & query); + + void loadIfNot(); + + void shutdown(); + +protected: mutable NamedCollectionsMap loaded_named_collections; - mutable std::mutex mutex; - bool is_initialized = false; + + const LoggerPtr log = getLogger("NamedCollectionFactory"); + + bool loaded = false; + std::atomic shutdown_called = false; + std::unique_ptr metadata_storage; + BackgroundSchedulePool::TaskHolder update_task; + + bool loadIfNot(std::lock_guard & lock); + + bool exists( + const std::string & collection_name, + std::lock_guard & lock) const; + + MutableNamedCollectionPtr getMutable(const std::string & collection_name, std::lock_guard & lock) const; + + void add(const std::string & collection_name, MutableNamedCollectionPtr collection, std::lock_guard & lock); + + void add(NamedCollectionsMap collections, std::lock_guard & lock); + + void update(NamedCollectionsMap collections, std::lock_guard & lock); + + void remove(const std::string & collection_name, std::lock_guard & lock); + + bool removeIfExists(const std::string & collection_name, std::lock_guard & lock); + + MutableNamedCollectionPtr tryGet(const std::string & collection_name, std::lock_guard & lock) const; + + void removeById(NamedCollection::SourceId id, std::lock_guard & lock); + + void loadFromConfig( + const Poco::Util::AbstractConfiguration & config, + std::lock_guard & lock); + + void 
loadFromSQL(std::lock_guard & lock); + + void updateFunc(); }; } diff --git a/src/Common/NamedCollections/NamedCollectionsMetadataStorage.cpp b/src/Common/NamedCollections/NamedCollectionsMetadataStorage.cpp new file mode 100644 index 00000000000..32fdb25abd3 --- /dev/null +++ b/src/Common/NamedCollections/NamedCollectionsMetadataStorage.cpp @@ -0,0 +1,519 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace fs = std::filesystem; + +namespace DB +{ +namespace ErrorCodes +{ + extern const int NAMED_COLLECTION_ALREADY_EXISTS; + extern const int NAMED_COLLECTION_DOESNT_EXIST; + extern const int INVALID_CONFIG_PARAMETER; + extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; +} + +static const std::string named_collections_storage_config_path = "named_collections_storage"; + +namespace +{ + MutableNamedCollectionPtr createNamedCollectionFromAST(const ASTCreateNamedCollectionQuery & query) + { + const auto & collection_name = query.collection_name; + const auto config = NamedCollectionConfiguration::createConfiguration(collection_name, query.changes, query.overridability); + + std::set> keys; + for (const auto & [name, _] : query.changes) + keys.insert(name); + + return NamedCollection::create( + *config, collection_name, "", keys, NamedCollection::SourceId::SQL, /* is_mutable */true); + } + + std::string getFileName(const std::string & collection_name) + { + return escapeForFileName(collection_name) + ".sql"; + } +} + +class NamedCollectionsMetadataStorage::INamedCollectionsStorage +{ +public: + virtual ~INamedCollectionsStorage() = default; + + virtual bool exists(const std::string & path) const = 0; + + virtual std::vector list() const = 0; + + virtual std::string read(const std::string & path) const = 0; + + virtual void write(const std::string & path, const std::string & data, bool replace) = 0; + + virtual void remove(const std::string & path) = 0; + + virtual bool removeIfExists(const std::string & path) = 0; + + virtual bool supportsPeriodicUpdate() const = 0; + + virtual bool waitUpdate(size_t /* timeout */) { return false; } +}; + + +class NamedCollectionsMetadataStorage::LocalStorage : public INamedCollectionsStorage, private WithContext +{ +private: + std::string root_path; + +public: + LocalStorage(ContextPtr context_, const std::string & path_) + : WithContext(context_) + , root_path(path_) + { + if (fs::exists(root_path)) + cleanup(); + } + + ~LocalStorage() override = default; + + bool supportsPeriodicUpdate() const override { return false; } + + std::vector list() const override + { + if (!fs::exists(root_path)) + return {}; + + std::vector elements; + for (fs::directory_iterator it{root_path}; it != fs::directory_iterator{}; ++it) + { + const auto & current_path = it->path(); + if (current_path.extension() == ".sql") + { + elements.push_back(it->path()); + } + else + { + LOG_WARNING( + getLogger("LocalStorage"), + "Unexpected file {} in named collections directory", + current_path.filename().string()); + } + } + return elements; + } + + bool exists(const std::string & path) const override + { + return fs::exists(getPath(path)); + } + + std::string read(const std::string & path) const override + { + ReadBufferFromFile in(getPath(path)); + std::string data; + readStringUntilEOF(data, in); + return data; + } + + void write(const std::string & path, const std::string & data, bool replace) override + { + if (!replace && fs::exists(path)) + { + throw 
Exception( + ErrorCodes::NAMED_COLLECTION_ALREADY_EXISTS, + "Metadata file {} for named collection already exists", + path); + } + + fs::create_directories(root_path); + + auto tmp_path = getPath(path + ".tmp"); + WriteBufferFromFile out(tmp_path, data.size(), O_WRONLY | O_CREAT | O_EXCL); + writeString(data, out); + + out.next(); + if (getContext()->getSettingsRef().fsync_metadata) + out.sync(); + out.close(); + + fs::rename(tmp_path, getPath(path)); + } + + void remove(const std::string & path) override + { + if (!removeIfExists(getPath(path))) + { + throw Exception( + ErrorCodes::NAMED_COLLECTION_DOESNT_EXIST, + "Cannot remove `{}`, because it doesn't exist", path); + } + } + + bool removeIfExists(const std::string & path) override + { + return fs::remove(getPath(path)); + } + +private: + std::string getPath(const std::string & path) const + { + return fs::path(root_path) / path; + } + + /// Delete .tmp files. They could be left undeleted in case of + /// some exception or abrupt server restart. + void cleanup() + { + std::vector files_to_remove; + for (fs::directory_iterator it{root_path}; it != fs::directory_iterator{}; ++it) + { + const auto & current_path = it->path(); + if (current_path.extension() == ".tmp") + files_to_remove.push_back(current_path); + } + for (const auto & file : files_to_remove) + fs::remove(file); + } +}; + + +class NamedCollectionsMetadataStorage::ZooKeeperStorage : public INamedCollectionsStorage, private WithContext +{ +private: + std::string root_path; + mutable zkutil::ZooKeeperPtr zookeeper_client{nullptr}; + mutable zkutil::EventPtr wait_event; + mutable Int32 collections_node_cversion = 0; + +public: + ZooKeeperStorage(ContextPtr context_, const std::string & path_) + : WithContext(context_) + , root_path(path_) + { + if (root_path.empty()) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Collections path cannot be empty"); + + if (root_path != "/" && root_path.back() == '/') + root_path.resize(root_path.size() - 1); + if (root_path.front() != '/') + root_path = "/" + root_path; + + auto client = getClient(); + if (root_path != "/" && !client->exists(root_path)) + { + client->createAncestors(root_path); + client->createIfNotExists(root_path, ""); + } + } + + ~ZooKeeperStorage() override = default; + + bool supportsPeriodicUpdate() const override { return true; } + + /// Return true if children changed. + bool waitUpdate(size_t timeout) override + { + if (!wait_event) + { + /// We did not yet made any list() attempt, so do that. + return true; + } + + if (wait_event->tryWait(timeout)) + { + /// Children changed before timeout. + return true; + } + + std::string res; + Coordination::Stat stat; + + if (!getClient()->tryGet(root_path, res, &stat)) + { + /// We do create root_path in constructor of this class, + /// so this case is not really possible. 
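+            /// In debug builds this is flagged via chassert; in release builds it is treated as 'no update'.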
+ chassert(false); + return false; + } + + return stat.cversion != collections_node_cversion; + } + + std::vector list() const override + { + if (!wait_event) + wait_event = std::make_shared(); + + Coordination::Stat stat; + auto children = getClient()->getChildren(root_path, &stat, wait_event); + collections_node_cversion = stat.cversion; + return children; + } + + bool exists(const std::string & path) const override + { + return getClient()->exists(getPath(path)); + } + + std::string read(const std::string & path) const override + { + return getClient()->get(getPath(path)); + } + + void write(const std::string & path, const std::string & data, bool replace) override + { + if (replace) + { + getClient()->createOrUpdate(getPath(path), data, zkutil::CreateMode::Persistent); + } + else + { + auto code = getClient()->tryCreate(getPath(path), data, zkutil::CreateMode::Persistent); + + if (code == Coordination::Error::ZNODEEXISTS) + { + throw Exception( + ErrorCodes::NAMED_COLLECTION_ALREADY_EXISTS, + "Metadata file {} for named collection already exists", + path); + } + } + } + + void remove(const std::string & path) override + { + getClient()->remove(getPath(path)); + } + + bool removeIfExists(const std::string & path) override + { + auto code = getClient()->tryRemove(getPath(path)); + if (code == Coordination::Error::ZOK) + return true; + if (code == Coordination::Error::ZNONODE) + return false; + throw Coordination::Exception::fromPath(code, getPath(path)); + } + +private: + zkutil::ZooKeeperPtr getClient() const + { + if (!zookeeper_client || zookeeper_client->expired()) + { + zookeeper_client = getContext()->getZooKeeper(); + zookeeper_client->sync(root_path); + } + return zookeeper_client; + } + + std::string getPath(const std::string & path) const + { + return fs::path(root_path) / path; + } +}; + +NamedCollectionsMetadataStorage::NamedCollectionsMetadataStorage( + std::shared_ptr storage_, + ContextPtr context_) + : WithContext(context_) + , storage(std::move(storage_)) +{ +} + +MutableNamedCollectionPtr NamedCollectionsMetadataStorage::get(const std::string & collection_name) const +{ + const auto query = readCreateQuery(collection_name); + return createNamedCollectionFromAST(query); +} + +NamedCollectionsMap NamedCollectionsMetadataStorage::getAll() const +{ + NamedCollectionsMap result; + for (const auto & collection_name : listCollections()) + { + if (result.contains(collection_name)) + { + throw Exception( + ErrorCodes::NAMED_COLLECTION_ALREADY_EXISTS, + "Found duplicate named collection `{}`", + collection_name); + } + result.emplace(collection_name, get(collection_name)); + } + return result; +} + +MutableNamedCollectionPtr NamedCollectionsMetadataStorage::create(const ASTCreateNamedCollectionQuery & query) +{ + writeCreateQuery(query); + return createNamedCollectionFromAST(query); +} + +void NamedCollectionsMetadataStorage::remove(const std::string & collection_name) +{ + storage->remove(getFileName(collection_name)); +} + +bool NamedCollectionsMetadataStorage::removeIfExists(const std::string & collection_name) +{ + return storage->removeIfExists(getFileName(collection_name)); +} + +void NamedCollectionsMetadataStorage::update(const ASTAlterNamedCollectionQuery & query) +{ + auto create_query = readCreateQuery(query.collection_name); + + std::unordered_map result_changes_map; + for (const auto & [name, value] : query.changes) + { + auto [it, inserted] = result_changes_map.emplace(name, value); + if (!inserted) + { + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Value with 
key `{}` is used twice in the SET query (collection name: {})", + name, query.collection_name); + } + } + + for (const auto & [name, value] : create_query.changes) + result_changes_map.emplace(name, value); + + std::unordered_map result_overridability_map; + for (const auto & [name, value] : query.overridability) + result_overridability_map.emplace(name, value); + for (const auto & [name, value] : create_query.overridability) + result_overridability_map.emplace(name, value); + + for (const auto & delete_key : query.delete_keys) + { + auto it = result_changes_map.find(delete_key); + if (it == result_changes_map.end()) + { + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Cannot delete key `{}` because it does not exist in collection", + delete_key); + } + else + { + result_changes_map.erase(it); + auto it_override = result_overridability_map.find(delete_key); + if (it_override != result_overridability_map.end()) + result_overridability_map.erase(it_override); + } + } + + create_query.changes.clear(); + for (const auto & [name, value] : result_changes_map) + create_query.changes.emplace_back(name, value); + create_query.overridability = std::move(result_overridability_map); + + if (create_query.changes.empty()) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Named collection cannot be empty (collection name: {})", + query.collection_name); + + chassert(create_query.collection_name == query.collection_name); + writeCreateQuery(create_query, true); +} + +std::vector NamedCollectionsMetadataStorage::listCollections() const +{ + auto paths = storage->list(); + std::vector collections; + collections.reserve(paths.size()); + for (const auto & path : paths) + collections.push_back(std::filesystem::path(path).stem()); + return collections; +} + +ASTCreateNamedCollectionQuery NamedCollectionsMetadataStorage::readCreateQuery(const std::string & collection_name) const +{ + const auto path = getFileName(collection_name); + auto query = storage->read(path); + const auto & settings = getContext()->getSettingsRef(); + + ParserCreateNamedCollectionQuery parser; + auto ast = parseQuery(parser, query, "in file " + path, 0, settings.max_parser_depth, settings.max_parser_backtracks); + const auto & create_query = ast->as(); + return create_query; +} + +void NamedCollectionsMetadataStorage::writeCreateQuery(const ASTCreateNamedCollectionQuery & query, bool replace) +{ + auto normalized_query = query.clone(); + auto & changes = typeid_cast(normalized_query.get())->changes; + ::sort( + changes.begin(), changes.end(), + [](const SettingChange & lhs, const SettingChange & rhs) { return lhs.name < rhs.name; }); + + storage->write(getFileName(query.collection_name), serializeAST(*normalized_query), replace); +} + +bool NamedCollectionsMetadataStorage::supportsPeriodicUpdate() const +{ + return storage->supportsPeriodicUpdate(); +} + +bool NamedCollectionsMetadataStorage::waitUpdate() +{ + if (!storage->supportsPeriodicUpdate()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Periodic updates are not supported"); + + const auto & config = Context::getGlobalContextInstance()->getConfigRef(); + const size_t timeout = config.getUInt(named_collections_storage_config_path + ".update_timeout_ms", 5000); + + return storage->waitUpdate(timeout); +} + +std::unique_ptr NamedCollectionsMetadataStorage::create(const ContextPtr & context_) +{ + const auto & config = context_->getConfigRef(); + const auto storage_type = config.getString(named_collections_storage_config_path + ".type", "local"); + + if (storage_type == "local") + { 
+ const auto path = config.getString( + named_collections_storage_config_path + ".path", + std::filesystem::path(context_->getPath()) / "named_collections"); + + LOG_TRACE(getLogger("NamedCollectionsMetadataStorage"), + "Using local storage for named collections at path: {}", path); + + auto local_storage = std::make_unique(context_, path); + return std::unique_ptr( + new NamedCollectionsMetadataStorage(std::move(local_storage), context_)); + } + if (storage_type == "zookeeper" || storage_type == "keeper") + { + const auto path = config.getString(named_collections_storage_config_path + ".path"); + auto zk_storage = std::make_unique(context_, path); + + LOG_TRACE(getLogger("NamedCollectionsMetadataStorage"), + "Using zookeeper storage for named collections at path: {}", path); + + return std::unique_ptr( + new NamedCollectionsMetadataStorage(std::move(zk_storage), context_)); + } + + throw Exception( + ErrorCodes::INVALID_CONFIG_PARAMETER, + "Unknown storage for named collections: {}", storage_type); +} + +} diff --git a/src/Common/NamedCollections/NamedCollectionsMetadataStorage.h b/src/Common/NamedCollections/NamedCollectionsMetadataStorage.h new file mode 100644 index 00000000000..3c089fe2fa2 --- /dev/null +++ b/src/Common/NamedCollections/NamedCollectionsMetadataStorage.h @@ -0,0 +1,52 @@ +#pragma once +#include +#include +#include +#include +#include + +namespace DB +{ + +class NamedCollectionsMetadataStorage : private WithContext +{ +public: + static std::unique_ptr create(const ContextPtr & context); + + NamedCollectionsMap getAll() const; + + MutableNamedCollectionPtr get(const std::string & collection_name) const; + + MutableNamedCollectionPtr create(const ASTCreateNamedCollectionQuery & query); + + void remove(const std::string & collection_name); + + bool removeIfExists(const std::string & collection_name); + + void update(const ASTAlterNamedCollectionQuery & query); + + void shutdown(); + + /// Return true if update was made + bool waitUpdate(); + + bool supportsPeriodicUpdate() const; + +private: + class INamedCollectionsStorage; + class LocalStorage; + class ZooKeeperStorage; + + std::shared_ptr storage; + + NamedCollectionsMetadataStorage(std::shared_ptr storage_, ContextPtr context_); + + std::vector listCollections() const; + + ASTCreateNamedCollectionQuery readCreateQuery(const std::string & collection_name) const; + + void writeCreateQuery(const ASTCreateNamedCollectionQuery & query, bool replace = false); +}; + + +} diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index f73e16c517d..ac05a362735 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -643,7 +643,8 @@ The server successfully detected this situation and will download merged part fr \ M(ServerStartupMilliseconds, "Time elapsed from starting server to listening to sockets in milliseconds")\ M(IOUringSQEsSubmitted, "Total number of io_uring SQEs submitted") \ - M(IOUringSQEsResubmits, "Total number of io_uring SQE resubmits performed") \ + M(IOUringSQEsResubmitsAsync, "Total number of asynchronous io_uring SQE resubmits performed") \ + M(IOUringSQEsResubmitsSync, "Total number of synchronous io_uring SQE resubmits performed") \ M(IOUringCQEsCompleted, "Total number of successfully completed io_uring CQEs") \ M(IOUringCQEsFailed, "Total number of completed io_uring CQEs with failures") \ \ diff --git a/src/Common/QueryProfiler.cpp b/src/Common/QueryProfiler.cpp index c3affbdd968..746010b5462 100644 --- a/src/Common/QueryProfiler.cpp +++ 
b/src/Common/QueryProfiler.cpp @@ -228,9 +228,9 @@ void Timer::cleanup() #endif template -QueryProfilerBase::QueryProfilerBase([[maybe_unused]] UInt64 thread_id, [[maybe_unused]] int clock_type, [[maybe_unused]] UInt32 period, [[maybe_unused]] int pause_signal_) - : log(getLogger("QueryProfiler")) - , pause_signal(pause_signal_) +QueryProfilerBase::QueryProfilerBase( + [[maybe_unused]] UInt64 thread_id, [[maybe_unused]] int clock_type, [[maybe_unused]] UInt32 period, [[maybe_unused]] int pause_signal_) + : log(getLogger("QueryProfiler")), pause_signal(pause_signal_) { #if defined(SANITIZER) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "QueryProfiler disabled because they cannot work under sanitizers"); diff --git a/src/Common/tests/gtest_named_collections.cpp b/src/Common/tests/gtest_named_collections.cpp index 8a8a364961b..8d9aa2bc213 100644 --- a/src/Common/tests/gtest_named_collections.cpp +++ b/src/Common/tests/gtest_named_collections.cpp @@ -1,12 +1,40 @@ #include #include -#include #include #include #include using namespace DB; +/// A class which allows to test private methods of NamedCollectionFactory. +class NamedCollectionFactoryFriend : public NamedCollectionFactory +{ +public: + static NamedCollectionFactoryFriend & instance() + { + static NamedCollectionFactoryFriend instance; + return instance; + } + + void loadFromConfig(const Poco::Util::AbstractConfiguration & config) + { + std::lock_guard lock(mutex); + NamedCollectionFactory::loadFromConfig(config, lock); + } + + void add(const std::string & collection_name, MutableNamedCollectionPtr collection) + { + std::lock_guard lock(mutex); + NamedCollectionFactory::add(collection_name, collection, lock); + } + + void remove(const std::string & collection_name) + { + std::lock_guard lock(mutex); + NamedCollectionFactory::remove(collection_name, lock); + } +}; + TEST(NamedCollections, SimpleConfig) { std::string xml(R"CONFIG( @@ -29,13 +57,13 @@ TEST(NamedCollections, SimpleConfig) Poco::AutoPtr document = dom_parser.parseString(xml); Poco::AutoPtr config = new Poco::Util::XMLConfiguration(document); - NamedCollectionUtils::loadFromConfig(*config); + NamedCollectionFactoryFriend::instance().loadFromConfig(*config); - ASSERT_TRUE(NamedCollectionFactory::instance().exists("collection1")); - ASSERT_TRUE(NamedCollectionFactory::instance().exists("collection2")); - ASSERT_TRUE(NamedCollectionFactory::instance().tryGet("collection3") == nullptr); + ASSERT_TRUE(NamedCollectionFactoryFriend::instance().exists("collection1")); + ASSERT_TRUE(NamedCollectionFactoryFriend::instance().exists("collection2")); + ASSERT_TRUE(NamedCollectionFactoryFriend::instance().tryGet("collection3") == nullptr); - auto collections = NamedCollectionFactory::instance().getAll(); + auto collections = NamedCollectionFactoryFriend::instance().getAll(); ASSERT_EQ(collections.size(), 2); ASSERT_TRUE(collections.contains("collection1")); ASSERT_TRUE(collections.contains("collection2")); @@ -47,7 +75,7 @@ key3: 3.3 key4: -4 )CONFIG"); - auto collection1 = NamedCollectionFactory::instance().get("collection1"); + auto collection1 = NamedCollectionFactoryFriend::instance().get("collection1"); ASSERT_TRUE(collection1 != nullptr); ASSERT_TRUE(collection1->get("key1") == "value1"); @@ -61,7 +89,7 @@ key5: 5 key6: 6.6 )CONFIG"); - auto collection2 = NamedCollectionFactory::instance().get("collection2"); + auto collection2 = NamedCollectionFactoryFriend::instance().get("collection2"); ASSERT_TRUE(collection2 != nullptr); ASSERT_TRUE(collection2->get("key4") == "value4"); @@ 
-69,9 +97,9 @@ key6: 6.6 ASSERT_TRUE(collection2->get("key6") == 6.6); auto collection2_copy = collections["collection2"]->duplicate(); - NamedCollectionFactory::instance().add("collection2_copy", collection2_copy); - ASSERT_TRUE(NamedCollectionFactory::instance().exists("collection2_copy")); - ASSERT_EQ(NamedCollectionFactory::instance().get("collection2_copy")->dumpStructure(), + NamedCollectionFactoryFriend::instance().add("collection2_copy", collection2_copy); + ASSERT_TRUE(NamedCollectionFactoryFriend::instance().exists("collection2_copy")); + ASSERT_EQ(NamedCollectionFactoryFriend::instance().get("collection2_copy")->dumpStructure(), R"CONFIG(key4: value4 key5: 5 key6: 6.6 @@ -88,8 +116,8 @@ key6: 6.6 collection2_copy->setOrUpdate("key4", "value45", {}); ASSERT_EQ(collection2_copy->getOrDefault("key4", "N"), "value45"); - NamedCollectionFactory::instance().remove("collection2_copy"); - ASSERT_FALSE(NamedCollectionFactory::instance().exists("collection2_copy")); + NamedCollectionFactoryFriend::instance().remove("collection2_copy"); + ASSERT_FALSE(NamedCollectionFactoryFriend::instance().exists("collection2_copy")); config.reset(); } @@ -119,11 +147,11 @@ TEST(NamedCollections, NestedConfig) Poco::AutoPtr document = dom_parser.parseString(xml); Poco::AutoPtr config = new Poco::Util::XMLConfiguration(document); - NamedCollectionUtils::loadFromConfig(*config); + NamedCollectionFactoryFriend::instance().loadFromConfig(*config); - ASSERT_TRUE(NamedCollectionFactory::instance().exists("collection3")); + ASSERT_TRUE(NamedCollectionFactoryFriend::instance().exists("collection3")); - auto collection = NamedCollectionFactory::instance().get("collection3"); + auto collection = NamedCollectionFactoryFriend::instance().get("collection3"); ASSERT_TRUE(collection != nullptr); ASSERT_EQ(collection->dumpStructure(), @@ -171,8 +199,8 @@ TEST(NamedCollections, NestedConfigDuplicateKeys) Poco::AutoPtr document = dom_parser.parseString(xml); Poco::AutoPtr config = new Poco::Util::XMLConfiguration(document); - NamedCollectionUtils::loadFromConfig(*config); - auto collection = NamedCollectionFactory::instance().get("collection"); + NamedCollectionFactoryFriend::instance().loadFromConfig(*config); + auto collection = NamedCollectionFactoryFriend::instance().get("collection"); auto keys = collection->getKeys(); ASSERT_EQ(keys.size(), 6); diff --git a/src/Common/tests/gtest_resolve_pool.cpp b/src/Common/tests/gtest_resolve_pool.cpp index 2391fc8bacf..b760b9b1524 100644 --- a/src/Common/tests/gtest_resolve_pool.cpp +++ b/src/Common/tests/gtest_resolve_pool.cpp @@ -1,12 +1,39 @@ #include -#include #include #include -#include "base/defines.h" +#include + +#include #include +#include #include -#include + + +using namespace std::literals::chrono_literals; + + +auto now() +{ + return std::chrono::steady_clock::now(); +} + +void sleep_until(auto time_point) +{ + std::this_thread::sleep_until(time_point); +} + +void sleep_for(auto duration) +{ + std::this_thread::sleep_for(duration); +} + +size_t toMilliseconds(auto duration) +{ + return std::chrono::duration_cast(duration).count(); +} + +const auto epsilon = 500us; class ResolvePoolMock : public DB::HostResolver { @@ -267,13 +294,14 @@ TEST_F(ResolvePoolTest, CanFailAndHeal) TEST_F(ResolvePoolTest, CanExpire) { - auto resolver = make_resolver(); + auto history = 5ms; + auto resolver = make_resolver(toMilliseconds(history)); auto expired_addr = resolver->resolve(); ASSERT_TRUE(addresses.contains(*expired_addr)); - addresses.erase(*expired_addr); - sleepForSeconds(1); 
+ + sleep_for(history + epsilon); for (size_t i = 0; i < 1000; ++i) { @@ -310,12 +338,19 @@ TEST_F(ResolvePoolTest, DuplicatesInAddresses) ASSERT_EQ(3, DB::CurrentThread::getProfileEvents()[metrics.discovered]); } -void check_no_failed_address(size_t iteration, auto & resolver, auto & addresses, auto & failed_addr, auto & metrics) +void check_no_failed_address(size_t iteration, auto & resolver, auto & addresses, auto & failed_addr, auto & metrics, auto deadline) { ASSERT_EQ(iteration, DB::CurrentThread::getProfileEvents()[metrics.failed]); for (size_t i = 0; i < 100; ++i) { auto next_addr = resolver->resolve(); + + if (now() > deadline) + { + ASSERT_NE(i, 0); + break; + } + ASSERT_TRUE(addresses.contains(*next_addr)); ASSERT_NE(*next_addr, *failed_addr); } @@ -323,52 +358,60 @@ void check_no_failed_address(size_t iteration, auto & resolver, auto & addresses TEST_F(ResolvePoolTest, BannedForConsiquenceFail) { - size_t history_ms = 5; - auto resolver = make_resolver(history_ms); - + auto history = 5ms; + auto resolver = make_resolver(toMilliseconds(history)); auto failed_addr = resolver->resolve(); ASSERT_TRUE(addresses.contains(*failed_addr)); + auto start_at = now(); + failed_addr.setFail(); ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count)); ASSERT_EQ(1, CurrentMetrics::get(metrics.banned_count)); - check_no_failed_address(1, resolver, addresses, failed_addr, metrics); + check_no_failed_address(1, resolver, addresses, failed_addr, metrics, start_at + history - epsilon); + + sleep_until(start_at + history + epsilon); + start_at = now(); - sleepForMilliseconds(history_ms + 1); resolver->update(); ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count)); ASSERT_EQ(0, CurrentMetrics::get(metrics.banned_count)); failed_addr.setFail(); - check_no_failed_address(2, resolver, addresses, failed_addr, metrics); + check_no_failed_address(2, resolver, addresses, failed_addr, metrics, start_at + history - epsilon); + + sleep_until(start_at + history + epsilon); + start_at = now(); - sleepForMilliseconds(history_ms + 1); resolver->update(); ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count)); ASSERT_EQ(1, CurrentMetrics::get(metrics.banned_count)); // ip still banned adter history_ms + update, because it was his second consiquent fail - check_no_failed_address(2, resolver, addresses, failed_addr, metrics); + check_no_failed_address(2, resolver, addresses, failed_addr, metrics, start_at + history - epsilon); } TEST_F(ResolvePoolTest, NoAditionalBannForConcurrentFail) { - size_t history_ms = 5; - auto resolver = make_resolver(history_ms); + auto history = 5ms; + auto resolver = make_resolver(toMilliseconds(history)); auto failed_addr = resolver->resolve(); ASSERT_TRUE(addresses.contains(*failed_addr)); + auto start_at = now(); + failed_addr.setFail(); failed_addr.setFail(); failed_addr.setFail(); ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count)); ASSERT_EQ(1, CurrentMetrics::get(metrics.banned_count)); - check_no_failed_address(3, resolver, addresses, failed_addr, metrics); + check_no_failed_address(3, resolver, addresses, failed_addr, metrics, start_at + history - epsilon); + + sleep_until(start_at + history + epsilon); - sleepForMilliseconds(history_ms + 1); resolver->update(); // ip is cleared after just 1 history_ms interval. 
ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count)); @@ -377,8 +420,8 @@ TEST_F(ResolvePoolTest, NoAditionalBannForConcurrentFail) TEST_F(ResolvePoolTest, StillBannedAfterSuccess) { - size_t history_ms = 5; - auto resolver = make_resolver(history_ms); + auto history = 5ms; + auto resolver = make_resolver(toMilliseconds(history)); auto failed_addr = resolver->resolve(); ASSERT_TRUE(addresses.contains(*failed_addr)); @@ -395,11 +438,12 @@ TEST_F(ResolvePoolTest, StillBannedAfterSuccess) } chassert(again_addr); + auto start_at = now(); failed_addr.setFail(); ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count)); ASSERT_EQ(1, CurrentMetrics::get(metrics.banned_count)); - check_no_failed_address(1, resolver, addresses, failed_addr, metrics); + check_no_failed_address(1, resolver, addresses, failed_addr, metrics, start_at + history - epsilon); again_addr = std::nullopt; // success; diff --git a/src/Coordination/KeeperConstants.cpp b/src/Coordination/KeeperConstants.cpp index 8251dca3d1e..51bf037c1c9 100644 --- a/src/Coordination/KeeperConstants.cpp +++ b/src/Coordination/KeeperConstants.cpp @@ -258,7 +258,8 @@ M(KeeperExistsRequest) \ \ M(IOUringSQEsSubmitted) \ - M(IOUringSQEsResubmits) \ + M(IOUringSQEsResubmitsAsync) \ + M(IOUringSQEsResubmitsSync) \ M(IOUringCQEsCompleted) \ M(IOUringCQEsFailed) \ \ diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index 30a0eea3040..653b1fa0a84 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -2045,6 +2045,7 @@ TEST_P(CoordinationTest, TestPreprocessWhenCloseSessionIsPrecommitted) state_machine->pre_commit(1, create_entry_1->get_buf()); EXPECT_TRUE(uncommitted_state.nodes.contains(node_path_1)); + state_machine->commit(1, create_entry_1->get_buf()); EXPECT_TRUE(storage.container.contains(node_path_1)); diff --git a/src/Core/NamesAndTypes.cpp b/src/Core/NamesAndTypes.cpp index d6380a632f1..49ab822c738 100644 --- a/src/Core/NamesAndTypes.cpp +++ b/src/Core/NamesAndTypes.cpp @@ -188,6 +188,18 @@ NamesAndTypesList NamesAndTypesList::filter(const Names & names) const return filter(NameSet(names.begin(), names.end())); } +NamesAndTypesList NamesAndTypesList::eraseNames(const NameSet & names) const +{ + NamesAndTypesList res; + for (const auto & column : *this) + { + if (!names.contains(column.name)) + res.push_back(column); + } + return res; +} + + NamesAndTypesList NamesAndTypesList::addTypes(const Names & names) const { /// NOTE: It's better to make a map in `IStorage` than to create it here every time again. diff --git a/src/Core/NamesAndTypes.h b/src/Core/NamesAndTypes.h index 915add9b7bc..29f40c45938 100644 --- a/src/Core/NamesAndTypes.h +++ b/src/Core/NamesAndTypes.h @@ -111,6 +111,9 @@ public: /// Leave only the columns whose names are in the `names`. In `names` there can be superfluous columns. NamesAndTypesList filter(const Names & names) const; + /// Leave only the columns whose names are not in the `names`. + NamesAndTypesList eraseNames(const NameSet & names) const; + /// Unlike `filter`, returns columns in the order in which they go in `names`. NamesAndTypesList addTypes(const Names & names) const; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index cf572459cbc..b3e83092a77 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -334,7 +334,7 @@ class IColumn; M(Bool, fsync_metadata, true, "Do fsync after changing metadata for tables and databases (.sql files). 
Could be disabled in case of poor latency on server with high load of DDL queries and high load of disk subsystem.", 0) \ \ M(Bool, join_use_nulls, false, "Use NULLs for non-joined rows of outer JOINs for types that can be inside Nullable. If false, use default value of corresponding columns data type.", IMPORTANT) \ - M(Bool, allow_experimental_join_condition, false, "Support join with inequal conditions which involve columns from both left and right table. e.g. t1.y < t2.y.", IMPORTANT) \ + M(Bool, allow_experimental_join_condition, false, "Support join with inequal conditions which involve columns from both left and right table. e.g. t1.y < t2.y.", 0) \ \ M(JoinStrictness, join_default_strictness, JoinStrictness::All, "Set default strictness in JOIN query. Possible values: empty string, 'ANY', 'ALL'. If empty, query without strictness will throw exception.", 0) \ M(Bool, any_join_distinct_right_table_keys, false, "Enable old ANY JOIN logic with many-to-one left-to-right table keys mapping for all ANY JOINs. It leads to confusing not equal results for 't1 ANY LEFT JOIN t2' and 't2 ANY RIGHT JOIN t1'. ANY RIGHT JOIN needs one-to-many keys mapping to be consistent with LEFT one.", IMPORTANT) \ @@ -1059,7 +1059,8 @@ class IColumn; M(Bool, input_format_tsv_detect_header, true, "Automatically detect header with names and types in TSV format", 0) \ M(Bool, input_format_custom_detect_header, true, "Automatically detect header with names and types in CustomSeparated format", 0) \ M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Parquet", 0) \ - M(UInt64, input_format_parquet_max_block_size, 8192, "Max block size for parquet reader.", 0) \ + M(UInt64, input_format_parquet_max_block_size, DEFAULT_BLOCK_SIZE, "Max block size for parquet reader.", 0) \ + M(UInt64, input_format_parquet_prefer_block_bytes, DEFAULT_BLOCK_SIZE * 256, "Average block bytes output by parquet reader", 0) \ M(Bool, input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip fields with unsupported types while schema inference for format Protobuf", 0) \ M(Bool, input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format CapnProto", 0) \ M(Bool, input_format_orc_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format ORC", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 7eb14047ace..69bc8c5d207 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -96,6 +96,8 @@ static const std::map #include #include +#include #include #include #include @@ -174,4 +175,9 @@ DataTypePtr removeNullableOrLowCardinalityNullable(const DataTypePtr & type) } +bool canContainNull(const IDataType & type) +{ + return type.isNullable() || type.isLowCardinalityNullable() || isDynamic(type) || isVariant(type); +} + } diff --git a/src/DataTypes/DataTypeNullable.h b/src/DataTypes/DataTypeNullable.h index 71abe48c151..7a8a54fdf3a 100644 --- a/src/DataTypes/DataTypeNullable.h +++ b/src/DataTypes/DataTypeNullable.h @@ -62,4 +62,6 @@ DataTypePtr makeNullableOrLowCardinalityNullableSafe(const DataTypePtr & type); /// Nullable(T) -> T, LowCardinality(Nullable(T)) -> T DataTypePtr removeNullableOrLowCardinalityNullable(const DataTypePtr & type); +bool 
canContainNull(const IDataType & type); + } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index b91596a555d..badfedeec9b 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -122,6 +122,13 @@ DatabaseReplicated::DatabaseReplicated( fillClusterAuthInfo(db_settings.collection_name.value, context_->getConfigRef()); replica_group_name = context_->getConfigRef().getString("replica_group_name", ""); + + if (!replica_group_name.empty() && database_name.starts_with(DatabaseReplicated::ALL_GROUPS_CLUSTER_PREFIX)) + { + context_->addWarningMessage(fmt::format("There's a Replicated database with a name starting from '{}', " + "and replica_group_name is configured. It may cause collisions in cluster names.", + ALL_GROUPS_CLUSTER_PREFIX)); + } } String DatabaseReplicated::getFullReplicaName(const String & shard, const String & replica) @@ -173,13 +180,40 @@ ClusterPtr DatabaseReplicated::tryGetCluster() const return cluster; } -void DatabaseReplicated::setCluster(ClusterPtr && new_cluster) +ClusterPtr DatabaseReplicated::tryGetAllGroupsCluster() const { std::lock_guard lock{mutex}; - cluster = std::move(new_cluster); + if (replica_group_name.empty()) + return nullptr; + + if (cluster_all_groups) + return cluster_all_groups; + + /// Database is probably not created or not initialized yet, it's ok to return nullptr + if (is_readonly) + return cluster_all_groups; + + try + { + cluster_all_groups = getClusterImpl(/*all_groups*/ true); + } + catch (...) + { + tryLogCurrentException(log); + } + return cluster_all_groups; } -ClusterPtr DatabaseReplicated::getClusterImpl() const +void DatabaseReplicated::setCluster(ClusterPtr && new_cluster, bool all_groups) +{ + std::lock_guard lock{mutex}; + if (all_groups) + cluster_all_groups = std::move(new_cluster); + else + cluster = std::move(new_cluster); +} + +ClusterPtr DatabaseReplicated::getClusterImpl(bool all_groups) const { Strings unfiltered_hosts; Strings hosts; @@ -199,17 +233,24 @@ ClusterPtr DatabaseReplicated::getClusterImpl() const "It's possible if the first replica is not fully created yet " "or if the last replica was just dropped or due to logical error", zookeeper_path); - hosts.clear(); - std::vector paths; - for (const auto & host : unfiltered_hosts) - paths.push_back(zookeeper_path + "/replicas/" + host + "/replica_group"); - - auto replica_groups = zookeeper->tryGet(paths); - - for (size_t i = 0; i < paths.size(); ++i) + if (all_groups) { - if (replica_groups[i].data == replica_group_name) - hosts.push_back(unfiltered_hosts[i]); + hosts = unfiltered_hosts; + } + else + { + hosts.clear(); + std::vector paths; + for (const auto & host : unfiltered_hosts) + paths.push_back(zookeeper_path + "/replicas/" + host + "/replica_group"); + + auto replica_groups = zookeeper->tryGet(paths); + + for (size_t i = 0; i < paths.size(); ++i) + { + if (replica_groups[i].data == replica_group_name) + hosts.push_back(unfiltered_hosts[i]); + } } Int32 cversion = stat.cversion; @@ -274,6 +315,11 @@ ClusterPtr DatabaseReplicated::getClusterImpl() const bool treat_local_as_remote = false; bool treat_local_port_as_remote = getContext()->getApplicationType() == Context::ApplicationType::LOCAL; + + String cluster_name = TSA_SUPPRESS_WARNING_FOR_READ(database_name); /// FIXME + if (all_groups) + cluster_name = ALL_GROUPS_CLUSTER_PREFIX + cluster_name; + ClusterConnectionParameters params{ cluster_auth_info.cluster_username, cluster_auth_info.cluster_password, @@ -282,7 +328,7 @@ ClusterPtr 
DatabaseReplicated::getClusterImpl() const treat_local_port_as_remote, cluster_auth_info.cluster_secure_connection, Priority{1}, - TSA_SUPPRESS_WARNING_FOR_READ(database_name), /// FIXME + cluster_name, cluster_auth_info.cluster_secret}; return std::make_shared(getContext()->getSettingsRef(), shards, params); diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 55bcf963d37..761d6b4b503 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -20,6 +20,8 @@ using ClusterPtr = std::shared_ptr; class DatabaseReplicated : public DatabaseAtomic { public: + static constexpr auto ALL_GROUPS_CLUSTER_PREFIX = "all_groups."; + DatabaseReplicated(const String & name_, const String & metadata_path_, UUID uuid, const String & zookeeper_path_, const String & shard_name_, const String & replica_name_, DatabaseReplicatedSettings db_settings_, @@ -65,6 +67,7 @@ public: /// Returns cluster consisting of database replicas ClusterPtr tryGetCluster() const; + ClusterPtr tryGetAllGroupsCluster() const; void drop(ContextPtr /*context*/) override; @@ -113,8 +116,8 @@ private: ASTPtr parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query); String readMetadataFile(const String & table_name) const; - ClusterPtr getClusterImpl() const; - void setCluster(ClusterPtr && new_cluster); + ClusterPtr getClusterImpl(bool all_groups = false) const; + void setCluster(ClusterPtr && new_cluster, bool all_groups = false); void createEmptyLogEntry(const ZooKeeperPtr & current_zookeeper); @@ -155,6 +158,7 @@ private: UInt64 tables_metadata_digest TSA_GUARDED_BY(metadata_mutex); mutable ClusterPtr cluster; + mutable ClusterPtr cluster_all_groups; LoadTaskPtr startup_replicated_database_task TSA_GUARDED_BY(mutex); }; diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 6e19a77c501..31d6f7876a8 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -421,6 +421,8 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na { /// Some replica is added or removed, let's update cached cluster database->setCluster(database->getClusterImpl()); + if (!database->replica_group_name.empty()) + database->setCluster(database->getClusterImpl(/*all_groups*/ true), /*all_groups*/ true); out_reason = fmt::format("Entry {} is a dummy task", entry_name); return {}; } diff --git a/src/Databases/DatabasesCommon.cpp b/src/Databases/DatabasesCommon.cpp index 5fee14ecc2a..fd38a31da5c 100644 --- a/src/Databases/DatabasesCommon.cpp +++ b/src/Databases/DatabasesCommon.cpp @@ -41,11 +41,11 @@ void applyMetadataChangesToCreateQuery(const ASTPtr & query, const StorageInMemo throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot alter table {} because it was created AS table function" " and doesn't have structure in metadata", backQuote(ast_create_query.getTable())); - if (!has_structure && !ast_create_query.is_dictionary) + if (!has_structure && !ast_create_query.is_dictionary && !ast_create_query.isParameterizedView()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot alter table {} metadata doesn't have structure", backQuote(ast_create_query.getTable())); - if (!ast_create_query.is_dictionary) + if (!ast_create_query.is_dictionary && !ast_create_query.isParameterizedView()) { ASTPtr new_columns = InterpreterCreateQuery::formatColumns(metadata.columns); ASTPtr new_indices = 
InterpreterCreateQuery::formatIndices(metadata.secondary_indices); diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index 2842e2b8799..1816324a93b 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -511,7 +511,10 @@ MutableColumns CacheDictionary::aggregateColumns( if (default_mask) { if (key_state_from_storage.isDefault()) + { (*default_mask)[key_index] = 1; + aggregated_column->insertDefault(); + } else { (*default_mask)[key_index] = 0; diff --git a/src/Disks/IO/IOUringReader.cpp b/src/Disks/IO/IOUringReader.cpp index 6b0e3f8cc89..b0e783e11d9 100644 --- a/src/Disks/IO/IOUringReader.cpp +++ b/src/Disks/IO/IOUringReader.cpp @@ -22,7 +22,8 @@ namespace ProfileEvents extern const Event AsynchronousReaderIgnoredBytes; extern const Event IOUringSQEsSubmitted; - extern const Event IOUringSQEsResubmits; + extern const Event IOUringSQEsResubmitsAsync; + extern const Event IOUringSQEsResubmitsSync; extern const Event IOUringCQEsCompleted; extern const Event IOUringCQEsFailed; } @@ -149,10 +150,12 @@ int IOUringReader::submitToRing(EnqueuedRequest & enqueued) io_uring_prep_read(sqe, fd, request.buf, static_cast(request.size - enqueued.bytes_read), request.offset + enqueued.bytes_read); int ret = 0; - do + ret = io_uring_submit(&ring); + while (ret == -EINTR || ret == -EAGAIN) { + ProfileEvents::increment(ProfileEvents::IOUringSQEsResubmitsSync); ret = io_uring_submit(&ring); - } while (ret == -EINTR || ret == -EAGAIN); + } if (ret > 0 && !enqueued.resubmitting) { @@ -266,7 +269,7 @@ void IOUringReader::monitorRing() if (cqe->res == -EAGAIN || cqe->res == -EINTR) { enqueued.resubmitting = true; - ProfileEvents::increment(ProfileEvents::IOUringSQEsResubmits); + ProfileEvents::increment(ProfileEvents::IOUringSQEsResubmitsAsync); ret = submitToRing(enqueued); if (ret <= 0) @@ -310,6 +313,7 @@ void IOUringReader::monitorRing() // potential short read, re-submit enqueued.resubmitting = true; enqueued.bytes_read += bytes_read; + ProfileEvents::increment(ProfileEvents::IOUringSQEsResubmitsAsync); ret = submitToRing(enqueued); if (ret <= 0) diff --git a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp index 2c90e3a9003..a2d21cf49c2 100644 --- a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp +++ b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp @@ -19,11 +19,15 @@ namespace ProfileEvents namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + struct WriteBufferFromAzureBlobStorage::PartData { Memory<> memory; size_t data_size = 0; - std::string block_id; }; BufferAllocationPolicyPtr createBufferAllocationPolicy(const AzureObjectStorageSettings & settings) @@ -119,22 +123,30 @@ void WriteBufferFromAzureBlobStorage::preFinalize() // This function should not be run again is_prefinalized = true; + hidePartialData(); + + if (hidden_size > 0) + detachBuffer(); + + setFakeBufferWhenPreFinalized(); + /// If there is only one block and size is less than or equal to max_single_part_upload_size /// then we use single part upload instead of multi part upload - if (buffer_allocation_policy->getBufferNumber() == 1) + if (block_ids.empty() && detached_part_data.size() == 1 && detached_part_data.front().data_size <= max_single_part_upload_size) { - size_t data_size = size_t(position() - memory.data()); - if (data_size <= max_single_part_upload_size) - { - auto block_blob_client = blob_container_client->GetBlockBlobClient(blob_path); - 
Azure::Core::IO::MemoryBodyStream memory_stream(reinterpret_cast(memory.data()), data_size); - execWithRetry([&](){ block_blob_client.Upload(memory_stream); }, max_unexpected_write_error_retries, data_size); - LOG_TRACE(log, "Committed single block for blob `{}`", blob_path); - return; - } - } + auto part_data = std::move(detached_part_data.front()); + auto block_blob_client = blob_container_client->GetBlockBlobClient(blob_path); + Azure::Core::IO::MemoryBodyStream memory_stream(reinterpret_cast(part_data.memory.data()), part_data.data_size); + execWithRetry([&](){ block_blob_client.Upload(memory_stream); }, max_unexpected_write_error_retries, part_data.data_size); + LOG_TRACE(log, "Committed single block for blob `{}`", blob_path); - writePart(); + detached_part_data.pop_front(); + return; + } + else + { + writeMultipartUpload(); + } } void WriteBufferFromAzureBlobStorage::finalizeImpl() @@ -144,9 +156,13 @@ void WriteBufferFromAzureBlobStorage::finalizeImpl() if (!is_prefinalized) preFinalize(); + chassert(offset() == 0); + chassert(hidden_size == 0); + + task_tracker->waitAll(); + if (!block_ids.empty()) { - task_tracker->waitAll(); auto block_blob_client = blob_container_client->GetBlockBlobClient(blob_path); execWithRetry([&](){ block_blob_client.CommitBlockList(block_ids); }, max_unexpected_write_error_retries); LOG_TRACE(log, "Committed {} blocks for blob `{}`", block_ids.size(), blob_path); @@ -155,14 +171,66 @@ void WriteBufferFromAzureBlobStorage::finalizeImpl() void WriteBufferFromAzureBlobStorage::nextImpl() { + if (is_prefinalized) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Cannot write to prefinalized buffer for Azure Blob Storage, the file could have been created"); + task_tracker->waitIfAny(); - writePart(); + + hidePartialData(); + + reallocateFirstBuffer(); + + if (available() > 0) + return; + + detachBuffer(); + + if (detached_part_data.size() > 1) + writeMultipartUpload(); + allocateBuffer(); } +void WriteBufferFromAzureBlobStorage::hidePartialData() +{ + if (write_settings.remote_throttler) + write_settings.remote_throttler->add(offset(), ProfileEvents::RemoteWriteThrottlerBytes, ProfileEvents::RemoteWriteThrottlerSleepMicroseconds); + + chassert(memory.size() >= hidden_size + offset()); + + hidden_size += offset(); + chassert(memory.data() + hidden_size == working_buffer.begin() + offset()); + chassert(memory.data() + hidden_size == position()); + + WriteBuffer::set(memory.data() + hidden_size, memory.size() - hidden_size); + chassert(offset() == 0); +} + +void WriteBufferFromAzureBlobStorage::reallocateFirstBuffer() +{ + chassert(offset() == 0); + + if (buffer_allocation_policy->getBufferNumber() > 1 || available() > 0) + return; + + const size_t max_first_buffer = buffer_allocation_policy->getBufferSize(); + if (memory.size() == max_first_buffer) + return; + + size_t size = std::min(memory.size() * 2, max_first_buffer); + memory.resize(size); + + WriteBuffer::set(memory.data() + hidden_size, memory.size() - hidden_size); + chassert(offset() == 0); +} + void WriteBufferFromAzureBlobStorage::allocateBuffer() { buffer_allocation_policy->nextBuffer(); + chassert(0 == hidden_size); + auto size = buffer_allocation_policy->getBufferSize(); if (buffer_allocation_policy->getBufferNumber() == 1) @@ -172,30 +240,56 @@ void WriteBufferFromAzureBlobStorage::allocateBuffer() WriteBuffer::set(memory.data(), memory.size()); } -void WriteBufferFromAzureBlobStorage::writePart() +void WriteBufferFromAzureBlobStorage::detachBuffer() { - auto data_size = size_t(position() - 
memory.data()); + size_t data_size = size_t(position() - memory.data()); if (data_size == 0) return; - const std::string & block_id = block_ids.emplace_back(getRandomASCIIString(64)); - std::shared_ptr part_data = std::make_shared(std::move(memory), data_size, block_id); - WriteBuffer::set(nullptr, 0); + chassert(data_size == hidden_size); - auto upload_worker = [this, part_data] () + auto buf = std::move(memory); + + WriteBuffer::set(nullptr, 0); + total_size += hidden_size; + hidden_size = 0; + + detached_part_data.push_back({std::move(buf), data_size}); + WriteBuffer::set(nullptr, 0); +} + +void WriteBufferFromAzureBlobStorage::writePart(WriteBufferFromAzureBlobStorage::PartData && part_data) +{ + const std::string & block_id = block_ids.emplace_back(getRandomASCIIString(64)); + auto worker_data = std::make_shared>(block_id, std::move(part_data)); + + auto upload_worker = [this, worker_data] () { + auto & data_size = std::get<1>(*worker_data).data_size; + auto & data_block_id = std::get<0>(*worker_data); auto block_blob_client = blob_container_client->GetBlockBlobClient(blob_path); - Azure::Core::IO::MemoryBodyStream memory_stream(reinterpret_cast(part_data->memory.data()), part_data->data_size); - execWithRetry([&](){ block_blob_client.StageBlock(part_data->block_id, memory_stream); }, max_unexpected_write_error_retries, part_data->data_size); - - if (write_settings.remote_throttler) - write_settings.remote_throttler->add(part_data->data_size, ProfileEvents::RemoteWriteThrottlerBytes, ProfileEvents::RemoteWriteThrottlerSleepMicroseconds); + Azure::Core::IO::MemoryBodyStream memory_stream(reinterpret_cast(std::get<1>(*worker_data).memory.data()), data_size); + execWithRetry([&](){ block_blob_client.StageBlock(data_block_id, memory_stream); }, max_unexpected_write_error_retries, data_size); }; task_tracker->add(std::move(upload_worker)); } +void WriteBufferFromAzureBlobStorage::setFakeBufferWhenPreFinalized() +{ + WriteBuffer::set(fake_buffer_when_prefinalized, sizeof(fake_buffer_when_prefinalized)); +} + +void WriteBufferFromAzureBlobStorage::writeMultipartUpload() +{ + while (!detached_part_data.empty()) + { + writePart(std::move(detached_part_data.front())); + detached_part_data.pop_front(); + } +} + } #endif diff --git a/src/Disks/IO/WriteBufferFromAzureBlobStorage.h b/src/Disks/IO/WriteBufferFromAzureBlobStorage.h index 3da6d843991..10fe871a727 100644 --- a/src/Disks/IO/WriteBufferFromAzureBlobStorage.h +++ b/src/Disks/IO/WriteBufferFromAzureBlobStorage.h @@ -48,8 +48,13 @@ public: private: struct PartData; - void writePart(); + void writeMultipartUpload(); + void writePart(PartData && part_data); + void detachBuffer(); + void reallocateFirstBuffer(); void allocateBuffer(); + void hidePartialData(); + void setFakeBufferWhenPreFinalized(); void finalizeImpl() override; void execWithRetry(std::function func, size_t num_tries, size_t cost = 0); @@ -77,9 +82,16 @@ private: MemoryBufferPtr allocateBuffer() const; + char fake_buffer_when_prefinalized[1] = {}; + bool first_buffer=true; + size_t total_size = 0; + size_t hidden_size = 0; + std::unique_ptr task_tracker; + + std::deque detached_part_data; }; } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 8ead696cf78..c342929d656 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -166,6 +166,8 @@ public: return client.get(); } + bool 
supportParallelWrite() const override { return true; } + private: using SharedAzureClientPtr = std::shared_ptr; void removeObjectImpl(const StoredObject & object, const SharedAzureClientPtr & client_ptr, bool if_exists); diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index ae719f5cde4..afc13251f5b 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -382,6 +382,7 @@ void S3ObjectStorage::removeObjectsImpl(const StoredObjects & objects, bool if_e { std::vector current_chunk; String keys; + size_t first_position = current_position; for (; current_position < objects.size() && current_chunk.size() < chunk_size_limit; ++current_position) { Aws::S3::Model::ObjectIdentifier obj; @@ -407,9 +408,9 @@ void S3ObjectStorage::removeObjectsImpl(const StoredObjects & objects, bool if_e { const auto * outcome_error = outcome.IsSuccess() ? nullptr : &outcome.GetError(); auto time_now = std::chrono::system_clock::now(); - for (const auto & object : objects) + for (size_t i = first_position; i < current_position; ++i) blob_storage_log->addEvent(BlobStorageLogElement::EventType::Delete, - uri.bucket, object.remote_path, object.local_path, object.bytes_size, + uri.bucket, objects[i].remote_path, objects[i].local_path, objects[i].bytes_size, outcome_error, time_now); } diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index a7883919c4c..e90986f2236 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -161,6 +161,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se format_settings.parquet.output_string_as_string = settings.output_format_parquet_string_as_string; format_settings.parquet.output_fixed_string_as_fixed_byte_array = settings.output_format_parquet_fixed_string_as_fixed_byte_array; format_settings.parquet.max_block_size = settings.input_format_parquet_max_block_size; + format_settings.parquet.prefer_block_bytes = settings.input_format_parquet_prefer_block_bytes; format_settings.parquet.output_compression_method = settings.output_format_parquet_compression_method; format_settings.parquet.output_compliant_nested_types = settings.output_format_parquet_compliant_nested_types; format_settings.parquet.use_custom_encoder = settings.output_format_parquet_use_custom_encoder; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index b296928e4d4..337aafbbe9c 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -265,7 +265,8 @@ struct FormatSettings bool preserve_order = false; bool use_custom_encoder = true; bool parallel_encoding = true; - UInt64 max_block_size = 8192; + UInt64 max_block_size = DEFAULT_BLOCK_SIZE; + size_t prefer_block_bytes = DEFAULT_BLOCK_SIZE * 256; ParquetVersion output_version; ParquetCompression output_compression_method = ParquetCompression::SNAPPY; bool output_compliant_nested_types = true; diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp index 6cbcae2bebe..31faea2e13e 100644 --- a/src/Formats/SchemaInferenceUtils.cpp +++ b/src/Formats/SchemaInferenceUtils.cpp @@ -879,11 +879,11 @@ namespace } template - bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings) + bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings, bool & has_fractional) { if (is_json || settings.try_infer_exponent_floats) - return tryReadFloatText(value, buf); - 
return tryReadFloatTextNoExponent(value, buf); + return tryReadFloatTextExt(value, buf, has_fractional); + return tryReadFloatTextExtNoExponent(value, buf, has_fractional); } template @@ -893,46 +893,31 @@ namespace return nullptr; Float64 tmp_float; + bool has_fractional; if (settings.try_infer_integers) { /// If we read from String, we can do it in a more efficient way. if (auto * string_buf = dynamic_cast(&buf)) { /// Remember the pointer to the start of the number to rollback to it. - char * number_start = buf.position(); - Int64 tmp_int; - bool read_int = tryReadIntText(tmp_int, buf); - /// If we reached eof, it cannot be float (it requires no less data than integer) - if (buf.eof()) - return read_int ? std::make_shared() : nullptr; - - char * int_end = buf.position(); /// We can safely get back to the start of the number, because we read from a string and we didn't reach eof. - buf.position() = number_start; + char * number_start = buf.position(); - bool read_uint = false; - char * uint_end = nullptr; - /// In case of Int64 overflow we can try to infer UInt64. - if (!read_int) - { - UInt64 tmp_uint; - read_uint = tryReadIntText(tmp_uint, buf); - /// If we reached eof, it cannot be float (it requires no less data than integer) - if (buf.eof()) - return read_uint ? std::make_shared() : nullptr; - - uint_end = buf.position(); - buf.position() = number_start; - } - - if (tryReadFloat(tmp_float, buf, settings)) - { - if (read_int && buf.position() == int_end) - return std::make_shared(); - if (read_uint && buf.position() == uint_end) - return std::make_shared(); + /// NOTE: it may break parsing of tryReadFloat() != tryReadIntText() + parsing of '.'/'e' + /// But, for now it is true + if (tryReadFloat(tmp_float, buf, settings, has_fractional) && has_fractional) return std::make_shared(); - } + + Int64 tmp_int; + buf.position() = number_start; + if (tryReadIntText(tmp_int, buf)) + return std::make_shared(); + + /// In case of Int64 overflow we can try to infer UInt64. + UInt64 tmp_uint; + buf.position() = number_start; + if (tryReadIntText(tmp_uint, buf)) + return std::make_shared(); return nullptr; } @@ -942,36 +927,22 @@ namespace /// and then as float. PeekableReadBuffer peekable_buf(buf); PeekableReadBufferCheckpoint checkpoint(peekable_buf); - Int64 tmp_int; - bool read_int = tryReadIntText(tmp_int, peekable_buf); - auto * int_end = peekable_buf.position(); - peekable_buf.rollbackToCheckpoint(true); - bool read_uint = false; - char * uint_end = nullptr; - /// In case of Int64 overflow we can try to infer UInt64. - if (!read_int) - { - PeekableReadBufferCheckpoint new_checkpoint(peekable_buf); - UInt64 tmp_uint; - read_uint = tryReadIntText(tmp_uint, peekable_buf); - uint_end = peekable_buf.position(); - peekable_buf.rollbackToCheckpoint(true); - } - - if (tryReadFloat(tmp_float, peekable_buf, settings)) - { - /// Float parsing reads no fewer bytes than integer parsing, - /// so position of the buffer is either the same, or further. - /// If it's the same, then it's integer. 
- if (read_int && peekable_buf.position() == int_end) - return std::make_shared(); - if (read_uint && peekable_buf.position() == uint_end) - return std::make_shared(); + if (tryReadFloat(tmp_float, peekable_buf, settings, has_fractional) && has_fractional) return std::make_shared(); - } + peekable_buf.rollbackToCheckpoint(/* drop= */ false); + + Int64 tmp_int; + if (tryReadIntText(tmp_int, peekable_buf)) + return std::make_shared(); + peekable_buf.rollbackToCheckpoint(/* drop= */ true); + + /// In case of Int64 overflow we can try to infer UInt64. + UInt64 tmp_uint; + if (tryReadIntText(tmp_uint, peekable_buf)) + return std::make_shared(); } - else if (tryReadFloat(tmp_float, buf, settings)) + else if (tryReadFloat(tmp_float, buf, settings, has_fractional)) { return std::make_shared(); } @@ -1004,7 +975,8 @@ namespace buf.position() = buf.buffer().begin(); Float64 tmp; - if (tryReadFloat(tmp, buf, settings) && buf.eof()) + bool has_fractional; + if (tryReadFloat(tmp, buf, settings, has_fractional) && buf.eof()) return std::make_shared(); return nullptr; diff --git a/src/Functions/LeastGreatestGeneric.h b/src/Functions/LeastGreatestGeneric.h index 9073f14d679..bbab001b00d 100644 --- a/src/Functions/LeastGreatestGeneric.h +++ b/src/Functions/LeastGreatestGeneric.h @@ -111,7 +111,7 @@ public: argument_types.push_back(argument.type); /// More efficient specialization for two numeric arguments. - if (arguments.size() == 2 && isNumber(arguments[0].type) && isNumber(arguments[1].type)) + if (arguments.size() == 2 && isNumber(removeNullable(arguments[0].type)) && isNumber(removeNullable(arguments[1].type))) return std::make_unique(SpecializedFunction::create(context), argument_types, return_type); return std::make_unique( @@ -123,7 +123,7 @@ public: if (types.empty()) throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} cannot be called without arguments", getName()); - if (types.size() == 2 && isNumber(types[0]) && isNumber(types[1])) + if (types.size() == 2 && isNumber(removeNullable(types[0])) && isNumber(removeNullable(types[1]))) return SpecializedFunction::create(context)->getReturnTypeImpl(types); return getLeastSupertype(types); diff --git a/src/Functions/isNotNull.cpp b/src/Functions/isNotNull.cpp index ea95a5c2b1c..a10e7ebd40c 100644 --- a/src/Functions/isNotNull.cpp +++ b/src/Functions/isNotNull.cpp @@ -29,6 +29,18 @@ public: return name; } + ColumnPtr getConstantResultForNonConstArguments(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type) const override + { + const ColumnWithTypeAndName & elem = arguments[0]; + if (elem.type->onlyNull()) + return result_type->createColumnConst(1, UInt8(0)); + + if (canContainNull(*elem.type)) + return nullptr; + + return result_type->createColumnConst(1, UInt8(1)); + } + size_t getNumberOfArguments() const override { return 1; } bool useDefaultImplementationForNulls() const override { return false; } bool useDefaultImplementationForConstants() const override { return true; } diff --git a/src/Functions/isNull.cpp b/src/Functions/isNull.cpp index a98ff2ab8e8..95d659b103b 100644 --- a/src/Functions/isNull.cpp +++ b/src/Functions/isNull.cpp @@ -31,6 +31,18 @@ public: return name; } + ColumnPtr getConstantResultForNonConstArguments(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type) const override + { + const ColumnWithTypeAndName & elem = arguments[0]; + if (elem.type->onlyNull()) + return result_type->createColumnConst(1, UInt8(1)); + + if (canContainNull(*elem.type)) + return nullptr; + 
+ return result_type->createColumnConst(1, UInt8(0)); + } + size_t getNumberOfArguments() const override { return 1; } bool useDefaultImplementationForNulls() const override { return false; } bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } diff --git a/src/Functions/isNullable.cpp b/src/Functions/isNullable.cpp index 14874487f40..b24ee4f5e73 100644 --- a/src/Functions/isNullable.cpp +++ b/src/Functions/isNullable.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace DB { @@ -23,6 +24,15 @@ public: return name; } + ColumnPtr getConstantResultForNonConstArguments(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type) const override + { + const ColumnWithTypeAndName & elem = arguments[0]; + if (elem.type->onlyNull() || canContainNull(*elem.type)) + return result_type->createColumnConst(1, UInt8(1)); + + return result_type->createColumnConst(1, UInt8(0)); + } + bool useDefaultImplementationForNulls() const override { return false; } bool useDefaultImplementationForNothing() const override { return false; } diff --git a/src/IO/Archives/IArchiveReader.h b/src/IO/Archives/IArchiveReader.h index ee516d2655b..d7758b9e401 100644 --- a/src/IO/Archives/IArchiveReader.h +++ b/src/IO/Archives/IArchiveReader.h @@ -5,6 +5,7 @@ #include #include +#include namespace DB { @@ -25,6 +26,7 @@ public: { UInt64 uncompressed_size; UInt64 compressed_size; + Poco::Timestamp last_modified; bool is_encrypted; }; diff --git a/src/IO/Archives/LibArchiveReader.cpp b/src/IO/Archives/LibArchiveReader.cpp index bec7f587180..e3fe63fa40d 100644 --- a/src/IO/Archives/LibArchiveReader.cpp +++ b/src/IO/Archives/LibArchiveReader.cpp @@ -157,6 +157,7 @@ public: file_info.emplace(); file_info->uncompressed_size = archive_entry_size(current_entry); file_info->compressed_size = archive_entry_size(current_entry); + file_info->last_modified = archive_entry_mtime(current_entry); file_info->is_encrypted = false; } diff --git a/src/IO/readFloatText.h b/src/IO/readFloatText.h index 3a21d7201a9..215bb1a3270 100644 --- a/src/IO/readFloatText.h +++ b/src/IO/readFloatText.h @@ -320,11 +320,13 @@ static inline void readUIntTextUpToNSignificantDigits(T & x, ReadBuffer & buf) template -ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) +ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in, bool & has_fractional) { static_assert(std::is_same_v || std::is_same_v, "Argument for readFloatTextImpl must be float or double"); static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' 
&& '"' < '.', "Layout of char is not like ASCII"); + has_fractional = false; + static constexpr bool throw_exception = std::is_same_v; bool negative = false; @@ -377,6 +379,7 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) if (checkChar('.', in)) { + has_fractional = true; auto after_point_count = in.count(); while (!in.eof() && *in.position() == '0') @@ -394,6 +397,7 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) { if (checkChar('e', in) || checkChar('E', in)) { + has_fractional = true; if (in.eof()) { if constexpr (throw_exception) @@ -420,10 +424,14 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) } if (after_point) + { x += static_cast(shift10(after_point, after_point_exponent)); + } if (exponent) + { x = static_cast(shift10(x, exponent)); + } if (negative) x = -x; @@ -590,8 +598,16 @@ ReturnType readFloatTextSimpleImpl(T & x, ReadBuffer & buf) template void readFloatTextPrecise(T & x, ReadBuffer & in) { readFloatTextPreciseImpl(x, in); } template bool tryReadFloatTextPrecise(T & x, ReadBuffer & in) { return readFloatTextPreciseImpl(x, in); } -template void readFloatTextFast(T & x, ReadBuffer & in) { readFloatTextFastImpl(x, in); } -template bool tryReadFloatTextFast(T & x, ReadBuffer & in) { return readFloatTextFastImpl(x, in); } +template void readFloatTextFast(T & x, ReadBuffer & in) +{ + bool has_fractional; + readFloatTextFastImpl(x, in, has_fractional); +} +template bool tryReadFloatTextFast(T & x, ReadBuffer & in) +{ + bool has_fractional; + return readFloatTextFastImpl(x, in, has_fractional); +} template void readFloatTextSimple(T & x, ReadBuffer & in) { readFloatTextSimpleImpl(x, in); } template bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { return readFloatTextSimpleImpl(x, in); } @@ -603,6 +619,21 @@ template void readFloatText(T & x, ReadBuffer & in) { readFloatText template bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); } /// Don't read exponent part of the number. 
-template bool tryReadFloatTextNoExponent(T & x, ReadBuffer & in) { return readFloatTextFastImpl(x, in); } +template bool tryReadFloatTextNoExponent(T & x, ReadBuffer & in) +{ + bool has_fractional; + return readFloatTextFastImpl(x, in, has_fractional); +} + +/// With a @has_fractional flag +/// Used for input_format_try_infer_integers +template bool tryReadFloatTextExt(T & x, ReadBuffer & in, bool & has_fractional) +{ + return readFloatTextFastImpl(x, in, has_fractional); +} +template bool tryReadFloatTextExtNoExponent(T & x, ReadBuffer & in, bool & has_fractional) +{ + return readFloatTextFastImpl(x, in, has_fractional); +} } diff --git a/src/Interpreters/ClusterProxy/executeQuery.cpp b/src/Interpreters/ClusterProxy/executeQuery.cpp index 13e6fa87051..91c0c592f28 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.cpp +++ b/src/Interpreters/ClusterProxy/executeQuery.cpp @@ -22,7 +22,9 @@ #include #include #include - +#include +#include +#include namespace DB { @@ -505,6 +507,41 @@ void executeQueryWithParallelReplicas( query_plan.addStep(std::move(read_from_remote)); } +void executeQueryWithParallelReplicas( + QueryPlan & query_plan, + const StorageID & storage_id, + QueryProcessingStage::Enum processed_stage, + const QueryTreeNodePtr & query_tree, + const PlannerContextPtr & planner_context, + ContextPtr context, + std::shared_ptr storage_limits) +{ + QueryTreeNodePtr modified_query_tree = query_tree->clone(); + rewriteJoinToGlobalJoin(modified_query_tree, context); + modified_query_tree = buildQueryTreeForShard(planner_context, modified_query_tree); + + auto header + = InterpreterSelectQueryAnalyzer::getSampleBlock(modified_query_tree, context, SelectQueryOptions(processed_stage).analyze()); + auto modified_query_ast = queryNodeToDistributedSelectQuery(modified_query_tree); + + executeQueryWithParallelReplicas(query_plan, storage_id, header, processed_stage, modified_query_ast, context, storage_limits); +} + +void executeQueryWithParallelReplicas( + QueryPlan & query_plan, + const StorageID & storage_id, + QueryProcessingStage::Enum processed_stage, + const ASTPtr & query_ast, + ContextPtr context, + std::shared_ptr storage_limits) +{ + auto modified_query_ast = ClusterProxy::rewriteSelectQuery( + context, query_ast, storage_id.database_name, storage_id.table_name, /*remote_table_function_ptr*/ nullptr); + auto header = InterpreterSelectQuery(modified_query_ast, context, SelectQueryOptions(processed_stage).analyze()).getSampleBlock(); + + executeQueryWithParallelReplicas(query_plan, storage_id, header, processed_stage, modified_query_ast, context, storage_limits); +} + } } diff --git a/src/Interpreters/ClusterProxy/executeQuery.h b/src/Interpreters/ClusterProxy/executeQuery.h index 284fea05135..6548edf8939 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.h +++ b/src/Interpreters/ClusterProxy/executeQuery.h @@ -24,6 +24,12 @@ struct StorageID; struct StorageLimits; using StorageLimitsList = std::list; +class IQueryTreeNode; +using QueryTreeNodePtr = std::shared_ptr; + +class PlannerContext; +using PlannerContextPtr = std::shared_ptr; + namespace ClusterProxy { @@ -60,7 +66,6 @@ void executeQuery( AdditionalShardFilterGenerator shard_filter_generator, bool is_remote_function); - void executeQueryWithParallelReplicas( QueryPlan & query_plan, const StorageID & storage_id, @@ -69,6 +74,23 @@ void executeQueryWithParallelReplicas( const ASTPtr & query_ast, ContextPtr context, std::shared_ptr storage_limits); + +void executeQueryWithParallelReplicas( + QueryPlan & query_plan, + 
const StorageID & storage_id, + QueryProcessingStage::Enum processed_stage, + const ASTPtr & query_ast, + ContextPtr context, + std::shared_ptr storage_limits); + +void executeQueryWithParallelReplicas( + QueryPlan & query_plan, + const StorageID & storage_id, + QueryProcessingStage::Enum processed_stage, + const QueryTreeNodePtr & query_tree, + const PlannerContextPtr & planner_context, + ContextPtr context, + std::shared_ptr storage_limits); } } diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 06b3adb328d..67ea069d46d 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -610,6 +611,8 @@ struct ContextSharedPart : boost::noncopyable LOG_TRACE(log, "Shutting down database catalog"); DatabaseCatalog::shutdown(); + NamedCollectionFactory::instance().shutdown(); + delete_async_insert_queue.reset(); SHUTDOWN(log, "merges executor", merge_mutate_executor, wait()); @@ -740,12 +743,18 @@ struct ContextSharedPart : boost::noncopyable void initializeTraceCollector(std::shared_ptr trace_log) { - if (!trace_log) - return; + if (!trace_collector.has_value()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "TraceCollector needs to be first created before initialization"); + + trace_collector->initialize(trace_log); + } + + void createTraceCollector() + { if (hasTraceCollector()) return; - trace_collector.emplace(std::move(trace_log)); + trace_collector.emplace(); } void addWarningMessage(const String & message) TSA_REQUIRES(mutex) @@ -3891,6 +3900,11 @@ void Context::initializeSystemLogs() }); } +void Context::createTraceCollector() +{ + shared->createTraceCollector(); +} + void Context::initializeTraceCollector() { shared->initializeTraceCollector(getTraceLog()); diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 87a7baa0469..68f37377926 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -1077,6 +1077,8 @@ public: void initializeSystemLogs(); /// Call after initialization before using trace collector. + void createTraceCollector(); + void initializeTraceCollector(); /// Call after unexpected crash happen. 
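The Context changes above split trace-collector startup into two phases: createTraceCollector() constructs the collector (opening the pipe and starting its thread) before any system log exists, while initializeTraceCollector() later hands it the TraceLog. A minimal sketch of the intended call order on the server side (hypothetical caller code, not the actual Server.cpp changes):

    ContextMutablePtr context = /* global context, elided */;
    context->createTraceCollector();      // pipe + collector thread, no TraceLog yet;
                                          // profiler samples are read but dropped
    context->initializeSystemLogs();      // creates the system logs, including TraceLog
    context->initializeTraceCollector();  // attaches TraceLog to the running collector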
diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index a37b4db029a..6c346836ed8 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -568,8 +568,21 @@ void ZooKeeperMetadataTransaction::commit() ClusterPtr tryGetReplicatedDatabaseCluster(const String & cluster_name) { - if (const auto * replicated_db = dynamic_cast(DatabaseCatalog::instance().tryGetDatabase(cluster_name).get())) - return replicated_db->tryGetCluster(); + String name = cluster_name; + bool all_groups = false; + if (name.starts_with(DatabaseReplicated::ALL_GROUPS_CLUSTER_PREFIX)) + { + name = name.substr(strlen(DatabaseReplicated::ALL_GROUPS_CLUSTER_PREFIX)); + all_groups = true; + } + + if (const auto * replicated_db = dynamic_cast(DatabaseCatalog::instance().tryGetDatabase(name).get())) + { + if (all_groups) + return replicated_db->tryGetAllGroupsCluster(); + else + return replicated_db->tryGetCluster(); + } return {}; } diff --git a/src/Interpreters/InterpreterAlterNamedCollectionQuery.cpp b/src/Interpreters/InterpreterAlterNamedCollectionQuery.cpp index a4e86879596..79a17fd1844 100644 --- a/src/Interpreters/InterpreterAlterNamedCollectionQuery.cpp +++ b/src/Interpreters/InterpreterAlterNamedCollectionQuery.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace DB @@ -23,7 +23,7 @@ BlockIO InterpreterAlterNamedCollectionQuery::execute() return executeDDLQueryOnCluster(query_ptr, current_context, params); } - NamedCollectionUtils::updateFromSQL(query, current_context); + NamedCollectionFactory::instance().updateFromSQL(query); return {}; } diff --git a/src/Interpreters/InterpreterCreateNamedCollectionQuery.cpp b/src/Interpreters/InterpreterCreateNamedCollectionQuery.cpp index 41e87bb73dd..c71441daa8c 100644 --- a/src/Interpreters/InterpreterCreateNamedCollectionQuery.cpp +++ b/src/Interpreters/InterpreterCreateNamedCollectionQuery.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace DB @@ -23,7 +23,7 @@ BlockIO InterpreterCreateNamedCollectionQuery::execute() return executeDDLQueryOnCluster(query_ptr, current_context, params); } - NamedCollectionUtils::createFromSQL(query, current_context); + NamedCollectionFactory::instance().createFromSQL(query); return {}; } diff --git a/src/Interpreters/InterpreterDropNamedCollectionQuery.cpp b/src/Interpreters/InterpreterDropNamedCollectionQuery.cpp index baadc85f443..2edaef1b2f2 100644 --- a/src/Interpreters/InterpreterDropNamedCollectionQuery.cpp +++ b/src/Interpreters/InterpreterDropNamedCollectionQuery.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace DB @@ -23,7 +23,7 @@ BlockIO InterpreterDropNamedCollectionQuery::execute() return executeDDLQueryOnCluster(query_ptr, current_context, params); } - NamedCollectionUtils::removeFromSQL(query, current_context); + NamedCollectionFactory::instance().removeFromSQL(query); return {}; } diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index b72399df2c1..1ce4fe9a377 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -2374,49 +2374,6 @@ UInt64 InterpreterSelectQuery::maxBlockSizeByLimit() const return 0; } -/** Storages can rely that filters that for storage will be available for analysis before - * plan is fully constructed and optimized. - * - * StorageMerge common header calculation and prewhere push-down relies on this. 
- * - * This is similar to Planner::collectFiltersForAnalysis - */ -void collectFiltersForAnalysis( - const ASTPtr & query_ptr, - const ContextPtr & query_context, - const StorageSnapshotPtr & storage_snapshot, - const SelectQueryOptions & options, - SelectQueryInfo & query_info) -{ - auto get_column_options = GetColumnsOptions(GetColumnsOptions::All).withExtendedObjects().withVirtuals(); - - auto dummy = std::make_shared( - storage_snapshot->storage.getStorageID(), ColumnsDescription(storage_snapshot->getColumns(get_column_options)), storage_snapshot); - - QueryPlan query_plan; - InterpreterSelectQuery(query_ptr, query_context, dummy, dummy->getInMemoryMetadataPtr(), options).buildQueryPlan(query_plan); - - auto optimization_settings = QueryPlanOptimizationSettings::fromContext(query_context); - query_plan.optimize(optimization_settings); - - std::vector nodes_to_process; - nodes_to_process.push_back(query_plan.getRootNode()); - - while (!nodes_to_process.empty()) - { - const auto * node_to_process = nodes_to_process.back(); - nodes_to_process.pop_back(); - nodes_to_process.insert(nodes_to_process.end(), node_to_process->children.begin(), node_to_process->children.end()); - - auto * read_from_dummy = typeid_cast(node_to_process->step.get()); - if (!read_from_dummy) - continue; - - query_info.filter_actions_dag = read_from_dummy->getFilterActionsDAG(); - query_info.optimized_prewhere_info = read_from_dummy->getPrewhereInfo(); - } -} - void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum processing_stage, QueryPlan & query_plan) { auto & query = getSelectQuery(); @@ -2546,10 +2503,6 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc } else if (storage) { - if (shouldMoveToPrewhere() && settings.query_plan_optimize_prewhere && settings.query_plan_enable_optimizations - && typeid_cast(storage.get())) - collectFiltersForAnalysis(query_ptr, context, storage_snapshot, options, query_info); - /// Table. if (max_streams == 0) max_streams = 1; diff --git a/src/Interpreters/SystemLog.cpp b/src/Interpreters/SystemLog.cpp index 5e0ce2cb0de..3b25deeb59d 100644 --- a/src/Interpreters/SystemLog.cpp +++ b/src/Interpreters/SystemLog.cpp @@ -504,6 +504,10 @@ void SystemLog::flushImpl(const std::vector & to_flush, Block block(std::move(log_element_columns)); MutableColumns columns = block.mutateColumns(); + + for (auto & column : columns) + column->reserve(to_flush.size()); + for (const auto & elem : to_flush) elem.appendToBlock(columns); @@ -532,7 +536,8 @@ void SystemLog::flushImpl(const std::vector & to_flush, } catch (...) 
{ - tryLogCurrentException(__PRETTY_FUNCTION__); + tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("Failed to flush system log {} with {} entries up to offset {}", + table_id.getNameForLogs(), to_flush.size(), to_flush_end)); } queue->confirm(to_flush_end); diff --git a/src/Interpreters/TraceCollector.cpp b/src/Interpreters/TraceCollector.cpp index 8e9c397b7a1..77f70d754c8 100644 --- a/src/Interpreters/TraceCollector.cpp +++ b/src/Interpreters/TraceCollector.cpp @@ -1,5 +1,4 @@ -#include "TraceCollector.h" - +#include #include #include #include @@ -14,8 +13,12 @@ namespace DB { -TraceCollector::TraceCollector(std::shared_ptr trace_log_) - : trace_log(std::move(trace_log_)) +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +TraceCollector::TraceCollector() { TraceSender::pipe.open(); @@ -28,6 +31,23 @@ TraceCollector::TraceCollector(std::shared_ptr trace_log_) thread = ThreadFromGlobalPool(&TraceCollector::run, this); } +void TraceCollector::initialize(std::shared_ptr trace_log_) +{ + if (is_trace_log_initialized) + throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "TraceCollector is already initialized"); + + trace_log_ptr = trace_log_; + is_trace_log_initialized.store(true, std::memory_order_release); +} + +std::shared_ptr TraceCollector::getTraceLog() +{ + if (!is_trace_log_initialized.load(std::memory_order_acquire)) + return nullptr; + + return trace_log_ptr; +} + void TraceCollector::tryClosePipe() { try @@ -120,7 +140,7 @@ void TraceCollector::run() ProfileEvents::Count increment; readPODBinary(increment, in); - if (trace_log) + if (auto trace_log = getTraceLog()) { // time and time_in_microseconds are both being constructed from the same timespec so that the // times will be equal up to the precision of a second. diff --git a/src/Interpreters/TraceCollector.h b/src/Interpreters/TraceCollector.h index 382e7511ac6..c2894394dd0 100644 --- a/src/Interpreters/TraceCollector.h +++ b/src/Interpreters/TraceCollector.h @@ -1,4 +1,5 @@ #pragma once +#include #include class StackTrace; @@ -16,11 +17,17 @@ class TraceLog; class TraceCollector { public: - explicit TraceCollector(std::shared_ptr trace_log_); + TraceCollector(); ~TraceCollector(); + void initialize(std::shared_ptr trace_log_); + private: - std::shared_ptr trace_log; + std::shared_ptr getTraceLog(); + + std::atomic is_trace_log_initialized = false; + std::shared_ptr trace_log_ptr; + ThreadFromGlobalPool thread; void tryClosePipe(); diff --git a/src/Planner/Planner.cpp b/src/Planner/Planner.cpp index b40e23a9553..15b92ed12da 100644 --- a/src/Planner/Planner.cpp +++ b/src/Planner/Planner.cpp @@ -166,7 +166,7 @@ FiltersForTableExpressionMap collectFiltersForAnalysis(const QueryTreeNodePtr & continue; const auto & storage = table_node ? table_node->getStorage() : table_function_node->getStorage(); - if (typeid_cast(storage.get()) || typeid_cast(storage.get()) + if (typeid_cast(storage.get()) || (parallel_replicas_estimation_enabled && std::dynamic_pointer_cast(storage))) { collect_filters = true; diff --git a/src/Planner/PlannerExpressionAnalysis.cpp b/src/Planner/PlannerExpressionAnalysis.cpp index f0a2845c3e8..060bbba1c05 100644 --- a/src/Planner/PlannerExpressionAnalysis.cpp +++ b/src/Planner/PlannerExpressionAnalysis.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include @@ -37,7 +38,7 @@ namespace * Actions before filter are added into into actions chain. * It is client responsibility to update filter analysis result if filter column must be removed after chain is finalized. 
*/ -FilterAnalysisResult analyzeFilter(const QueryTreeNodePtr & filter_expression_node, +std::optional analyzeFilter(const QueryTreeNodePtr & filter_expression_node, const ColumnsWithTypeAndName & input_columns, const PlannerContextPtr & planner_context, ActionsChain & actions_chain) @@ -45,7 +46,11 @@ FilterAnalysisResult analyzeFilter(const QueryTreeNodePtr & filter_expression_no FilterAnalysisResult result; result.filter_actions = buildActionsDAGFromExpressionNode(filter_expression_node, input_columns, planner_context); - result.filter_column_name = result.filter_actions->getOutputs().at(0)->result_name; + const auto * output = result.filter_actions->getOutputs().at(0); + if (output->column && ConstantFilterDescription(*output->column).always_true) + return {}; + + result.filter_column_name = output->result_name; actions_chain.addStep(std::make_unique(result.filter_actions)); return result; @@ -534,8 +539,11 @@ PlannerExpressionsAnalysisResult buildExpressionAnalysisResult(const QueryTreeNo if (query_node.hasWhere()) { where_analysis_result_optional = analyzeFilter(query_node.getWhere(), current_output_columns, planner_context, actions_chain); - where_action_step_index_optional = actions_chain.getLastStepIndex(); - current_output_columns = actions_chain.getLastStepAvailableOutputColumns(); + if (where_analysis_result_optional) + { + where_action_step_index_optional = actions_chain.getLastStepIndex(); + current_output_columns = actions_chain.getLastStepAvailableOutputColumns(); + } } auto aggregation_analysis_result_optional = analyzeAggregation(query_tree, current_output_columns, planner_context, actions_chain); @@ -548,8 +556,11 @@ PlannerExpressionsAnalysisResult buildExpressionAnalysisResult(const QueryTreeNo if (query_node.hasHaving()) { having_analysis_result_optional = analyzeFilter(query_node.getHaving(), current_output_columns, planner_context, actions_chain); - having_action_step_index_optional = actions_chain.getLastStepIndex(); - current_output_columns = actions_chain.getLastStepAvailableOutputColumns(); + if (having_analysis_result_optional) + { + having_action_step_index_optional = actions_chain.getLastStepIndex(); + current_output_columns = actions_chain.getLastStepAvailableOutputColumns(); + } } auto window_analysis_result_optional = analyzeWindow(query_tree, current_output_columns, planner_context, actions_chain); @@ -562,8 +573,11 @@ PlannerExpressionsAnalysisResult buildExpressionAnalysisResult(const QueryTreeNo if (query_node.hasQualify()) { qualify_analysis_result_optional = analyzeFilter(query_node.getQualify(), current_output_columns, planner_context, actions_chain); - qualify_action_step_index_optional = actions_chain.getLastStepIndex(); - current_output_columns = actions_chain.getLastStepAvailableOutputColumns(); + if (qualify_analysis_result_optional) + { + qualify_action_step_index_optional = actions_chain.getLastStepIndex(); + current_output_columns = actions_chain.getLastStepAvailableOutputColumns(); + } } auto projection_analysis_result = analyzeProjection(query_node, current_output_columns, planner_context, actions_chain); diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index efc449402b9..6ec460b0894 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -56,6 +57,7 @@ #include #include #include +#include #include #include @@ -645,7 +647,6 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres 
auto table_expression_query_info = select_query_info; table_expression_query_info.table_expression = table_expression; table_expression_query_info.filter_actions_dag = table_expression_data.getFilterActions(); - table_expression_query_info.optimized_prewhere_info = table_expression_data.getPrewhereInfo(); table_expression_query_info.analyzer_can_use_parallel_replicas_on_follower = table_node == planner_context->getGlobalPlannerContext()->parallel_replicas_table; size_t max_streams = settings.max_threads; @@ -769,37 +770,6 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres { if (!select_query_options.only_analyze) { - auto storage_merge_tree = std::dynamic_pointer_cast(storage); - if (storage_merge_tree && query_context->canUseParallelReplicasOnInitiator() - && settings.parallel_replicas_min_number_of_rows_per_replica > 0) - { - UInt64 rows_to_read - = storage_merge_tree->estimateNumberOfRowsToRead(query_context, storage_snapshot, table_expression_query_info); - - if (max_block_size_limited && (max_block_size_limited < rows_to_read)) - rows_to_read = max_block_size_limited; - - size_t number_of_replicas_to_use = rows_to_read / settings.parallel_replicas_min_number_of_rows_per_replica; - LOG_TRACE( - getLogger("Planner"), - "Estimated {} rows to read. It is enough work for {} parallel replicas", - rows_to_read, - number_of_replicas_to_use); - - if (number_of_replicas_to_use <= 1) - { - planner_context->getMutableQueryContext()->setSetting( - "allow_experimental_parallel_reading_from_replicas", Field(0)); - planner_context->getMutableQueryContext()->setSetting("max_parallel_replicas", UInt64{1}); - LOG_DEBUG(getLogger("Planner"), "Disabling parallel replicas because there aren't enough rows to read"); - } - else if (number_of_replicas_to_use < settings.max_parallel_replicas) - { - planner_context->getMutableQueryContext()->setSetting("max_parallel_replicas", number_of_replicas_to_use); - LOG_DEBUG(getLogger("Planner"), "Reducing the number of replicas to use to {}", number_of_replicas_to_use); - } - } - auto & prewhere_info = table_expression_query_info.prewhere_info; const auto & prewhere_actions = table_expression_data.getPrewhereFilterActions(); @@ -897,6 +867,96 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres max_block_size, max_streams); + auto parallel_replicas_enabled_for_storage = [](const StoragePtr & table, const Settings & query_settings) + { + if (!table->isMergeTree()) + return false; + + if (!table->supportsReplication() && !query_settings.parallel_replicas_for_non_replicated_merge_tree) + return false; + + return true; + }; + + /// query_plan can be empty if there is nothing to read + if (query_plan.isInitialized() && parallel_replicas_enabled_for_storage(storage, settings) && query_context->canUseParallelReplicasOnInitiator()) + { + // (1) find read step + QueryPlan::Node * node = query_plan.getRootNode(); + ReadFromMergeTree * reading = nullptr; + while (node) + { + reading = typeid_cast(node->step.get()); + if (reading) + break; + + QueryPlan::Node * prev_node = node; + if (!node->children.empty()) + { + chassert(node->children.size() == 1); + node = node->children.at(0); + } + else + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Step is expected to be ReadFromMergeTree but it's {}", + prev_node->step->getName()); + } + } + + chassert(reading); + + // (2) if it's ReadFromMergeTree - run index analysis and check number of rows to read + if (settings.parallel_replicas_min_number_of_rows_per_replica > 
0) + { + auto result_ptr = reading->selectRangesToRead(); + + UInt64 rows_to_read = result_ptr->selected_rows; + if (table_expression_query_info.limit > 0 && table_expression_query_info.limit < rows_to_read) + rows_to_read = table_expression_query_info.limit; + + if (max_block_size_limited && (max_block_size_limited < rows_to_read)) + rows_to_read = max_block_size_limited; + + const size_t number_of_replicas_to_use = rows_to_read / settings.parallel_replicas_min_number_of_rows_per_replica; + LOG_TRACE( + getLogger("Planner"), + "Estimated {} rows to read. It is enough work for {} parallel replicas", + rows_to_read, + number_of_replicas_to_use); + + if (number_of_replicas_to_use <= 1) + { + planner_context->getMutableQueryContext()->setSetting( + "allow_experimental_parallel_reading_from_replicas", Field(0)); + planner_context->getMutableQueryContext()->setSetting("max_parallel_replicas", UInt64{1}); + LOG_DEBUG(getLogger("Planner"), "Disabling parallel replicas because there aren't enough rows to read"); + } + else if (number_of_replicas_to_use < settings.max_parallel_replicas) + { + planner_context->getMutableQueryContext()->setSetting("max_parallel_replicas", number_of_replicas_to_use); + LOG_DEBUG(getLogger("Planner"), "Reducing the number of replicas to use to {}", number_of_replicas_to_use); + } + } + + // (3) if parallel replicas still enabled - replace reading step + if (planner_context->getQueryContext()->canUseParallelReplicasOnInitiator()) + { + from_stage = QueryProcessingStage::WithMergeableState; + QueryPlan query_plan_parallel_replicas; + ClusterProxy::executeQueryWithParallelReplicas( + query_plan_parallel_replicas, + storage->getStorageID(), + from_stage, + table_expression_query_info.query_tree, + table_expression_query_info.planner_context, + query_context, + table_expression_query_info.storage_limits); + query_plan = std::move(query_plan_parallel_replicas); + } + } + const auto & alias_column_expressions = table_expression_data.getAliasColumnExpressions(); if (!alias_column_expressions.empty() && query_plan.isInitialized() && from_stage == QueryProcessingStage::FetchColumns) { diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 04b3a64b6cb..e837d4d5e20 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -420,6 +420,24 @@ void ParquetBlockInputFormat::initializeIfNeeded() int num_row_groups = metadata->num_row_groups(); row_group_batches.reserve(num_row_groups); + auto adative_chunk_size = [&](int row_group_idx) -> size_t + { + size_t total_size = 0; + auto row_group_meta = metadata->RowGroup(row_group_idx); + for (int column_index : column_indices) + { + total_size += row_group_meta->ColumnChunk(column_index)->total_uncompressed_size(); + } + if (!total_size || !format_settings.parquet.prefer_block_bytes) return 0; + auto average_row_bytes = floor(static_cast(total_size) / row_group_meta->num_rows()); + // avoid inf preferred_num_rows; + if (average_row_bytes < 1) return 0; + const size_t preferred_num_rows = static_cast(floor(format_settings.parquet.prefer_block_bytes/average_row_bytes)); + const size_t MIN_ROW_NUM = 128; + // size_t != UInt64 in darwin + return std::min(std::max(preferred_num_rows, MIN_ROW_NUM), static_cast(format_settings.parquet.max_block_size)); + }; + for (int row_group = 0; row_group < num_row_groups; ++row_group) { if (skip_row_groups.contains(row_group)) @@ -439,6 +457,8 @@ void 
ParquetBlockInputFormat::initializeIfNeeded() row_group_batches.back().row_groups_idxs.push_back(row_group); row_group_batches.back().total_rows += metadata->RowGroup(row_group)->num_rows(); row_group_batches.back().total_bytes_compressed += metadata->RowGroup(row_group)->total_compressed_size(); + auto rows = adative_chunk_size(row_group); + row_group_batches.back().adaptive_chunk_size = rows ? rows : format_settings.parquet.max_block_size; } } @@ -449,7 +469,7 @@ void ParquetBlockInputFormat::initializeRowGroupBatchReader(size_t row_group_bat parquet::ArrowReaderProperties arrow_properties; parquet::ReaderProperties reader_properties(ArrowMemoryPool::instance()); arrow_properties.set_use_threads(false); - arrow_properties.set_batch_size(format_settings.parquet.max_block_size); + arrow_properties.set_batch_size(row_group_batch.adaptive_chunk_size); // When reading a row group, arrow will: // 1. Look at `metadata` to get all byte ranges it'll need to read from the file (typically one diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h index d6591f5c0a3..24735ee4371 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h @@ -208,6 +208,8 @@ private: size_t total_rows = 0; size_t total_bytes_compressed = 0; + size_t adaptive_chunk_size = 0; + std::vector row_groups_idxs; // These are only used by the decoding thread, so don't require locking the mutex. diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index 8ca240b3e8b..263598bdca7 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -608,6 +609,14 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes return 3; } + if (auto * read_from_merge = typeid_cast(child.get())) + { + FilterDAGInfo info{filter->getExpression(), filter->getFilterColumnName(), filter->removesFilterColumn()}; + read_from_merge->addFilter(std::move(info)); + std::swap(*parent_node, *child_node); + return 1; + } + return 0; } diff --git a/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp b/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp index 72a2027763c..1badd315200 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp @@ -4,10 +4,10 @@ #include #include #include +#include #include #include #include - namespace DB { @@ -30,7 +30,7 @@ static void removeFromOutput(ActionsDAG & dag, const std::string name) void optimizePrewhere(Stack & stack, QueryPlan::Nodes &) { - if (stack.size() < 3) + if (stack.size() < 2) return; auto & frame = stack.back(); @@ -45,6 +45,9 @@ void optimizePrewhere(Stack & stack, QueryPlan::Nodes &) if (!source_step_with_filter) return; + if (typeid_cast(frame.node->step.get())) + return; + const auto & storage_snapshot = source_step_with_filter->getStorageSnapshot(); const auto & storage = storage_snapshot->storage; if (!storage.canMoveConditionsToPrewhere()) diff --git a/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp b/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp index 4017670ad14..713f2f35fc8 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp +++ 
b/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp @@ -421,6 +421,9 @@ struct AggregateProjectionCandidates /// This flag means that DAG for projection candidate should be used in FilterStep. bool has_filter = false; + + /// If not empty, try to find exact ranges from parts to speed up trivial count queries. + String only_count_column; }; AggregateProjectionCandidates getAggregateProjectionCandidates( @@ -502,6 +505,12 @@ AggregateProjectionCandidates getAggregateProjectionCandidates( candidates.minmax_projection.emplace(std::move(minmax)); } } + else + { + /// Trivial count optimization only applies after @can_use_minmax_projection. + if (keys.empty() && aggregates.size() == 1 && typeid_cast(aggregates[0].function.get())) + candidates.only_count_column = aggregates[0].column_name; + } } if (!candidates.minmax_projection) @@ -584,13 +593,21 @@ std::optional optimizeUseAggregateProjections(QueryPlan::Node & node, Qu ContextPtr context = reading->getContext(); MergeTreeDataSelectExecutor reader(reading->getMergeTreeData()); AggregateProjectionCandidate * best_candidate = nullptr; + + /// Stores row count from exact ranges of parts. + size_t exact_count = 0; + if (candidates.minmax_projection) { best_candidate = &candidates.minmax_projection->candidate; } - else if (!candidates.real.empty()) + else if (!candidates.real.empty() || !candidates.only_count_column.empty()) { - auto ordinary_reading_select_result = reading->selectRangesToRead(); + auto ordinary_reading_select_result = reading->getAnalyzedResult(); + bool find_exact_ranges = !candidates.only_count_column.empty(); + if (!ordinary_reading_select_result || (!ordinary_reading_select_result->has_exact_ranges && find_exact_ranges)) + ordinary_reading_select_result = reading->selectRangesToRead(find_exact_ranges); + size_t ordinary_reading_marks = ordinary_reading_select_result->selected_marks; /// Nothing to read. Ignore projections. @@ -600,7 +617,49 @@ std::optional optimizeUseAggregateProjections(QueryPlan::Node & node, Qu return {}; } - const auto & parts_with_ranges = ordinary_reading_select_result->parts_with_ranges; + auto & parts_with_ranges = ordinary_reading_select_result->parts_with_ranges; + + if (!candidates.only_count_column.empty()) + { + for (auto & part_with_ranges : parts_with_ranges) + { + MarkRanges new_ranges; + auto & ranges = part_with_ranges.ranges; + const auto & exact_ranges = part_with_ranges.exact_ranges; + if (exact_ranges.empty()) + continue; + + size_t i = 0; + size_t len = exact_ranges.size(); + for (auto & range : ranges) + { + while (i < len && exact_ranges[i].begin < range.end) + { + chassert(exact_ranges[i].begin >= range.begin); + chassert(exact_ranges[i].end <= range.end); + + /// Found some marks which are not exact + if (range.begin < exact_ranges[i].begin) + new_ranges.emplace_back(range.begin, exact_ranges[i].begin); + + range.begin = exact_ranges[i].end; + ordinary_reading_marks -= exact_ranges[i].end - exact_ranges[i].begin; + exact_count += part_with_ranges.data_part->index_granularity.getRowsCountInRange(exact_ranges[i]); + ++i; + } + + /// Current range still contains some marks which are not exact + if (range.begin < range.end) + new_ranges.emplace_back(range); + } + chassert(i == len); + part_with_ranges.ranges = std::move(new_ranges); + } + + std::erase_if(parts_with_ranges, [&](const auto & part_with_ranges) { return part_with_ranges.ranges.empty(); }); + if (parts_with_ranges.empty()) + chassert(ordinary_reading_marks == 0); + } /// Selecting best candidate. 
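
/// (Aside, before the candidate loop resumes: the exact-range bookkeeping a few lines above
/// feeds the "trivial count" path - marks whose ranges are known to match the filter exactly
/// are counted via the part's index granularity and removed from the ranges that still have
/// to be read. The standalone sketch below only illustrates that subtraction; the types and
/// the fixed rows_per_mark granularity are simplifications invented for the example, not
/// code from the patch, which uses index_granularity.getRowsCountInRange on real mark ranges.)
#include <cstddef>
#include <vector>

struct MarkRange { size_t begin; size_t end; };

/// Splits exact sub-ranges out of `ranges`, returns the number of rows they cover,
/// and leaves only the non-exact remainder to be read and filtered.
/// Assumes, as the patch's chasserts do, that both lists are sorted by `begin`
/// and that every exact range is a sub-range of some range.
size_t subtractExactRanges(std::vector<MarkRange> & ranges, const std::vector<MarkRange> & exact, size_t rows_per_mark)
{
    std::vector<MarkRange> remainder;
    size_t exact_rows = 0;
    size_t i = 0;
    for (auto range : ranges)
    {
        while (i < exact.size() && exact[i].begin < range.end)
        {
            if (range.begin < exact[i].begin)
                remainder.push_back({range.begin, exact[i].begin}); /// non-exact prefix still needs reading
            exact_rows += (exact[i].end - exact[i].begin) * rows_per_mark; /// counted without reading columns
            range.begin = exact[i].end;
            ++i;
        }
        if (range.begin < range.end)
            remainder.push_back(range); /// non-exact tail still needs reading
    }
    ranges = std::move(remainder);
    return exact_rows;
}
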
for (auto & candidate : candidates.real) @@ -630,8 +689,20 @@ std::optional optimizeUseAggregateProjections(QueryPlan::Node & node, Qu if (!best_candidate) { - reading->setAnalyzedResult(std::move(ordinary_reading_select_result)); - return {}; + if (exact_count > 0) + { + if (ordinary_reading_marks > 0) + { + ordinary_reading_select_result->selected_marks = ordinary_reading_marks; + ordinary_reading_select_result->selected_rows -= exact_count; + reading->setAnalyzedResult(std::move(ordinary_reading_select_result)); + } + } + else + { + reading->setAnalyzedResult(std::move(ordinary_reading_select_result)); + return {}; + } } } else @@ -639,10 +710,11 @@ std::optional optimizeUseAggregateProjections(QueryPlan::Node & node, Qu return {}; } - chassert(best_candidate != nullptr); - QueryPlanStepPtr projection_reading; bool has_ordinary_parts; + String selected_projection_name; + if (best_candidate) + selected_projection_name = best_candidate->projection->name; /// Add reading from projection step. if (candidates.minmax_projection) @@ -654,6 +726,32 @@ std::optional optimizeUseAggregateProjections(QueryPlan::Node & node, Qu projection_reading = std::make_unique(std::move(pipe)); has_ordinary_parts = false; } + else if (best_candidate == nullptr) + { + chassert(exact_count > 0); + + auto agg_count = std::make_shared(DataTypes{}); + + std::vector state(agg_count->sizeOfData()); + AggregateDataPtr place = state.data(); + agg_count->create(place); + SCOPE_EXIT_MEMORY_SAFE(agg_count->destroy(place)); + agg_count->set(place, exact_count); + + auto column = ColumnAggregateFunction::create(agg_count); + column->insertFrom(place); + + Block block_with_count{ + {std::move(column), + std::make_shared(agg_count, DataTypes{}, Array{}), + candidates.only_count_column}}; + + Pipe pipe(std::make_shared(std::move(block_with_count))); + projection_reading = std::make_unique(std::move(pipe)); + + selected_projection_name = "Optimized trivial count"; + has_ordinary_parts = reading->getAnalyzedResult() != nullptr; + } else { auto storage_snapshot = reading->getStorageSnapshot(); @@ -694,46 +792,54 @@ std::optional optimizeUseAggregateProjections(QueryPlan::Node & node, Qu context->getQueryContext()->addQueryAccessInfo(Context::QualifiedProjectionName { .storage_id = reading->getMergeTreeData().getStorageID(), - .projection_name = best_candidate->projection->name, + .projection_name = selected_projection_name, }); } // LOG_TRACE(getLogger("optimizeUseProjections"), "Projection reading header {}", // projection_reading->getOutputStream().header.dumpStructure()); - projection_reading->setStepDescription(best_candidate->projection->name); - + projection_reading->setStepDescription(selected_projection_name); auto & projection_reading_node = nodes.emplace_back(QueryPlan::Node{.step = std::move(projection_reading)}); - auto & expr_or_filter_node = nodes.emplace_back(); - if (candidates.has_filter) + /// Root node of optimized child plan using @projection_name + QueryPlan::Node * aggregate_projection_node = nullptr; + + if (best_candidate) { - expr_or_filter_node.step = std::make_unique( - projection_reading_node.step->getOutputStream(), - best_candidate->dag, - best_candidate->dag->getOutputs().front()->result_name, - true); - } - else - expr_or_filter_node.step = std::make_unique( - projection_reading_node.step->getOutputStream(), - best_candidate->dag); + aggregate_projection_node = &nodes.emplace_back(); + if (candidates.has_filter) + { + aggregate_projection_node->step = std::make_unique( + 
projection_reading_node.step->getOutputStream(), + best_candidate->dag, + best_candidate->dag->getOutputs().front()->result_name, + true); + } + else + aggregate_projection_node->step + = std::make_unique(projection_reading_node.step->getOutputStream(), best_candidate->dag); - expr_or_filter_node.children.push_back(&projection_reading_node); + aggregate_projection_node->children.push_back(&projection_reading_node); + } + else /// trivial count optimization + { + aggregate_projection_node = &projection_reading_node; + } if (!has_ordinary_parts) { /// All parts are taken from projection - aggregating->requestOnlyMergeForAggregateProjection(expr_or_filter_node.step->getOutputStream()); - node.children.front() = &expr_or_filter_node; + aggregating->requestOnlyMergeForAggregateProjection(aggregate_projection_node->step->getOutputStream()); + node.children.front() = aggregate_projection_node; } else { - node.step = aggregating->convertToAggregatingProjection(expr_or_filter_node.step->getOutputStream()); - node.children.push_back(&expr_or_filter_node); + node.step = aggregating->convertToAggregatingProjection(aggregate_projection_node->step->getOutputStream()); + node.children.push_back(aggregate_projection_node); } - return best_candidate->projection->name; + return selected_projection_name; } } diff --git a/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp b/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp index 728aaaa6fc4..0af3869ccf1 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeUseNormalProjection.cpp @@ -139,7 +139,9 @@ std::optional optimizeUseNormalProjections(Stack & stack, QueryPlan::Nod const auto & query_info = reading->getQueryInfo(); MergeTreeDataSelectExecutor reader(reading->getMergeTreeData()); - auto ordinary_reading_select_result = reading->selectRangesToRead(); + auto ordinary_reading_select_result = reading->getAnalyzedResult(); + if (!ordinary_reading_select_result) + ordinary_reading_select_result = reading->selectRangesToRead(); size_t ordinary_reading_marks = ordinary_reading_select_result->selected_marks; /// Nothing to read. Ignore projections. diff --git a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp index 3009460a468..af1578d6af8 100644 --- a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp +++ b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp @@ -25,8 +25,7 @@ namespace QueryPlanOptimizations bool canUseProjectionForReadingStep(ReadFromMergeTree * reading) { - /// Probably some projection already was applied. - if (reading->hasAnalyzedResult()) + if (reading->getAnalyzedResult() && reading->getAnalyzedResult()->readFromProjection()) return false; if (reading->isQueryWithFinal()) diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 887a95da60d..e469062d7e7 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -1358,9 +1358,9 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( return merging_pipes.empty() ? 
Pipe::unitePipes(std::move(no_merging_pipes)) : Pipe::unitePipes(std::move(merging_pipes)); } -ReadFromMergeTree::AnalysisResultPtr ReadFromMergeTree::selectRangesToRead() const +ReadFromMergeTree::AnalysisResultPtr ReadFromMergeTree::selectRangesToRead(bool find_exact_ranges) const { - return selectRangesToRead(prepared_parts, alter_conversions_for_parts, false /* find_exact_ranges */); + return selectRangesToRead(prepared_parts, alter_conversions_for_parts, find_exact_ranges); } ReadFromMergeTree::AnalysisResultPtr ReadFromMergeTree::selectRangesToRead( @@ -1664,6 +1664,7 @@ ReadFromMergeTree::AnalysisResultPtr ReadFromMergeTree::selectRangesToRead( result.selected_marks_pk = sum_marks_pk; result.total_marks_pk = total_marks_pk; result.selected_rows = sum_rows; + result.has_exact_ranges = result.selected_parts == 0 || find_exact_ranges; if (query_info_.input_order_info) result.read_type = (query_info_.input_order_info->direction > 0) diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.h b/src/Processors/QueryPlan/ReadFromMergeTree.h index d1f88a6788f..caa8aa2e1bd 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.h +++ b/src/Processors/QueryPlan/ReadFromMergeTree.h @@ -100,7 +100,9 @@ public: UInt64 selected_marks_pk = 0; UInt64 total_marks_pk = 0; UInt64 selected_rows = 0; + bool has_exact_ranges = false; + bool readFromProjection() const { return !parts_with_ranges.empty() && parts_with_ranges.front().data_part->isProjectionPart(); } void checkLimits(const Settings & settings, const SelectQueryInfo & query_info_) const; }; @@ -167,7 +169,7 @@ public: AnalysisResultPtr selectRangesToRead( MergeTreeData::DataPartsVector parts, std::vector alter_conversions, bool find_exact_ranges = false) const; - AnalysisResultPtr selectRangesToRead() const; + AnalysisResultPtr selectRangesToRead(bool find_exact_ranges = false) const; StorageMetadataPtr getStorageMetadata() const { return metadata_for_reading; } @@ -182,7 +184,7 @@ public: bool requestOutputEachPartitionThroughSeparatePort(); bool willOutputEachPartitionThroughSeparatePort() const { return output_each_partition_through_separate_port; } - bool hasAnalyzedResult() const { return analyzed_result_ptr != nullptr; } + AnalysisResultPtr getAnalyzedResult() const { return analyzed_result_ptr; } void setAnalyzedResult(AnalysisResultPtr analyzed_result_ptr_) { analyzed_result_ptr = std::move(analyzed_result_ptr_); } const MergeTreeData::DataPartsVector & getParts() const { return prepared_parts; } diff --git a/src/Processors/QueryPlan/SourceStepWithFilter.h b/src/Processors/QueryPlan/SourceStepWithFilter.h index 0971b99d828..126d4824fff 100644 --- a/src/Processors/QueryPlan/SourceStepWithFilter.h +++ b/src/Processors/QueryPlan/SourceStepWithFilter.h @@ -49,11 +49,6 @@ public: filter_dags.push_back(std::move(filter_dag)); } - void addFilterFromParentStep(const ActionsDAG::Node * filter_node) - { - filter_nodes.nodes.push_back(filter_node); - } - /// Apply filters that can optimize reading from storage. 
void applyFilters() { diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 3a4c1d94750..0d491067afc 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -1613,7 +1614,10 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const } } - if (all_columns.empty()) + /// Parameterized views do not have 'columns' in their metadata + bool is_parameterized_view = table->as() && table->as()->isParameterizedView(); + + if (!is_parameterized_view && all_columns.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot DROP or CLEAR all columns"); validateColumnsDefaultsAndGetSampleBlock(default_expr_list, all_columns.getAll(), context); diff --git a/src/Storages/MergeTree/ColumnSizeEstimator.h b/src/Storages/MergeTree/ColumnSizeEstimator.h index 1307a5f493e..59a635a00fb 100644 --- a/src/Storages/MergeTree/ColumnSizeEstimator.h +++ b/src/Storages/MergeTree/ColumnSizeEstimator.h @@ -19,18 +19,18 @@ public: size_t sum_index_columns = 0; size_t sum_ordinary_columns = 0; - ColumnSizeEstimator(ColumnToSize && map_, const Names & key_columns, const Names & ordinary_columns) + ColumnSizeEstimator(ColumnToSize && map_, const NamesAndTypesList & key_columns, const NamesAndTypesList & ordinary_columns) : map(std::move(map_)) { - for (const auto & name : key_columns) + for (const auto & [name, _] : key_columns) if (!map.contains(name)) map[name] = 0; - for (const auto & name : ordinary_columns) + for (const auto & [name, _] : ordinary_columns) if (!map.contains(name)) map[name] = 0; - for (const auto & name : key_columns) + for (const auto & [name, _] : key_columns) sum_index_columns += map.at(name); - for (const auto & name : ordinary_columns) + for (const auto & [name, _] : ordinary_columns) sum_ordinary_columns += map.at(name); sum_total = std::max(static_cast(1), sum_index_columns + sum_ordinary_columns); diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index 62bc3b42d1f..56bd1181fef 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -48,59 +48,23 @@ namespace ErrorCodes extern const int SUPPORT_IS_DISABLED; } - -/// PK columns are sorted and merged, ordinary columns are gathered using info from merge step -static void extractMergingAndGatheringColumns( - const NamesAndTypesList & storage_columns, - const ExpressionActionsPtr & sorting_key_expr, - const IndicesDescription & indexes, - const MergeTreeData::MergingParams & merging_params, - NamesAndTypesList & gathering_columns, Names & gathering_column_names, - NamesAndTypesList & merging_columns, Names & merging_column_names) +static ColumnsStatistics getStatisticsForColumns( + const NamesAndTypesList & columns_to_read, + const StorageMetadataPtr & metadata_snapshot) { - Names sort_key_columns_vec = sorting_key_expr->getRequiredColumns(); - std::set key_columns(sort_key_columns_vec.cbegin(), sort_key_columns_vec.cend()); - for (const auto & index : indexes) + ColumnsStatistics all_statistics; + const auto & all_columns = metadata_snapshot->getColumns(); + + for (const auto & column : columns_to_read) { - Names index_columns_vec = index.expression->getRequiredColumns(); - std::copy(index_columns_vec.cbegin(), index_columns_vec.cend(), - std::inserter(key_columns, key_columns.end())); - } - - /// Force sign column for Collapsing mode - if (merging_params.mode == MergeTreeData::MergingParams::Collapsing) - 
key_columns.emplace(merging_params.sign_column); - - /// Force version column for Replacing mode - if (merging_params.mode == MergeTreeData::MergingParams::Replacing) - { - key_columns.emplace(merging_params.is_deleted_column); - key_columns.emplace(merging_params.version_column); - } - - /// Force sign column for VersionedCollapsing mode. Version is already in primary key. - if (merging_params.mode == MergeTreeData::MergingParams::VersionedCollapsing) - key_columns.emplace(merging_params.sign_column); - - /// Force to merge at least one column in case of empty key - if (key_columns.empty()) - key_columns.emplace(storage_columns.front().name); - - /// TODO: also force "summing" and "aggregating" columns to make Horizontal merge only for such columns - - for (const auto & column : storage_columns) - { - if (key_columns.contains(column.name)) + const auto * desc = all_columns.tryGet(column.name); + if (desc && !desc->statistics.empty()) { - merging_columns.emplace_back(column); - merging_column_names.emplace_back(column.name); - } - else - { - gathering_columns.emplace_back(column); - gathering_column_names.emplace_back(column.name); + auto statistics = MergeTreeStatisticsFactory::instance().get(desc->statistics); + all_statistics.push_back(std::move(statistics)); } } + return all_statistics; } static void addMissedColumnsToSerializationInfos( @@ -129,6 +93,77 @@ static void addMissedColumnsToSerializationInfos( } } +/// PK columns are sorted and merged, ordinary columns are gathered using info from merge step +void MergeTask::ExecuteAndFinalizeHorizontalPart::extractMergingAndGatheringColumns() const +{ + const auto & sorting_key_expr = global_ctx->metadata_snapshot->getSortingKey().expression; + Names sort_key_columns_vec = sorting_key_expr->getRequiredColumns(); + + std::set key_columns(sort_key_columns_vec.cbegin(), sort_key_columns_vec.cend()); + + /// Force sign column for Collapsing mode + if (ctx->merging_params.mode == MergeTreeData::MergingParams::Collapsing) + key_columns.emplace(ctx->merging_params.sign_column); + + /// Force version column for Replacing mode + if (ctx->merging_params.mode == MergeTreeData::MergingParams::Replacing) + { + key_columns.emplace(ctx->merging_params.is_deleted_column); + key_columns.emplace(ctx->merging_params.version_column); + } + + /// Force sign column for VersionedCollapsing mode. Version is already in primary key. + if (ctx->merging_params.mode == MergeTreeData::MergingParams::VersionedCollapsing) + key_columns.emplace(ctx->merging_params.sign_column); + + /// Force to merge at least one column in case of empty key + if (key_columns.empty()) + key_columns.emplace(global_ctx->storage_columns.front().name); + + const auto & skip_indexes = global_ctx->metadata_snapshot->getSecondaryIndices(); + + for (const auto & index : skip_indexes) + { + auto index_columns = index.expression->getRequiredColumns(); + + /// Calculate indexes that depend only on one column on vertical + /// stage and other indexes on horizonatal stage of merge. 
+ if (index_columns.size() == 1) + { + const auto & column_name = index_columns.front(); + global_ctx->skip_indexes_by_column[column_name].push_back(index); + } + else + { + std::ranges::copy(index_columns, std::inserter(key_columns, key_columns.end())); + global_ctx->merging_skip_indexes.push_back(index); + } + } + + /// TODO: also force "summing" and "aggregating" columns to make Horizontal merge only for such columns + + for (const auto & column : global_ctx->storage_columns) + { + if (key_columns.contains(column.name)) + { + global_ctx->merging_columns.emplace_back(column); + + /// If column is in horizontal stage we need to calculate its indexes on horizontal stage as well + auto it = global_ctx->skip_indexes_by_column.find(column.name); + if (it != global_ctx->skip_indexes_by_column.end()) + { + for (auto & index : it->second) + global_ctx->merging_skip_indexes.push_back(std::move(index)); + + global_ctx->skip_indexes_by_column.erase(it); + } + } + else + { + global_ctx->gathering_columns.emplace_back(column); + } + } +} bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() { @@ -196,27 +231,18 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() if (!global_ctx->parent_part) global_ctx->temporary_directory_lock = global_ctx->data->getTemporaryPartDirectoryHolder(local_tmp_part_basename); - global_ctx->all_column_names = global_ctx->metadata_snapshot->getColumns().getNamesOfPhysical(); global_ctx->storage_columns = global_ctx->metadata_snapshot->getColumns().getAllPhysical(); auto object_columns = MergeTreeData::getConcreteObjectColumns(global_ctx->future_part->parts, global_ctx->metadata_snapshot->getColumns()); - extendObjectColumns(global_ctx->storage_columns, object_columns, false); global_ctx->storage_snapshot = std::make_shared(*global_ctx->data, global_ctx->metadata_snapshot, std::move(object_columns)); - extractMergingAndGatheringColumns( - global_ctx->storage_columns, - global_ctx->metadata_snapshot->getSortingKey().expression, - global_ctx->metadata_snapshot->getSecondaryIndices(), - ctx->merging_params, - global_ctx->gathering_columns, - global_ctx->gathering_column_names, - global_ctx->merging_columns, - global_ctx->merging_column_names); + extractMergingAndGatheringColumns(); global_ctx->new_data_part->uuid = global_ctx->future_part->uuid; global_ctx->new_data_part->partition.assign(global_ctx->future_part->getPartition()); global_ctx->new_data_part->is_temp = global_ctx->parent_part == nullptr; + /// In case of replicated merge tree with zero copy replication /// Here Clickhouse claims that this new part can be deleted in temporary state without unlocking the blobs /// The blobs have to be removed along with the part, this temporary part owns them and does not share them yet. 
@@ -278,6 +304,7 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() ctx->sum_input_rows_upper_bound = global_ctx->merge_list_element_ptr->total_rows_count; ctx->sum_compressed_bytes_upper_bound = global_ctx->merge_list_element_ptr->total_size_bytes_compressed; + global_ctx->chosen_merge_algorithm = chooseMergeAlgorithm(); global_ctx->merge_list_element_ptr->merge_algorithm.store(global_ctx->chosen_merge_algorithm, std::memory_order_relaxed); @@ -298,9 +325,9 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() case MergeAlgorithm::Horizontal: { global_ctx->merging_columns = global_ctx->storage_columns; - global_ctx->merging_column_names = global_ctx->all_column_names; + global_ctx->merging_skip_indexes = global_ctx->metadata_snapshot->getSecondaryIndices(); global_ctx->gathering_columns.clear(); - global_ctx->gathering_column_names.clear(); + global_ctx->skip_indexes_by_column.clear(); break; } case MergeAlgorithm::Vertical: @@ -309,13 +336,13 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() ctx->rows_sources_write_buf = std::make_unique(*ctx->rows_sources_uncompressed_write_buf); std::map local_merged_column_to_size; - for (const MergeTreeData::DataPartPtr & part : global_ctx->future_part->parts) + for (const auto & part : global_ctx->future_part->parts) part->accumulateColumnSizes(local_merged_column_to_size); ctx->column_sizes = ColumnSizeEstimator( std::move(local_merged_column_to_size), - global_ctx->merging_column_names, - global_ctx->gathering_column_names); + global_ctx->merging_columns, + global_ctx->gathering_columns); break; } @@ -323,9 +350,6 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() throw Exception(ErrorCodes::LOGICAL_ERROR, "Merge algorithm must be chosen"); } - assert(global_ctx->gathering_columns.size() == global_ctx->gathering_column_names.size()); - assert(global_ctx->merging_columns.size() == global_ctx->merging_column_names.size()); - /// If merge is vertical we cannot calculate it ctx->blocks_are_granules_size = (global_ctx->chosen_merge_algorithm == MergeAlgorithm::Vertical); @@ -342,28 +366,25 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() /// resources for this). 
if (!ctx->need_remove_expired_values) { - size_t expired_columns = 0; auto part_serialization_infos = global_ctx->new_data_part->getSerializationInfos(); + NameSet columns_to_remove; for (auto & [column_name, ttl] : global_ctx->new_data_part->ttl_infos.columns_ttl) { if (ttl.finished()) { global_ctx->new_data_part->expired_columns.insert(column_name); LOG_TRACE(ctx->log, "Adding expired column {} for part {}", column_name, global_ctx->new_data_part->name); - std::erase(global_ctx->gathering_column_names, column_name); - std::erase(global_ctx->merging_column_names, column_name); - std::erase(global_ctx->all_column_names, column_name); + columns_to_remove.insert(column_name); part_serialization_infos.erase(column_name); - ++expired_columns; } } - if (expired_columns) + if (!columns_to_remove.empty()) { - global_ctx->gathering_columns = global_ctx->gathering_columns.filter(global_ctx->gathering_column_names); - global_ctx->merging_columns = global_ctx->merging_columns.filter(global_ctx->merging_column_names); - global_ctx->storage_columns = global_ctx->storage_columns.filter(global_ctx->all_column_names); + global_ctx->gathering_columns = global_ctx->gathering_columns.eraseNames(columns_to_remove); + global_ctx->merging_columns = global_ctx->merging_columns.eraseNames(columns_to_remove); + global_ctx->storage_columns = global_ctx->storage_columns.eraseNames(columns_to_remove); global_ctx->new_data_part->setColumns( global_ctx->storage_columns, @@ -376,8 +397,8 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() global_ctx->new_data_part, global_ctx->metadata_snapshot, global_ctx->merging_columns, - MergeTreeIndexFactory::instance().getMany(global_ctx->metadata_snapshot->getSecondaryIndices()), - MergeTreeStatisticsFactory::instance().getMany(global_ctx->metadata_snapshot->getColumns()), + MergeTreeIndexFactory::instance().getMany(global_ctx->merging_skip_indexes), + getStatisticsForColumns(global_ctx->merging_columns, global_ctx->metadata_snapshot), ctx->compression_codec, global_ctx->txn ? 
global_ctx->txn->tid : Tx::PrehistoricTID, /*reset_columns=*/ true, @@ -407,9 +428,7 @@ void MergeTask::addGatheringColumn(GlobalRuntimeContextPtr global_ctx, const Str return; global_ctx->storage_columns.emplace_back(name, type); - global_ctx->all_column_names.emplace_back(name); global_ctx->gathering_columns.emplace_back(name, type); - global_ctx->gathering_column_names.emplace_back(name); } @@ -423,7 +442,6 @@ MergeTask::StageRuntimeContextPtr MergeTask::ExecuteAndFinalizeHorizontalPart::g new_ctx->compression_codec = std::move(ctx->compression_codec); new_ctx->tmp_disk = std::move(ctx->tmp_disk); new_ctx->it_name_and_type = std::move(ctx->it_name_and_type); - new_ctx->column_num_for_vertical_merge = std::move(ctx->column_num_for_vertical_merge); new_ctx->read_with_direct_io = std::move(ctx->read_with_direct_io); new_ctx->need_sync = std::move(ctx->need_sync); @@ -510,7 +528,7 @@ bool MergeTask::VerticalMergeStage::prepareVerticalMergeForAllColumns() const size_t sum_input_rows_exact = global_ctx->merge_list_element_ptr->rows_read; size_t input_rows_filtered = *global_ctx->input_rows_filtered; - global_ctx->merge_list_element_ptr->columns_written = global_ctx->merging_column_names.size(); + global_ctx->merge_list_element_ptr->columns_written = global_ctx->merging_columns.size(); global_ctx->merge_list_element_ptr->progress.store(ctx->column_sizes->keyColumnsWeight(), std::memory_order_relaxed); ctx->rows_sources_write_buf->next(); @@ -546,14 +564,12 @@ bool MergeTask::VerticalMergeStage::prepareVerticalMergeForAllColumns() const /// Move ownership from std::unique_ptr to std::unique_ptr for CompressedReadBufferFromFile. /// First, release ownership from unique_ptr to base type. reread_buf.release(); /// NOLINT(bugprone-unused-return-value,hicpp-ignored-remove-result): we already have the pointer value in `reread_buffer_raw` + /// Then, move ownership to unique_ptr to concrete type. std::unique_ptr reread_buffer_from_file(reread_buffer_raw); + /// CompressedReadBufferFromFile expects std::unique_ptr as argument. ctx->rows_sources_read_buf = std::make_unique(std::move(reread_buffer_from_file)); - - /// For external cycle - global_ctx->gathering_column_names_size = global_ctx->gathering_column_names.size(); - ctx->column_num_for_vertical_merge = 0; ctx->it_name_and_type = global_ctx->gathering_columns.cbegin(); const auto & settings = global_ctx->context->getSettingsRef(); @@ -636,6 +652,21 @@ void MergeTask::VerticalMergeStage::prepareVerticalMergeForOneColumn() const pipe.addTransform(std::move(transform)); + MergeTreeIndices indexes_to_recalc; + auto indexes_it = global_ctx->skip_indexes_by_column.find(column_name); + + if (indexes_it != global_ctx->skip_indexes_by_column.end()) + { + indexes_to_recalc = MergeTreeIndexFactory::instance().getMany(indexes_it->second); + + pipe.addTransform(std::make_shared( + pipe.getHeader(), + indexes_it->second.getSingleExpressionForIndices(global_ctx->metadata_snapshot->getColumns(), + global_ctx->data->getContext()))); + + pipe.addTransform(std::make_shared(pipe.getHeader())); + } + ctx->column_parts_pipeline = QueryPipeline(std::move(pipe)); /// Dereference unique_ptr @@ -646,19 +677,16 @@ void MergeTask::VerticalMergeStage::prepareVerticalMergeForOneColumn() const /// Is calculated inside MergeProgressCallback. 
ctx->column_parts_pipeline.disableProfileEventUpdate(); - ctx->executor = std::make_unique(ctx->column_parts_pipeline); + NamesAndTypesList columns_list = {*ctx->it_name_and_type}; ctx->column_to = std::make_unique( global_ctx->new_data_part, global_ctx->metadata_snapshot, - ctx->executor->getHeader(), + columns_list, ctx->compression_codec, - /// we don't need to recalc indices here - /// because all of them were already recalculated and written - /// as key part of vertical merge - std::vector{}, - ColumnsStatistics{}, /// TODO(hanfei) + indexes_to_recalc, + getStatisticsForColumns(columns_list, global_ctx->metadata_snapshot), &global_ctx->written_offset_columns, global_ctx->to->getIndexGranularity()); @@ -716,8 +744,7 @@ void MergeTask::VerticalMergeStage::finalizeVerticalMergeForOneColumn() const global_ctx->merge_list_element_ptr->bytes_written_uncompressed += bytes; global_ctx->merge_list_element_ptr->progress.store(ctx->progress_before + ctx->column_sizes->columnWeight(column_name), std::memory_order_relaxed); - /// This is the external cycle increment. - ++ctx->column_num_for_vertical_merge; + /// This is the external loop increment. ++ctx->it_name_and_type; } @@ -749,9 +776,9 @@ bool MergeTask::MergeProjectionsStage::mergeMinMaxIndexAndPrepareProjections() c LOG_DEBUG(ctx->log, "Merge sorted {} rows, containing {} columns ({} merged, {} gathered) in {} sec., {} rows/sec., {}/sec.", global_ctx->merge_list_element_ptr->rows_read, - global_ctx->all_column_names.size(), - global_ctx->merging_column_names.size(), - global_ctx->gathering_column_names.size(), + global_ctx->storage_columns.size(), + global_ctx->merging_columns.size(), + global_ctx->gathering_columns.size(), elapsed_seconds, global_ctx->merge_list_element_ptr->rows_read / elapsed_seconds, ReadableSize(global_ctx->merge_list_element_ptr->bytes_read_uncompressed / elapsed_seconds)); @@ -888,7 +915,7 @@ bool MergeTask::VerticalMergeStage::executeVerticalMergeForAllColumns() const return false; /// This is the external cycle condition - if (ctx->column_num_for_vertical_merge >= global_ctx->gathering_column_names_size) + if (ctx->it_name_and_type == global_ctx->gathering_columns.end()) return false; switch (ctx->vertical_merge_one_column_state) @@ -976,7 +1003,7 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() *global_ctx->data, global_ctx->storage_snapshot, part, - global_ctx->merging_column_names, + global_ctx->merging_columns.getNames(), /*mark_ranges=*/ {}, global_ctx->input_rows_filtered, /*apply_deleted_mask=*/ true, @@ -1115,12 +1142,12 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() /// If deduplicate_by_columns is empty, add all columns except virtuals. 
if (global_ctx->deduplicate_by_columns.empty()) { - for (const auto & column_name : global_ctx->merging_column_names) + for (const auto & column : global_ctx->merging_columns) { - if (virtuals.tryGet(column_name, VirtualsKind::Persistent)) + if (virtuals.tryGet(column.name, VirtualsKind::Persistent)) continue; - global_ctx->deduplicate_by_columns.emplace_back(column_name); + global_ctx->deduplicate_by_columns.emplace_back(column.name); } } @@ -1141,11 +1168,13 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() builder->addTransform(std::move(transform)); } - if (global_ctx->metadata_snapshot->hasSecondaryIndices()) + if (!global_ctx->merging_skip_indexes.empty()) { - const auto & indices = global_ctx->metadata_snapshot->getSecondaryIndices(); builder->addTransform(std::make_shared( - builder->getHeader(), indices.getSingleExpressionForIndices(global_ctx->metadata_snapshot->getColumns(), global_ctx->data->getContext()))); + builder->getHeader(), + global_ctx->merging_skip_indexes.getSingleExpressionForIndices(global_ctx->metadata_snapshot->getColumns(), + global_ctx->data->getContext()))); + builder->addTransform(std::make_shared(builder->getHeader())); } diff --git a/src/Storages/MergeTree/MergeTask.h b/src/Storages/MergeTree/MergeTask.h index 1294fa30449..0e13d3aef62 100644 --- a/src/Storages/MergeTree/MergeTask.h +++ b/src/Storages/MergeTree/MergeTask.h @@ -24,6 +24,7 @@ #include #include #include +#include namespace DB @@ -164,14 +165,13 @@ private: NamesAndTypesList gathering_columns{}; NamesAndTypesList merging_columns{}; - Names gathering_column_names{}; - Names merging_column_names{}; NamesAndTypesList storage_columns{}; - Names all_column_names{}; MergeTreeData::DataPart::Checksums checksums_gathered_columns{}; + IndicesDescription merging_skip_indexes; + std::unordered_map skip_indexes_by_column; + MergeAlgorithm chosen_merge_algorithm{MergeAlgorithm::Undecided}; - size_t gathering_column_names_size{0}; std::unique_ptr horizontal_stage_progress{nullptr}; std::unique_ptr column_progress{nullptr}; @@ -232,7 +232,6 @@ private: /// Dependencies for next stages std::list::const_iterator it_name_and_type; - size_t column_num_for_vertical_merge{0}; bool need_sync{false}; }; @@ -260,12 +259,14 @@ private: MergeAlgorithm chooseMergeAlgorithm() const; void createMergedStream(); + void extractMergingAndGatheringColumns() const; void setRuntimeContext(StageRuntimeContextPtr local, StageRuntimeContextPtr global) override { ctx = static_pointer_cast(local); global_ctx = static_pointer_cast(global); } + StageRuntimeContextPtr getContextForNextStage() override; ExecuteAndFinalizeHorizontalPartRuntimeContextPtr ctx; @@ -284,7 +285,6 @@ private: CompressionCodecPtr compression_codec; TemporaryDataOnDiskPtr tmp_disk{nullptr}; std::list::const_iterator it_name_and_type; - size_t column_num_for_vertical_merge{0}; bool read_with_direct_io{false}; bool need_sync{false}; /// End dependencies from previous stages diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index cd706dab9ae..89f39c65517 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1981,6 +1981,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks, std::optional runner(getUnexpectedPartsLoadingThreadPool().get(), "UnexpectedParts"); for (auto & load_state : unexpected_data_parts) @@ -2027,6 +2031,13 @@ void MergeTreeData::loadUnexpectedDataParts() unexpected_data_parts_cv.notify_all(); } } +catch (...) 
+{ + LOG_ERROR(log, "Loading of unexpected parts failed. " + "Will terminate to avoid undefined behaviour due to inconsistent set of parts. " + "Exception: {}", getCurrentExceptionMessage(true)); + std::terminate(); +} void MergeTreeData::loadOutdatedDataParts(bool is_async) try @@ -7061,19 +7072,23 @@ QueryProcessingStage::Enum MergeTreeData::getQueryProcessingStage( const StorageSnapshotPtr &, SelectQueryInfo &) const { - if (query_context->getClientInfo().collaborate_with_initiator) - return QueryProcessingStage::Enum::FetchColumns; - - /// Parallel replicas - if (query_context->canUseParallelReplicasOnInitiator() && to_stage >= QueryProcessingStage::WithMergeableState) + /// with new analyzer, Planner make decision regarding parallel replicas usage, and so about processing stage on reading + if (!query_context->getSettingsRef().allow_experimental_analyzer) { - /// ReplicatedMergeTree - if (supportsReplication()) - return QueryProcessingStage::Enum::WithMergeableState; + if (query_context->getClientInfo().collaborate_with_initiator) + return QueryProcessingStage::Enum::FetchColumns; - /// For non-replicated MergeTree we allow them only if parallel_replicas_for_non_replicated_merge_tree is enabled - if (query_context->getSettingsRef().parallel_replicas_for_non_replicated_merge_tree) - return QueryProcessingStage::Enum::WithMergeableState; + /// Parallel replicas + if (query_context->canUseParallelReplicasOnInitiator() && to_stage >= QueryProcessingStage::WithMergeableState) + { + /// ReplicatedMergeTree + if (supportsReplication()) + return QueryProcessingStage::Enum::WithMergeableState; + + /// For non-replicated MergeTree we allow them only if parallel_replicas_for_non_replicated_merge_tree is enabled + if (query_context->getSettingsRef().parallel_replicas_for_non_replicated_merge_tree) + return QueryProcessingStage::Enum::WithMergeableState; + } } return QueryProcessingStage::Enum::FetchColumns; diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 8e304936747..5c8aa32949d 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -503,7 +503,8 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( ProfileEvents::increment(ProfileEvents::MergeTreeDataWriterBlocksAlreadySorted); } - if (data.getSettings()->allow_experimental_optimized_row_order) + if (data.getSettings()->optimize_row_order + && data.merging_params.mode == MergeTreeData::MergingParams::Mode::Ordinary) /// Nobody knows if this optimization messes up specialized MergeTree engines. { RowOrderOptimizer::optimize(block, sort_description, perm); perm_ptr = &perm; @@ -730,7 +731,8 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( ProfileEvents::increment(ProfileEvents::MergeTreeDataProjectionWriterBlocksAlreadySorted); } - if (data.getSettings()->allow_experimental_optimized_row_order) + if (data.getSettings()->optimize_row_order + && data.merging_params.mode == MergeTreeData::MergingParams::Mode::Ordinary) /// Nobody knows if this optimization messes up specialized MergeTree engines. 
{ RowOrderOptimizer::optimize(block, sort_description, perm); perm_ptr = &perm; diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 026a1da7196..6ababefa530 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -94,6 +94,7 @@ struct Settings; M(Bool, async_insert, false, "If true, data from INSERT query is stored in queue and later flushed to table in background.", 0) \ M(Bool, add_implicit_sign_column_constraint_for_collapsing_engine, false, "If true, add implicit constraint for sign column for CollapsingMergeTree engine.", 0) \ M(Milliseconds, sleep_before_commit_local_part_in_replicated_table_ms, 0, "For testing. Do not change it.", 0) \ + M(Bool, optimize_row_order, false, "Allow reshuffling of rows during part inserts and merges to improve the compressibility of the new part", 0) \ \ /* Part removal settings. */ \ M(UInt64, simultaneous_parts_removal_limit, 0, "Maximum number of parts to remove during one CleanupThread iteration (0 means unlimited).", 0) \ @@ -199,7 +200,6 @@ struct Settings; M(Bool, cache_populated_by_fetch, false, "Only available in ClickHouse Cloud", 0) \ M(Bool, force_read_through_cache_for_merges, false, "Force read-through filesystem cache for merges", 0) \ M(Bool, allow_experimental_replacing_merge_with_cleanup, false, "Allow experimental CLEANUP merges for ReplacingMergeTree with is_deleted column.", 0) \ - M(Bool, allow_experimental_optimized_row_order, false, "Allow reshuffling of rows during part inserts and merges to improve the compressibility of the new part", 0) \ \ /** Compress marks and primary key. */ \ M(Bool, compress_marks, true, "Marks support compression, reduce mark file size and speed up network transmission.", 0) \ diff --git a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp index 54d177943d0..5ae6517a236 100644 --- a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp +++ b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp @@ -13,15 +13,14 @@ namespace ErrorCodes MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream( const MergeTreeMutableDataPartPtr & data_part, const StorageMetadataPtr & metadata_snapshot_, - const Block & header_, + const NamesAndTypesList & columns_list_, CompressionCodecPtr default_codec, const MergeTreeIndices & indices_to_recalc, const ColumnsStatistics & stats_to_recalc_, WrittenOffsetColumns * offset_columns_, const MergeTreeIndexGranularity & index_granularity, const MergeTreeIndexGranularityInfo * index_granularity_info) - : IMergedBlockOutputStream(data_part->storage.getSettings(), data_part->getDataPartStoragePtr(), metadata_snapshot_, header_.getNamesAndTypesList(), /*reset_columns=*/ true) - , header(header_) + : IMergedBlockOutputStream(data_part->storage.getSettings(), data_part->getDataPartStoragePtr(), metadata_snapshot_, columns_list_, /*reset_columns=*/ true) { const auto & global_settings = data_part->storage.getContext()->getSettings(); @@ -37,7 +36,7 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream( data_part->name, data_part->storage.getLogName(), data_part->getSerializations(), data_part_storage, data_part->index_granularity_info, storage_settings, - header.getNamesAndTypesList(), + columns_list_, data_part->getColumnPositions(), metadata_snapshot_, data_part->storage.getVirtualsPtr(), diff --git a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h 
b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h index 16a54ff33b6..e837a62743e 100644 --- a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h +++ b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h @@ -17,7 +17,7 @@ public: MergedColumnOnlyOutputStream( const MergeTreeMutableDataPartPtr & data_part, const StorageMetadataPtr & metadata_snapshot_, - const Block & header_, + const NamesAndTypesList & columns_list_, CompressionCodecPtr default_codec_, const MergeTreeIndices & indices_to_recalc_, const ColumnsStatistics & stats_to_recalc_, @@ -25,16 +25,12 @@ public: const MergeTreeIndexGranularity & index_granularity = {}, const MergeTreeIndexGranularityInfo * index_granularity_info_ = nullptr); - Block getHeader() const { return header; } void write(const Block & block) override; MergeTreeData::DataPart::Checksums fillChecksums(MergeTreeData::MutableDataPartPtr & new_part, MergeTreeData::DataPart::Checksums & all_checksums); void finish(bool sync); - -private: - Block header; }; using MergedColumnOnlyOutputStreamPtr = std::shared_ptr; diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 1318563e469..54df68126f8 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1901,7 +1901,7 @@ private: ctx->out = std::make_shared( ctx->new_data_part, ctx->metadata_snapshot, - ctx->updated_header, + ctx->updated_header.getNamesAndTypesList(), ctx->compression_codec, std::vector(ctx->indices_to_recalc.begin(), ctx->indices_to_recalc.end()), ColumnsStatistics(ctx->stats_to_recalc.begin(), ctx->stats_to_recalc.end()), diff --git a/src/Storages/MergeTree/RowOrderOptimizer.cpp b/src/Storages/MergeTree/RowOrderOptimizer.cpp index 34f9fed4500..76b0d6452ad 100644 --- a/src/Storages/MergeTree/RowOrderOptimizer.cpp +++ b/src/Storages/MergeTree/RowOrderOptimizer.cpp @@ -78,9 +78,8 @@ std::vector getOtherColumnIndexes(const Block & block, const SortDescrip /// -------- /// 2 1 a 3 /// ---------------------- -EqualRanges getEqualRanges(const Block & block, const SortDescription & sort_description, const IColumn::Permutation & permutation, const LoggerPtr & log) +EqualRanges getEqualRanges(const Block & block, const SortDescription & sort_description, const IColumn::Permutation & permutation) { - LOG_TRACE(log, "Finding equal ranges"); EqualRanges ranges; const size_t rows = block.rows(); if (sort_description.empty()) @@ -122,11 +121,10 @@ void updatePermutationInEqualRange( const std::vector & other_column_indexes, IColumn::Permutation & permutation, const EqualRange & equal_range, - const std::vector & cardinalities) + const std::vector & cardinalities, + const LoggerPtr & log) { - LoggerPtr log = getLogger("RowOrderOptimizer"); - - LOG_TRACE(log, "Starting optimization in equal range"); + LOG_TEST(log, "Starting optimization in equal range"); std::vector column_order(other_column_indexes.size()); iota(column_order.begin(), column_order.end(), 0); @@ -134,17 +132,17 @@ void updatePermutationInEqualRange( stable_sort(column_order.begin(), column_order.end(), cmp); std::vector ranges = {equal_range}; - LOG_TRACE(log, "equal_range: .from: {}, .to: {}", equal_range.from, equal_range.to); + LOG_TEST(log, "equal_range: .from: {}, .to: {}", equal_range.from, equal_range.to); for (size_t i : column_order) { const size_t column_id = other_column_indexes[i]; const ColumnPtr & column = block.getByPosition(column_id).column; - LOG_TRACE(log, "i: {}, column_id: {}, column->getName(): {}, cardinality: {}", i, column_id, 
column->getName(), cardinalities[i]); + LOG_TEST(log, "i: {}, column_id: {}, column type: {}, cardinality: {}", i, column_id, column->getName(), cardinalities[i]); column->updatePermutation( IColumn::PermutationSortDirection::Ascending, IColumn::PermutationSortStability::Stable, 0, 1, permutation, ranges); } - LOG_TRACE(log, "Finish optimization in equal range"); + LOG_TEST(log, "Finish optimization in equal range"); } } @@ -156,7 +154,10 @@ void RowOrderOptimizer::optimize(const Block & block, const SortDescription & so LOG_TRACE(log, "Starting optimization"); if (block.columns() == 0) + { + LOG_TRACE(log, "Finished optimization (block has no columns)"); return; /// a table without columns, this should not happen in the first place ... + } if (permutation.empty()) { @@ -165,17 +166,17 @@ void RowOrderOptimizer::optimize(const Block & block, const SortDescription & so iota(permutation.data(), rows, IColumn::Permutation::value_type(0)); } - const EqualRanges equal_ranges = getEqualRanges(block, sort_description, permutation, log); + const EqualRanges equal_ranges = getEqualRanges(block, sort_description, permutation); const std::vector other_columns_indexes = getOtherColumnIndexes(block, sort_description); - LOG_TRACE(log, "block.columns(): {}, block.rows(): {}, sort_description.size(): {}, equal_ranges.size(): {}", block.columns(), block.rows(), sort_description.size(), equal_ranges.size()); + LOG_TRACE(log, "columns: {}, sorting key columns: {}, rows: {}, equal ranges: {}", block.columns(), sort_description.size(), block.rows(), equal_ranges.size()); for (const auto & equal_range : equal_ranges) { if (equal_range.size() <= 1) continue; const std::vector cardinalities = getCardinalitiesInPermutedRange(block, other_columns_indexes, permutation, equal_range); - updatePermutationInEqualRange(block, other_columns_indexes, permutation, equal_range, cardinalities); + updatePermutationInEqualRange(block, other_columns_indexes, permutation, equal_range, cardinalities, log); } LOG_TRACE(log, "Finished optimization"); diff --git a/src/Storages/NamedCollectionsHelpers.cpp b/src/Storages/NamedCollectionsHelpers.cpp index 47b69d79ad8..ba90f21c907 100644 --- a/src/Storages/NamedCollectionsHelpers.cpp +++ b/src/Storages/NamedCollectionsHelpers.cpp @@ -95,7 +95,7 @@ MutableNamedCollectionPtr tryGetNamedCollectionWithOverrides( if (asts.empty()) return nullptr; - NamedCollectionUtils::loadIfNot(); + NamedCollectionFactory::instance().loadIfNot(); auto collection_name = getCollectionName(asts); if (!collection_name.has_value()) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index b31d0f8a92e..2fc6993369d 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -195,12 +195,14 @@ Chunk StorageObjectStorageSource::generate() const auto & object_info = reader.getObjectInfo(); const auto & filename = object_info.getFileName(); chassert(object_info.metadata); - VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk( - chunk, - read_from_format_info.requested_virtual_columns, - getUniqueStoragePathIdentifier(*configuration, reader.getObjectInfo(), false), - object_info.metadata->size_bytes, &filename); - + VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk( + chunk, read_from_format_info.requested_virtual_columns, + { + .path = getUniqueStoragePathIdentifier(*configuration, reader.getObjectInfo(), false), + .size = 
object_info.metadata->size_bytes, + .filename = &filename, + .last_modified = object_info.metadata->last_modified + }); return chunk; } diff --git a/src/Storages/ProjectionsDescription.cpp b/src/Storages/ProjectionsDescription.cpp index 0bcbedee41a..f5d869cdea0 100644 --- a/src/Storages/ProjectionsDescription.cpp +++ b/src/Storages/ProjectionsDescription.cpp @@ -64,7 +64,6 @@ ProjectionDescription ProjectionDescription::clone() const other.sample_block_for_keys = sample_block_for_keys; other.metadata = metadata; other.key_size = key_size; - other.is_minmax_count_projection = is_minmax_count_projection; other.primary_key_max_column_name = primary_key_max_column_name; other.partition_value_indices = partition_value_indices; @@ -195,7 +194,6 @@ ProjectionDescription ProjectionDescription::getMinMaxCountProjection( ContextPtr query_context) { ProjectionDescription result; - result.is_minmax_count_projection = true; auto select_query = std::make_shared(); ASTPtr select_expression_list = std::make_shared(); @@ -282,13 +280,11 @@ ProjectionDescription ProjectionDescription::getMinMaxCountProjection( return result; } - void ProjectionDescription::recalculateWithNewColumns(const ColumnsDescription & new_columns, ContextPtr query_context) { *this = getProjectionFromAST(definition_ast, new_columns, query_context); } - Block ProjectionDescription::calculate(const Block & block, ContextPtr context) const { auto mut_context = Context::createCopy(context); diff --git a/src/Storages/ProjectionsDescription.h b/src/Storages/ProjectionsDescription.h index 75a97697e00..5f091b4421b 100644 --- a/src/Storages/ProjectionsDescription.h +++ b/src/Storages/ProjectionsDescription.h @@ -56,8 +56,6 @@ struct ProjectionDescription size_t key_size = 0; - bool is_minmax_count_projection = false; - /// If a primary key expression is used in the minmax_count projection, store the name of max expression. String primary_key_max_column_name; diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index d8633037ed9..b5b1a8dd992 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -421,8 +421,14 @@ Chunk StorageS3QueueSource::generate() file_status->processed_rows += chunk.getNumRows(); processed_rows_from_file += chunk.getNumRows(); - VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk( - chunk, requested_virtual_columns, path, reader.getObjectInfo().metadata->size_bytes); + VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk( + chunk, requested_virtual_columns, + { + .path = path, + .size = reader.getObjectInfo().metadata->size_bytes + }); + + return chunk; } } diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index 11e2a2fc5e7..6901b6cb5ff 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -208,10 +208,6 @@ struct SelectQueryInfo bool need_aggregate = false; PrewhereInfoPtr prewhere_info; - /// Generated by pre-run optimization with StorageDummy. - /// Currently it's used to support StorageMerge PREWHERE optimization. 
- PrewhereInfoPtr optimized_prewhere_info; - /// If query has aggregate functions bool has_aggregates = false; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 6744159d5dc..aaf84f6f82c 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1341,6 +1341,7 @@ Chunk StorageFileSource::generate() chassert(file_enumerator); current_path = fmt::format("{}::{}", archive_reader->getPath(), *filename_override); current_file_size = file_enumerator->getFileInfo().uncompressed_size; + current_file_last_modified = file_enumerator->getFileInfo().last_modified; if (need_only_count && tryGetCountFromCache(current_archive_stat)) continue; @@ -1370,6 +1371,7 @@ Chunk StorageFileSource::generate() struct stat file_stat; file_stat = getFileStat(current_path, storage->use_table_fd, storage->table_fd, storage->getName()); current_file_size = file_stat.st_size; + current_file_last_modified = Poco::Timestamp::fromEpochTime(file_stat.st_mtime); if (getContext()->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) continue; @@ -1436,8 +1438,15 @@ Chunk StorageFileSource::generate() progress(num_rows, chunk_size ? chunk_size : chunk.bytes()); /// Enrich with virtual columns. - VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk( - chunk, requested_virtual_columns, current_path, current_file_size, filename_override.has_value() ? &filename_override.value() : nullptr); + VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk( + chunk, requested_virtual_columns, + { + .path = current_path, + .size = current_file_size, + .filename = (filename_override.has_value() ? &filename_override.value() : nullptr), + .last_modified = current_file_last_modified + }); + return chunk; } diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index 37da59c3664..ac094aeb489 100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -279,6 +279,7 @@ private: FilesIteratorPtr files_iterator; String current_path; std::optional current_file_size; + std::optional current_file_last_modified; struct stat current_archive_stat; std::optional filename_override; Block sample_block; diff --git a/src/Storages/StorageGenerateRandom.cpp b/src/Storages/StorageGenerateRandom.cpp index 2190e012c5b..2f850c76465 100644 --- a/src/Storages/StorageGenerateRandom.cpp +++ b/src/Storages/StorageGenerateRandom.cpp @@ -50,6 +50,12 @@ namespace ErrorCodes namespace { +struct GenerateRandomState +{ + std::atomic add_total_rows = 0; +}; +using GenerateRandomStatePtr = std::shared_ptr; + void fillBufferWithRandomData(char * __restrict data, size_t limit, size_t size_of_type, pcg64 & rng, [[maybe_unused]] bool flip_bytes = false) { size_t size = limit * size_of_type; @@ -532,10 +538,24 @@ ColumnPtr fillColumnWithRandomData( class GenerateSource : public ISource { public: - GenerateSource(UInt64 block_size_, UInt64 max_array_length_, UInt64 max_string_length_, UInt64 random_seed_, Block block_header_, ContextPtr context_) + GenerateSource( + UInt64 block_size_, + UInt64 max_array_length_, + UInt64 max_string_length_, + UInt64 random_seed_, + Block block_header_, + ContextPtr context_, + GenerateRandomStatePtr state_) : ISource(Nested::flattenNested(prepareBlockToFill(block_header_))) - , block_size(block_size_), max_array_length(max_array_length_), max_string_length(max_string_length_) - , block_to_fill(std::move(block_header_)), rng(random_seed_), context(context_) {} + , block_size(block_size_) + , max_array_length(max_array_length_) + , 
max_string_length(max_string_length_) + , block_to_fill(std::move(block_header_)) + , rng(random_seed_) + , context(context_) + , shared_state(state_) + { + } String getName() const override { return "GenerateRandom"; } @@ -549,7 +569,15 @@ protected: columns.emplace_back(fillColumnWithRandomData(elem.type, block_size, max_array_length, max_string_length, rng, context)); columns = Nested::flattenNested(block_to_fill.cloneWithColumns(columns)).getColumns(); - return {std::move(columns), block_size}; + + UInt64 total_rows = shared_state->add_total_rows.fetch_and(0); + if (total_rows) + addTotalRowsApprox(total_rows); + + auto chunk = Chunk{std::move(columns), block_size}; + progress(chunk.getNumRows(), chunk.bytes()); + + return chunk; } private: @@ -561,6 +589,7 @@ private: pcg64 rng; ContextPtr context; + GenerateRandomStatePtr shared_state; static Block & prepareBlockToFill(Block & block) { @@ -648,9 +677,6 @@ Pipe StorageGenerateRandom::read( { storage_snapshot->check(column_names); - Pipes pipes; - pipes.reserve(num_streams); - const ColumnsDescription & our_columns = storage_snapshot->metadata->getColumns(); Block block_header; for (const auto & name : column_names) @@ -679,16 +705,24 @@ Pipe StorageGenerateRandom::read( } } + UInt64 query_limit = query_info.limit; + if (query_limit && num_streams * max_block_size > query_limit) + { + /// We want to avoid spawning more streams than necessary + num_streams = std::min(num_streams, static_cast(((query_limit + max_block_size - 1) / max_block_size))); + } + Pipes pipes; + pipes.reserve(num_streams); + /// Will create more seed values for each source from initial seed. pcg64 generate(random_seed); + auto shared_state = std::make_shared(query_info.limit); + for (UInt64 i = 0; i < num_streams; ++i) { - auto source = std::make_shared(max_block_size, max_array_length, max_string_length, generate(), block_header, context); - - if (i == 0 && query_info.limit) - source->addTotalRowsApprox(query_info.limit); - + auto source = std::make_shared( + max_block_size, max_array_length, max_string_length, generate(), block_header, context, shared_state); pipes.emplace_back(std::move(source)); } diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 4c678a1228b..ed3f43367dd 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -34,9 +34,10 @@ #include #include #include -#include #include #include +#include +#include #include #include #include @@ -402,10 +403,14 @@ ReadFromMerge::ReadFromMerge( { } -void ReadFromMerge::updatePrewhereInfo(const PrewhereInfoPtr & prewhere_info_value) +void ReadFromMerge::addFilter(FilterDAGInfo filter) { - SourceStepWithFilter::updatePrewhereInfo(prewhere_info_value); - common_header = applyPrewhereActions(common_header, prewhere_info); + output_stream->header = FilterTransform::transformHeader( + output_stream->header, + filter.actions.get(), + filter.column_name, + filter.do_remove_column); + pushed_down_filters.push_back(std::move(filter)); } void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) @@ -435,21 +440,7 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu Names column_names_as_aliases; Aliases aliases; - Names real_column_names = column_names; - if (child_plan.row_policy_data_opt) - child_plan.row_policy_data_opt->extendNames(real_column_names); - - auto modified_query_info = getModifiedQueryInfo(modified_context, table, nested_storage_snaphsot, real_column_names, 
column_names_as_aliases, aliases); - - auto source_pipeline = createSources( - child_plan.plan, - nested_storage_snaphsot, - modified_query_info, - common_processed_stage, - common_header, - child_plan.table_aliases, - child_plan.row_policy_data_opt, - table); + auto source_pipeline = buildPipeline(child_plan, common_processed_stage); if (source_pipeline && source_pipeline->initialized()) { @@ -567,10 +558,8 @@ std::vector ReadFromMerge::createChildrenPlans(SelectQ if (sampling_requested && !storage->supportsSampling()) throw Exception(ErrorCodes::SAMPLING_NOT_SUPPORTED, "Illegal SAMPLE: table {} doesn't support sampling", storage->getStorageID().getNameForLogs()); - res.emplace_back(); - - auto & aliases = res.back().table_aliases; - auto & row_policy_data_opt = res.back().row_policy_data_opt; + Aliases aliases; + RowPolicyDataOpt row_policy_data_opt; auto storage_metadata_snapshot = storage->getInMemoryMetadataPtr(); auto nested_storage_snaphsot = storage->getStorageSnapshot(storage_metadata_snapshot, modified_context); @@ -649,7 +638,7 @@ std::vector ReadFromMerge::createChildrenPlans(SelectQ } - res.back().plan = createPlanForTable( + auto child = createPlanForTable( nested_storage_snaphsot, modified_query_info, common_processed_stage, @@ -659,9 +648,32 @@ std::vector ReadFromMerge::createChildrenPlans(SelectQ row_policy_data_opt, modified_context, current_streams); - res.back().plan.addInterpreterContext(modified_context); - } + child.plan.addInterpreterContext(modified_context); + if (child.plan.isInitialized()) + { + addVirtualColumns(child, modified_query_info, common_processed_stage, table); + + /// Subordinary tables could have different but convertible types, like numeric types of different width. + /// We must return streams with structure equals to structure of Merge table. 
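The comment above summarizes why every child plan gets post-processing: the child tables of a Merge table may expose convertible but non-identical types, so each child's header is converted to the common Merge header before any pushed-down filters are re-applied. A minimal sketch of that conversion step, assuming the ActionsDAG, ExpressionStep and QueryPlan interfaces used elsewhere in this diff (the helper name and include paths are illustrative, not part of the change):

#include <memory>
#include <Core/Block.h>
#include <Interpreters/ActionsDAG.h>
#include <Processors/QueryPlan/ExpressionStep.h>
#include <Processors/QueryPlan/QueryPlan.h>

/// Convert a child table's current output header to the Merge table's common header.
/// Columns are matched by name and cast where the types differ but are convertible
/// (e.g. UInt8 in the child table vs. UInt64 in the Merge table definition).
static void convertChildHeaderToCommonHeader(DB::QueryPlan & child_plan, const DB::Block & common_header)
{
    using namespace DB;

    auto convert_dag = ActionsDAG::makeConvertingActions(
        child_plan.getCurrentDataStream().header.getColumnsWithTypeAndName(),
        common_header.getColumnsWithTypeAndName(),
        ActionsDAG::MatchColumnsMode::Name);

    child_plan.addStep(std::make_unique<ExpressionStep>(child_plan.getCurrentDataStream(), std::move(convert_dag)));
}

This mirrors what convertAndFilterSourceStream does later in this diff; expressing the conversion as a plan step (rather than a pipeline transform, as before this change) lets each child plan be optimized as a whole before the pipelines are built.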
+ convertAndFilterSourceStream(common_header, modified_query_info, nested_storage_snaphsot, aliases, row_policy_data_opt, context, child); + + for (const auto & filter_info : pushed_down_filters) + { + auto filter_step = std::make_unique( + child.plan.getCurrentDataStream(), + filter_info.actions->clone(), + filter_info.column_name, + filter_info.do_remove_column); + + child.plan.addStep(std::move(filter_step)); + } + + child.plan.optimize(QueryPlanOptimizationSettings::fromContext(modified_context)); + } + + res.emplace_back(std::move(child)); + } return res; } @@ -876,8 +888,6 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextMutablePtr & mo const StorageID current_storage_id = storage->getStorageID(); SelectQueryInfo modified_query_info = query_info; - if (modified_query_info.optimized_prewhere_info && !modified_query_info.prewhere_info) - modified_query_info.prewhere_info = modified_query_info.optimized_prewhere_info; if (modified_query_info.planner_context) modified_query_info.planner_context = std::make_shared(modified_context, modified_query_info.planner_context); @@ -1019,31 +1029,101 @@ bool recursivelyApplyToReadingSteps(QueryPlan::Node * node, const std::function< return ok; } -QueryPipelineBuilderPtr ReadFromMerge::createSources( - QueryPlan & plan, - const StorageSnapshotPtr & storage_snapshot_, +void ReadFromMerge::addVirtualColumns( + ChildPlan & child, SelectQueryInfo & modified_query_info, QueryProcessingStage::Enum processed_stage, - const Block & header, - const Aliases & aliases, - const RowPolicyDataOpt & row_policy_data_opt, - const StorageWithLockAndName & storage_with_lock, - bool concat_streams) const + const StorageWithLockAndName & storage_with_lock) const { - if (!plan.isInitialized()) - return std::make_unique(); - - QueryPipelineBuilderPtr builder; - - const auto & [database_name, storage, _, table_name] = storage_with_lock; + const auto & [database_name, _, storage, table_name] = storage_with_lock; bool allow_experimental_analyzer = context->getSettingsRef().allow_experimental_analyzer; - auto storage_stage - = storage->getQueryProcessingStage(context, processed_stage, storage_snapshot_, modified_query_info); - builder = plan.buildQueryPipeline( - QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); + /// Add virtual columns if we don't already have them. - if (processed_stage > storage_stage || (allow_experimental_analyzer && processed_stage != QueryProcessingStage::FetchColumns)) + Block plan_header = child.plan.getCurrentDataStream().header; + + if (allow_experimental_analyzer) + { + String table_alias = modified_query_info.query_tree->as()->getJoinTree()->as()->getAlias(); + + String database_column = table_alias.empty() || processed_stage == QueryProcessingStage::FetchColumns ? "_database" : table_alias + "._database"; + String table_column = table_alias.empty() || processed_stage == QueryProcessingStage::FetchColumns ? 
"_table" : table_alias + "._table"; + + if (has_database_virtual_column && common_header.has(database_column) + && child.stage == QueryProcessingStage::FetchColumns && !plan_header.has(database_column)) + { + ColumnWithTypeAndName column; + column.name = database_column; + column.type = std::make_shared(std::make_shared()); + column.column = column.type->createColumnConst(0, Field(database_name)); + + auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), adding_column_dag); + child.plan.addStep(std::move(expression_step)); + plan_header = child.plan.getCurrentDataStream().header; + } + + if (has_table_virtual_column && common_header.has(table_column) + && child.stage == QueryProcessingStage::FetchColumns && !plan_header.has(table_column)) + { + ColumnWithTypeAndName column; + column.name = table_column; + column.type = std::make_shared(std::make_shared()); + column.column = column.type->createColumnConst(0, Field(table_name)); + + auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), adding_column_dag); + child.plan.addStep(std::move(expression_step)); + plan_header = child.plan.getCurrentDataStream().header; + } + } + else + { + if (has_database_virtual_column && common_header.has("_database") && !plan_header.has("_database")) + { + ColumnWithTypeAndName column; + column.name = "_database"; + column.type = std::make_shared(std::make_shared()); + column.column = column.type->createColumnConst(0, Field(database_name)); + + auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), adding_column_dag); + child.plan.addStep(std::move(expression_step)); + plan_header = child.plan.getCurrentDataStream().header; + } + + if (has_table_virtual_column && common_header.has("_table") && !plan_header.has("_table")) + { + ColumnWithTypeAndName column; + column.name = "_table"; + column.type = std::make_shared(std::make_shared()); + column.column = column.type->createColumnConst(0, Field(table_name)); + + auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), adding_column_dag); + child.plan.addStep(std::move(expression_step)); + plan_header = child.plan.getCurrentDataStream().header; + } + } +} + +QueryPipelineBuilderPtr ReadFromMerge::buildPipeline( + ChildPlan & child, + QueryProcessingStage::Enum processed_stage) const +{ + if (!child.plan.isInitialized()) + return nullptr; + + auto optimisation_settings = QueryPlanOptimizationSettings::fromContext(context); + /// All optimisations will be done at plans creation + optimisation_settings.optimize_plan = false; + auto builder = child.plan.buildQueryPipeline(optimisation_settings, BuildQueryPipelineSettings::fromContext(context)); + + if (!builder->initialized()) + return builder; + + bool allow_experimental_analyzer = context->getSettingsRef().allow_experimental_analyzer; + if (processed_stage > child.stage || (allow_experimental_analyzer && processed_stage != QueryProcessingStage::FetchColumns)) { /** Materialization is needed, since from distributed storage the constants come materialized. 
* If you do not do this, different types (Const and non-Const) columns will be produced in different threads, @@ -1052,99 +1132,10 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( builder->addSimpleTransform([](const Block & stream_header) { return std::make_shared(stream_header); }); } - if (builder->initialized()) - { - if (concat_streams && builder->getNumStreams() > 1) - { - // It's possible to have many tables read from merge, resize(1) might open too many files at the same time. - // Using concat instead. - builder->addTransform(std::make_shared(builder->getHeader(), builder->getNumStreams())); - } - - /// Add virtual columns if we don't already have them. - - Block pipe_header = builder->getHeader(); - - if (allow_experimental_analyzer) - { - String table_alias = modified_query_info.query_tree->as()->getJoinTree()->as()->getAlias(); - - String database_column = table_alias.empty() || processed_stage == QueryProcessingStage::FetchColumns ? "_database" : table_alias + "._database"; - String table_column = table_alias.empty() || processed_stage == QueryProcessingStage::FetchColumns ? "_table" : table_alias + "._table"; - - if (has_database_virtual_column && common_header.has(database_column) - && storage_stage == QueryProcessingStage::FetchColumns && !pipe_header.has(database_column)) - { - ColumnWithTypeAndName column; - column.name = database_column; - column.type = std::make_shared(std::make_shared()); - column.column = column.type->createColumnConst(0, Field(database_name)); - - auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); - auto adding_column_actions = std::make_shared( - std::move(adding_column_dag), ExpressionActionsSettings::fromContext(context, CompileExpressions::yes)); - - builder->addSimpleTransform([&](const Block & stream_header) - { return std::make_shared(stream_header, adding_column_actions); }); - } - - if (has_table_virtual_column && common_header.has(table_column) - && storage_stage == QueryProcessingStage::FetchColumns && !pipe_header.has(table_column)) - { - ColumnWithTypeAndName column; - column.name = table_column; - column.type = std::make_shared(std::make_shared()); - column.column = column.type->createColumnConst(0, Field(table_name)); - - auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); - auto adding_column_actions = std::make_shared( - std::move(adding_column_dag), ExpressionActionsSettings::fromContext(context, CompileExpressions::yes)); - - builder->addSimpleTransform([&](const Block & stream_header) - { return std::make_shared(stream_header, adding_column_actions); }); - } - } - else - { - if (has_database_virtual_column && common_header.has("_database") && !pipe_header.has("_database")) - { - ColumnWithTypeAndName column; - column.name = "_database"; - column.type = std::make_shared(std::make_shared()); - column.column = column.type->createColumnConst(0, Field(database_name)); - - auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); - auto adding_column_actions = std::make_shared( - std::move(adding_column_dag), ExpressionActionsSettings::fromContext(context, CompileExpressions::yes)); - builder->addSimpleTransform([&](const Block & stream_header) - { return std::make_shared(stream_header, adding_column_actions); }); - } - - if (has_table_virtual_column && common_header.has("_table") && !pipe_header.has("_table")) - { - ColumnWithTypeAndName column; - column.name = "_table"; - column.type = std::make_shared(std::make_shared()); - column.column = 
column.type->createColumnConst(0, Field(table_name)); - - auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); - auto adding_column_actions = std::make_shared( - std::move(adding_column_dag), ExpressionActionsSettings::fromContext(context, CompileExpressions::yes)); - builder->addSimpleTransform([&](const Block & stream_header) - { return std::make_shared(stream_header, adding_column_actions); }); - } - } - - /// Subordinary tables could have different but convertible types, like numeric types of different width. - /// We must return streams with structure equals to structure of Merge table. - convertAndFilterSourceStream( - header, modified_query_info, storage_snapshot_, aliases, row_policy_data_opt, context, *builder, storage_stage); - } - return builder; } -QueryPlan ReadFromMerge::createPlanForTable( +ReadFromMerge::ChildPlan ReadFromMerge::createPlanForTable( const StorageSnapshotPtr & storage_snapshot_, SelectQueryInfo & modified_query_info, QueryProcessingStage::Enum processed_stage, @@ -1181,35 +1172,14 @@ QueryPlan ReadFromMerge::createPlanForTable( if (real_column_names.empty()) real_column_names.push_back(ExpressionActions::getSmallestColumn(storage_snapshot_->metadata->getColumns().getAllPhysical()).name); - StorageView * view = dynamic_cast(storage.get()); - if (!view || allow_experimental_analyzer) - { - storage->read(plan, - real_column_names, - storage_snapshot_, - modified_query_info, - modified_context, - processed_stage, - max_block_size, - UInt32(streams_num)); - } - else - { - /// For view storage, we need to rewrite the `modified_query_info.view_query` to optimize read. - /// The most intuitive way is to use InterpreterSelectQuery. - - /// Intercept the settings - modified_context->setSetting("max_threads", streams_num); - modified_context->setSetting("max_streams_to_max_threads_ratio", 1); - modified_context->setSetting("max_block_size", max_block_size); - - InterpreterSelectQuery interpreter(modified_query_info.query, - modified_context, - storage, - view->getInMemoryMetadataPtr(), - SelectQueryOptions(processed_stage)); - interpreter.buildQueryPlan(plan); - } + storage->read(plan, + real_column_names, + storage_snapshot_, + modified_query_info, + modified_context, + processed_stage, + max_block_size, + UInt32(streams_num)); if (!plan.isInitialized()) return {}; @@ -1248,7 +1218,7 @@ QueryPlan ReadFromMerge::createPlanForTable( } } - return plan; + return ChildPlan{std::move(plan), storage_stage}; } ReadFromMerge::RowPolicyData::RowPolicyData(RowPolicyFilterPtr row_policy_filter_ptr, @@ -1306,12 +1276,10 @@ void ReadFromMerge::RowPolicyData::addStorageFilter(SourceStepWithFilter * step) step->addFilter(actions_dag, filter_column_name); } -void ReadFromMerge::RowPolicyData::addFilterTransform(QueryPipelineBuilder & builder) const +void ReadFromMerge::RowPolicyData::addFilterTransform(QueryPlan & plan) const { - builder.addSimpleTransform([&](const Block & stream_header) - { - return std::make_shared(stream_header, filter_actions, filter_column_name, true /* remove filter column */); - }); + auto filter_step = std::make_unique(plan.getCurrentDataStream(), actions_dag, filter_column_name, true /* remove filter column */); + plan.addStep(std::move(filter_step)); } StorageMerge::StorageListWithLocks ReadFromMerge::getSelectedTables( @@ -1490,13 +1458,12 @@ void ReadFromMerge::convertAndFilterSourceStream( const Aliases & aliases, const RowPolicyDataOpt & row_policy_data_opt, ContextPtr local_context, - QueryPipelineBuilder & builder, - 
QueryProcessingStage::Enum processed_stage) + ChildPlan & child) { - Block before_block_header = builder.getHeader(); + Block before_block_header = child.plan.getCurrentDataStream().header; auto storage_sample_block = snapshot->metadata->getSampleBlock(); - auto pipe_columns = builder.getHeader().getNamesAndTypesList(); + auto pipe_columns = before_block_header.getNamesAndTypesList(); if (local_context->getSettingsRef().allow_experimental_analyzer) { @@ -1519,13 +1486,8 @@ void ReadFromMerge::convertAndFilterSourceStream( throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected to have 1 output but got {}", nodes.size()); actions_dag->addOrReplaceInOutputs(actions_dag->addAlias(*nodes.front(), alias.name)); - - auto actions = std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); - - builder.addSimpleTransform([&](const Block & stream_header) - { - return std::make_shared(stream_header, actions); - }); + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), actions_dag); + child.plan.addStep(std::move(expression_step)); } } else @@ -1539,37 +1501,26 @@ void ReadFromMerge::convertAndFilterSourceStream( auto dag = std::make_shared(pipe_columns); auto actions_dag = expression_analyzer.getActionsDAG(true, false); - auto actions = std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); - - builder.addSimpleTransform([&](const Block & stream_header) - { - return std::make_shared(stream_header, actions); - }); + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), actions_dag); + child.plan.addStep(std::move(expression_step)); } } ActionsDAG::MatchColumnsMode convert_actions_match_columns_mode = ActionsDAG::MatchColumnsMode::Name; if (local_context->getSettingsRef().allow_experimental_analyzer - && (processed_stage != QueryProcessingStage::FetchColumns || dynamic_cast(&snapshot->storage) != nullptr)) + && (child.stage != QueryProcessingStage::FetchColumns || dynamic_cast(&snapshot->storage) != nullptr)) convert_actions_match_columns_mode = ActionsDAG::MatchColumnsMode::Position; if (row_policy_data_opt) - { - row_policy_data_opt->addFilterTransform(builder); - } + row_policy_data_opt->addFilterTransform(child.plan); - auto convert_actions_dag = ActionsDAG::makeConvertingActions(builder.getHeader().getColumnsWithTypeAndName(), + auto convert_actions_dag = ActionsDAG::makeConvertingActions(child.plan.getCurrentDataStream().header.getColumnsWithTypeAndName(), header.getColumnsWithTypeAndName(), convert_actions_match_columns_mode); - auto actions = std::make_shared( - std::move(convert_actions_dag), - ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); - builder.addSimpleTransform([&](const Block & stream_header) - { - return std::make_shared(stream_header, actions); - }); + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), convert_actions_dag); + child.plan.addStep(std::move(expression_step)); } const ReadFromMerge::StorageListWithLocks & ReadFromMerge::getSelectedTables() @@ -1606,29 +1557,14 @@ bool ReadFromMerge::requestReadingInOrder(InputOrderInfoPtr order_info_) return true; } -void ReadFromMerge::applyFilters(const QueryPlan & plan, const ActionDAGNodes & added_filter_nodes) const -{ - auto apply_filters = [&added_filter_nodes](ReadFromMergeTree & read_from_merge_tree) - { - for (const auto & node : added_filter_nodes.nodes) - read_from_merge_tree.addFilterFromParentStep(node); - - 
read_from_merge_tree.SourceStepWithFilter::applyFilters(); - return true; - }; - - recursivelyApplyToReadingSteps(plan.getRootNode(), apply_filters); -} - void ReadFromMerge::applyFilters(ActionDAGNodes added_filter_nodes) { + for (const auto & filter_info : pushed_down_filters) + added_filter_nodes.nodes.push_back(&filter_info.actions->findInOutputs(filter_info.column_name)); + SourceStepWithFilter::applyFilters(added_filter_nodes); filterTablesAndCreateChildrenPlans(); - - for (const auto & child_plan : *child_plans) - if (child_plan.plan.isInitialized()) - applyFilters(child_plan.plan, added_filter_nodes); } QueryPlanRawPtrs ReadFromMerge::getChildPlans() diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index 735c8711a63..94b34256d02 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -165,7 +165,7 @@ public: QueryPlanRawPtrs getChildPlans() override; - void updatePrewhereInfo(const PrewhereInfoPtr & prewhere_info_value) override; + void addFilter(FilterDAGInfo filter); private: const size_t required_max_block_size; @@ -221,7 +221,7 @@ private: /// Create explicit filter transform to exclude /// rows that are not conform to row level policy - void addFilterTransform(QueryPipelineBuilder &) const; + void addFilterTransform(QueryPlan &) const; private: std::string filter_column_name; // complex filter, may contain logic operations @@ -235,21 +235,21 @@ private: struct ChildPlan { QueryPlan plan; - Aliases table_aliases; - RowPolicyDataOpt row_policy_data_opt; + QueryProcessingStage::Enum stage; }; /// Store read plan for each child table. /// It's needed to guarantee lifetime for child steps to be the same as for this step (mainly for EXPLAIN PIPELINE). std::optional> child_plans; + /// Store filters pushed down from query plan optimization. Filters are added on top of child plans. 
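Because the filters recorded in this member are shared by all selected child tables, each one is re-applied on top of every child plan built in createChildrenPlans (see the FilterStep loop earlier in this diff). A minimal sketch of that re-application, assuming the FilterStep and FilterDAGInfo definitions used in this diff (the helper name and include paths are illustrative, not part of the change):

#include <memory>
#include <vector>
#include <Processors/QueryPlan/FilterStep.h>
#include <Processors/QueryPlan/QueryPlan.h>
#include <Storages/SelectQueryInfo.h>  /// assumed location of FilterDAGInfo (actions, column_name, do_remove_column)

/// Re-apply the filters that the query plan optimizer pushed into ReadFromMerge
/// onto one child plan. The DAG is cloned because the same FilterDAGInfo instance
/// is reused for every selected child table.
static void applyPushedDownFilters(DB::QueryPlan & child_plan, const std::vector<DB::FilterDAGInfo> & pushed_down_filters)
{
    using namespace DB;

    for (const auto & filter_info : pushed_down_filters)
        child_plan.addStep(std::make_unique<FilterStep>(
            child_plan.getCurrentDataStream(),
            filter_info.actions->clone(),
            filter_info.column_name,
            filter_info.do_remove_column));
}

The same filter outputs are also appended to added_filter_nodes in ReadFromMerge::applyFilters above, so the pushed-down conditions still participate in partition and index pruning for each child table.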
+ std::vector pushed_down_filters; + std::vector createChildrenPlans(SelectQueryInfo & query_info_) const; void filterTablesAndCreateChildrenPlans(); - void applyFilters(const QueryPlan & plan, const ActionDAGNodes & added_filter_nodes) const; - - QueryPlan createPlanForTable( + ChildPlan createPlanForTable( const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, QueryProcessingStage::Enum processed_stage, @@ -260,16 +260,15 @@ private: ContextMutablePtr modified_context, size_t streams_num) const; - QueryPipelineBuilderPtr createSources( - QueryPlan & plan, - const StorageSnapshotPtr & storage_snapshot, + void addVirtualColumns( + ChildPlan & child, SelectQueryInfo & modified_query_info, QueryProcessingStage::Enum processed_stage, - const Block & header, - const Aliases & aliases, - const RowPolicyDataOpt & row_policy_data_opt, - const StorageWithLockAndName & storage_with_lock, - bool concat_streams = false) const; + const StorageWithLockAndName & storage_with_lock) const; + + QueryPipelineBuilderPtr buildPipeline( + ChildPlan & child, + QueryProcessingStage::Enum processed_stage) const; static void convertAndFilterSourceStream( const Block & header, @@ -278,15 +277,12 @@ private: const Aliases & aliases, const RowPolicyDataOpt & row_policy_data_opt, ContextPtr context, - QueryPipelineBuilder & builder, - QueryProcessingStage::Enum processed_stage); + ChildPlan & child); StorageMerge::StorageListWithLocks getSelectedTables( ContextPtr query_context, bool filter_by_database_virtual_column, bool filter_by_table_virtual_column) const; - - // static VirtualColumnsDescription createVirtuals(StoragePtr first_table); }; } diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 27a76f4f21d..9352f772ce1 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -213,36 +213,13 @@ void StorageMergeTree::read( size_t max_block_size, size_t num_streams) { - if (local_context->canUseParallelReplicasOnInitiator() && local_context->getSettingsRef().parallel_replicas_for_non_replicated_merge_tree) + const auto & settings = local_context->getSettingsRef(); + /// reading step for parallel replicas with new analyzer is built in Planner, so don't do it here + if (local_context->canUseParallelReplicasOnInitiator() && settings.parallel_replicas_for_non_replicated_merge_tree + && !settings.allow_experimental_analyzer) { - ASTPtr modified_query_ast; - Block header; - if (local_context->getSettingsRef().allow_experimental_analyzer) - { - QueryTreeNodePtr modified_query_tree = query_info.query_tree->clone(); - rewriteJoinToGlobalJoin(modified_query_tree, local_context); - modified_query_tree = buildQueryTreeForShard(query_info.planner_context, modified_query_tree); - header = InterpreterSelectQueryAnalyzer::getSampleBlock( - modified_query_tree, local_context, SelectQueryOptions(processed_stage).analyze()); - modified_query_ast = queryNodeToDistributedSelectQuery(modified_query_tree); - } - else - { - const auto table_id = getStorageID(); - modified_query_ast = ClusterProxy::rewriteSelectQuery(local_context, query_info.query, - table_id.database_name, table_id.table_name, /*remote_table_function_ptr*/nullptr); - header - = InterpreterSelectQuery(modified_query_ast, local_context, SelectQueryOptions(processed_stage).analyze()).getSampleBlock(); - } - ClusterProxy::executeQueryWithParallelReplicas( - query_plan, - getStorageID(), - header, - processed_stage, - modified_query_ast, - local_context, - query_info.storage_limits); + 
query_plan, getStorageID(), processed_stage, query_info.query, local_context, query_info.storage_limits); } else { diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index e18e66d7af9..db58d0081c6 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -5461,7 +5461,8 @@ void StorageReplicatedMergeTree::read( /// For this you have to synchronously go to ZooKeeper. if (settings.select_sequential_consistency) readLocalSequentialConsistencyImpl(query_plan, column_names, storage_snapshot, query_info, local_context, max_block_size, num_streams); - else if (local_context->canUseParallelReplicasOnInitiator()) + /// reading step for parallel replicas with new analyzer is built in Planner, so don't do it here + else if (local_context->canUseParallelReplicasOnInitiator() && !settings.allow_experimental_analyzer) readParallelReplicasImpl(query_plan, column_names, query_info, local_context, processed_stage); else readLocalImpl(query_plan, column_names, storage_snapshot, query_info, local_context, max_block_size, num_streams); @@ -5493,36 +5494,8 @@ void StorageReplicatedMergeTree::readParallelReplicasImpl( ContextPtr local_context, QueryProcessingStage::Enum processed_stage) { - ASTPtr modified_query_ast; - Block header; - const auto table_id = getStorageID(); - - if (local_context->getSettingsRef().allow_experimental_analyzer) - { - QueryTreeNodePtr modified_query_tree = query_info.query_tree->clone(); - rewriteJoinToGlobalJoin(modified_query_tree, local_context); - modified_query_tree = buildQueryTreeForShard(query_info.planner_context, modified_query_tree); - - header = InterpreterSelectQueryAnalyzer::getSampleBlock( - modified_query_tree, local_context, SelectQueryOptions(processed_stage).analyze()); - modified_query_ast = queryNodeToDistributedSelectQuery(modified_query_tree); - } - else - { - modified_query_ast = ClusterProxy::rewriteSelectQuery(local_context, query_info.query, - table_id.database_name, table_id.table_name, /*remote_table_function_ptr*/nullptr); - header - = InterpreterSelectQuery(modified_query_ast, local_context, SelectQueryOptions(processed_stage).analyze()).getSampleBlock(); - } - ClusterProxy::executeQueryWithParallelReplicas( - query_plan, - table_id, - header, - processed_stage, - modified_query_ast, - local_context, - query_info.storage_limits); + query_plan, getStorageID(), processed_stage, query_info.query, local_context, query_info.storage_limits); } void StorageReplicatedMergeTree::readLocalImpl( diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 8d1c6933503..9302e7ef3e5 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -411,7 +411,12 @@ Chunk StorageURLSource::generate() if (input_format) chunk_size = input_format->getApproxBytesReadForChunk(); progress(num_rows, chunk_size ? 
chunk_size : chunk.bytes()); - VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk(chunk, requested_virtual_columns, curr_uri.getPath(), current_file_size); + VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk( + chunk, requested_virtual_columns, + { + .path = curr_uri.getPath(), + .size = current_file_size + }); return chunk; } diff --git a/src/Storages/System/StorageSystemClusters.cpp b/src/Storages/System/StorageSystemClusters.cpp index cb8d5caa50c..160c8d6270e 100644 --- a/src/Storages/System/StorageSystemClusters.cpp +++ b/src/Storages/System/StorageSystemClusters.cpp @@ -54,6 +54,10 @@ void StorageSystemClusters::fillData(MutableColumns & res_columns, ContextPtr co if (auto database_cluster = replicated->tryGetCluster()) writeCluster(res_columns, {name_and_database.first, database_cluster}, replicated->tryGetAreReplicasActive(database_cluster)); + + if (auto database_cluster = replicated->tryGetAllGroupsCluster()) + writeCluster(res_columns, {DatabaseReplicated::ALL_GROUPS_CLUSTER_PREFIX + name_and_database.first, database_cluster}, + replicated->tryGetAreReplicasActive(database_cluster)); } } } diff --git a/src/Storages/System/StorageSystemNamedCollections.cpp b/src/Storages/System/StorageSystemNamedCollections.cpp index 0836560dff0..e98ea155f30 100644 --- a/src/Storages/System/StorageSystemNamedCollections.cpp +++ b/src/Storages/System/StorageSystemNamedCollections.cpp @@ -33,7 +33,7 @@ void StorageSystemNamedCollections::fillData(MutableColumns & res_columns, Conte { const auto & access = context->getAccess(); - NamedCollectionUtils::loadIfNot(); + NamedCollectionFactory::instance().loadIfNot(); auto collections = NamedCollectionFactory::instance().getAll(); for (const auto & [name, collection] : collections) diff --git a/src/Storages/System/StorageSystemZeros.cpp b/src/Storages/System/StorageSystemZeros.cpp index a48b109fbbe..09a2bb5d963 100644 --- a/src/Storages/System/StorageSystemZeros.cpp +++ b/src/Storages/System/StorageSystemZeros.cpp @@ -16,7 +16,9 @@ namespace struct ZerosState { + explicit ZerosState(UInt64 limit) : add_total_rows(limit) { } std::atomic num_generated_rows = 0; + std::atomic add_total_rows = 0; }; using ZerosStatePtr = std::shared_ptr; @@ -42,10 +44,13 @@ protected: auto column_ptr = column; size_t column_size = column_ptr->size(); - if (state) + UInt64 total_rows = state->add_total_rows.fetch_and(0); + if (total_rows) + addTotalRowsApprox(total_rows); + + if (limit) { auto generated_rows = state->num_generated_rows.fetch_add(column_size, std::memory_order_acquire); - if (generated_rows >= limit) return {}; @@ -103,36 +108,25 @@ Pipe StorageSystemZeros::read( { storage_snapshot->check(column_names); - bool use_multiple_streams = multithreaded; + UInt64 query_limit = limit ? *limit : 0; + if (query_info.limit) + query_limit = query_limit ? 
std::min(query_limit, query_info.limit) : query_info.limit; - if (limit && *limit < max_block_size) - { - max_block_size = static_cast(*limit); - use_multiple_streams = false; - } + if (query_limit && query_limit < max_block_size) + max_block_size = query_limit; - if (!use_multiple_streams) + if (!multithreaded) num_streams = 1; + else if (query_limit && num_streams * max_block_size > query_limit) + /// We want to avoid spawning more streams than necessary + num_streams = std::min(num_streams, static_cast(((query_limit + max_block_size - 1) / max_block_size))); + + ZerosStatePtr state = std::make_shared(query_limit); Pipe res; - - ZerosStatePtr state; - - if (limit) - state = std::make_shared(); - for (size_t i = 0; i < num_streams; ++i) { - auto source = std::make_shared(max_block_size, limit ? *limit : 0, state); - - if (i == 0) - { - if (limit) - source->addTotalRowsApprox(*limit); - else if (query_info.limit) - source->addTotalRowsApprox(query_info.limit); - } - + auto source = std::make_shared(max_block_size, query_limit, state); res.addSource(std::move(source)); } diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index cec55cefda2..778c9e13adb 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -111,7 +112,7 @@ void filterBlockWithDAG(ActionsDAGPtr dag, Block & block, ContextPtr context) NameSet getVirtualNamesForFileLikeStorage() { - return {"_path", "_file", "_size"}; + return {"_path", "_file", "_size", "_time"}; } VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns) @@ -129,6 +130,7 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription add_virtual("_path", std::make_shared(std::make_shared())); add_virtual("_file", std::make_shared(std::make_shared())); add_virtual("_size", makeNullable(std::make_shared())); + add_virtual("_time", makeNullable(std::make_shared())); return desc; } @@ -187,32 +189,40 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const return block.getByName("_idx").column; } -void addRequestedPathFileAndSizeVirtualsToChunk( - Chunk & chunk, const NamesAndTypesList & requested_virtual_columns, const String & path, std::optional size, const String * filename) +void addRequestedFileLikeStorageVirtualsToChunk( + Chunk & chunk, const NamesAndTypesList & requested_virtual_columns, + VirtualsForFileLikeStorage virtual_values) { for (const auto & virtual_column : requested_virtual_columns) { if (virtual_column.name == "_path") { - chunk.addColumn(virtual_column.type->createColumnConst(chunk.getNumRows(), path)->convertToFullColumnIfConst()); + chunk.addColumn(virtual_column.type->createColumnConst(chunk.getNumRows(), virtual_values.path)->convertToFullColumnIfConst()); } else if (virtual_column.name == "_file") { - if (filename) + if (virtual_values.filename) { - chunk.addColumn(virtual_column.type->createColumnConst(chunk.getNumRows(), *filename)->convertToFullColumnIfConst()); + chunk.addColumn(virtual_column.type->createColumnConst(chunk.getNumRows(), (*virtual_values.filename))->convertToFullColumnIfConst()); } else { - size_t last_slash_pos = path.find_last_of('/'); - auto filename_from_path = path.substr(last_slash_pos + 1); + size_t last_slash_pos = virtual_values.path.find_last_of('/'); + auto filename_from_path = virtual_values.path.substr(last_slash_pos + 1); 
chunk.addColumn(virtual_column.type->createColumnConst(chunk.getNumRows(), filename_from_path)->convertToFullColumnIfConst()); } } else if (virtual_column.name == "_size") { - if (size) - chunk.addColumn(virtual_column.type->createColumnConst(chunk.getNumRows(), *size)->convertToFullColumnIfConst()); + if (virtual_values.size) + chunk.addColumn(virtual_column.type->createColumnConst(chunk.getNumRows(), *virtual_values.size)->convertToFullColumnIfConst()); + else + chunk.addColumn(virtual_column.type->createColumnConstWithDefaultValue(chunk.getNumRows())->convertToFullColumnIfConst()); + } + else if (virtual_column.name == "_time") + { + if (virtual_values.last_modified) + chunk.addColumn(virtual_column.type->createColumnConst(chunk.getNumRows(), virtual_values.last_modified->epochTime())->convertToFullColumnIfConst()); else chunk.addColumn(virtual_column.type->createColumnConstWithDefaultValue(chunk.getNumRows())->convertToFullColumnIfConst()); } diff --git a/src/Storages/VirtualColumnUtils.h b/src/Storages/VirtualColumnUtils.h index 62f2e4855b5..fbfbdd6c6cc 100644 --- a/src/Storages/VirtualColumnUtils.h +++ b/src/Storages/VirtualColumnUtils.h @@ -68,8 +68,18 @@ void filterByPathOrFile(std::vector & sources, const std::vector & pa sources = std::move(filtered_sources); } -void addRequestedPathFileAndSizeVirtualsToChunk( - Chunk & chunk, const NamesAndTypesList & requested_virtual_columns, const String & path, std::optional size, const String * filename = nullptr); +struct VirtualsForFileLikeStorage +{ + const String & path; + std::optional size { std::nullopt }; + const String * filename { nullptr }; + std::optional last_modified { std::nullopt }; + +}; + +void addRequestedFileLikeStorageVirtualsToChunk( + Chunk & chunk, const NamesAndTypesList & requested_virtual_columns, + VirtualsForFileLikeStorage virtual_values); } } diff --git a/src/Storages/tests/gtest_transform_query_for_external_database.cpp b/src/Storages/tests/gtest_transform_query_for_external_database.cpp index 7e2d393c3d1..6765e112bb9 100644 --- a/src/Storages/tests/gtest_transform_query_for_external_database.cpp +++ b/src/Storages/tests/gtest_transform_query_for_external_database.cpp @@ -368,17 +368,21 @@ TEST(TransformQueryForExternalDatabase, Null) check(state, 1, {"field"}, "SELECT field FROM table WHERE field IS NULL", - R"(SELECT "field" FROM "test"."table" WHERE "field" IS NULL)"); + R"(SELECT "field" FROM "test"."table" WHERE "field" IS NULL)", + R"(SELECT "field" FROM "test"."table" WHERE 1 = 0)"); check(state, 1, {"field"}, "SELECT field FROM table WHERE field IS NOT NULL", - R"(SELECT "field" FROM "test"."table" WHERE "field" IS NOT NULL)"); + R"(SELECT "field" FROM "test"."table" WHERE "field" IS NOT NULL)", + R"(SELECT "field" FROM "test"."table")"); check(state, 1, {"field"}, "SELECT field FROM table WHERE isNull(field)", - R"(SELECT "field" FROM "test"."table" WHERE "field" IS NULL)"); + R"(SELECT "field" FROM "test"."table" WHERE "field" IS NULL)", + R"(SELECT "field" FROM "test"."table" WHERE 1 = 0)"); check(state, 1, {"field"}, "SELECT field FROM table WHERE isNotNull(field)", - R"(SELECT "field" FROM "test"."table" WHERE "field" IS NOT NULL)"); + R"(SELECT "field" FROM "test"."table" WHERE "field" IS NOT NULL)", + R"(SELECT "field" FROM "test"."table")"); } TEST(TransformQueryForExternalDatabase, ToDate) diff --git a/tests/ci/build_download_helper.py b/tests/ci/build_download_helper.py index ce5b46a664e..a641f9f4544 100644 --- a/tests/ci/build_download_helper.py +++ b/tests/ci/build_download_helper.py @@ 
-27,6 +27,8 @@ except ImportError: DOWNLOAD_RETRIES_COUNT = 5 +logger = logging.getLogger(__name__) + class DownloadException(Exception): pass @@ -42,7 +44,7 @@ def get_with_retries( sleep: int = 3, **kwargs: Any, ) -> requests.Response: - logging.info( + logger.info( "Getting URL with %i tries and sleep %i in between: %s", retries, sleep, url ) exc = Exception("A placeholder to satisfy typing and avoid nesting") @@ -54,7 +56,7 @@ def get_with_retries( return response except Exception as e: if i + 1 < retries: - logging.info("Exception '%s' while getting, retry %i", e, i + 1) + logger.info("Exception '%s' while getting, retry %i", e, i + 1) time.sleep(sleep) exc = e @@ -103,7 +105,7 @@ def get_gh_api( ) try_auth = e.response.status_code == 404 if (ratelimit_exceeded or try_auth) and not token_is_set: - logging.warning( + logger.warning( "Received rate limit exception, setting the auth header and retry" ) set_auth_header() @@ -114,10 +116,10 @@ def get_gh_api( exc = e if try_cnt < retries: - logging.info("Exception '%s' while getting, retry %i", exc, try_cnt) + logger.info("Exception '%s' while getting, retry %i", exc, try_cnt) time.sleep(sleep) - raise APIException("Unable to request data from GH API") from exc + raise APIException(f"Unable to request data from GH API: {url}") from exc def get_build_name_for_check(check_name: str) -> str: @@ -128,25 +130,25 @@ def read_build_urls(build_name: str, reports_path: Union[Path, str]) -> List[str for root, _, files in os.walk(reports_path): for file in files: if file.endswith(f"_{build_name}.json"): - logging.info("Found build report json %s for %s", file, build_name) + logger.info("Found build report json %s for %s", file, build_name) with open( os.path.join(root, file), "r", encoding="utf-8" ) as file_handler: build_report = json.load(file_handler) return build_report["build_urls"] # type: ignore - logging.info("A build report is not found for %s", build_name) + logger.info("A build report is not found for %s", build_name) return [] def download_build_with_progress(url: str, path: Path) -> None: - logging.info("Downloading from %s to temp path %s", url, path) + logger.info("Downloading from %s to temp path %s", url, path) for i in range(DOWNLOAD_RETRIES_COUNT): try: response = get_with_retries(url, retries=1, stream=True) total_length = int(response.headers.get("content-length", 0)) if path.is_file() and total_length and path.stat().st_size == total_length: - logging.info( + logger.info( "The file %s already exists and have a proper size %s", path, total_length, @@ -155,14 +157,14 @@ def download_build_with_progress(url: str, path: Path) -> None: with open(path, "wb") as f: if total_length == 0: - logging.info( + logger.info( "No content-length, will download file without progress" ) f.write(response.content) else: dl = 0 - logging.info("Content length is %ld bytes", total_length) + logger.info("Content length is %ld bytes", total_length) for data in response.iter_content(chunk_size=4096): dl += len(data) f.write(data) @@ -177,8 +179,8 @@ def download_build_with_progress(url: str, path: Path) -> None: except Exception as e: if sys.stdout.isatty(): sys.stdout.write("\n") - if os.path.exists(path): - os.remove(path) + if path.exists(): + path.unlink() if i + 1 < DOWNLOAD_RETRIES_COUNT: time.sleep(3) @@ -189,7 +191,7 @@ def download_build_with_progress(url: str, path: Path) -> None: if sys.stdout.isatty(): sys.stdout.write("\n") - logging.info("Downloading finished") + logger.info("Downloading finished") def download_builds( @@ -198,7 +200,7 @@ 
def download_builds( for url in build_urls: if filter_fn(url): fname = os.path.basename(url.replace("%2B", "+").replace("%20", " ")) - logging.info("Will download %s to %s", fname, result_path) + logger.info("Will download %s to %s", fname, result_path) download_build_with_progress(url, result_path / fname) @@ -210,7 +212,7 @@ def download_builds_filter( ) -> None: build_name = get_build_name_for_check(check_name) urls = read_build_urls(build_name, reports_path) - logging.info("The build report for %s contains the next URLs: %s", build_name, urls) + logger.info("The build report for %s contains the next URLs: %s", build_name, urls) if not urls: raise DownloadException("No build URLs found") @@ -247,7 +249,7 @@ def get_clickhouse_binary_url( ) -> Optional[str]: build_name = get_build_name_for_check(check_name) urls = read_build_urls(build_name, reports_path) - logging.info("The build report for %s contains the next URLs: %s", build_name, urls) + logger.info("The build report for %s contains the next URLs: %s", build_name, urls) for url in urls: check_url = url if "?" in check_url: diff --git a/tests/ci/changelog.py b/tests/ci/changelog.py new file mode 100755 index 00000000000..fcb61d3f605 --- /dev/null +++ b/tests/ci/changelog.py @@ -0,0 +1,455 @@ +#!/usr/bin/env python3 +# In our CI this script runs in style-test containers + +import argparse +import logging +import re +from datetime import date, timedelta +from pathlib import Path +from subprocess import DEVNULL +from typing import Any, Dict, List, Optional, TextIO + +import tqdm # type: ignore +from github.GithubException import RateLimitExceededException, UnknownObjectException +from github.NamedUser import NamedUser +from thefuzz.fuzz import ratio # type: ignore + +from cache_utils import GitHubCache +from env_helper import TEMP_PATH +from git_helper import git_runner, is_shallow +from github_helper import GitHub, PullRequest, PullRequests, Repository +from s3_helper import S3Helper +from version_helper import ( + FILE_WITH_VERSION_PATH, + get_abs_path, + get_version_from_repo, + get_version_from_tag, +) + +# This array gives the preferred category order, and is also used to +# normalize category names. +# Categories are used in .github/PULL_REQUEST_TEMPLATE.md, keep comments there +# updated accordingly +categories_preferred_order = ( + "Backward Incompatible Change", + "New Feature", + "Performance Improvement", + "Improvement", + "Critical Bug Fix", + "Bug Fix", + "Build/Testing/Packaging Improvement", + "Other", +) + +FROM_REF = "" +TO_REF = "" +SHA_IN_CHANGELOG = [] # type: List[str] +gh = GitHub(create_cache_dir=False) +runner = git_runner + + +class Description: + def __init__( + self, number: int, user: NamedUser, html_url: str, entry: str, category: str + ): + self.number = number + self.html_url = html_url + self.user = gh.get_user_cached(user._rawData["login"]) # type: ignore + self.entry = entry + self.category = category + + @property + def formatted_entry(self) -> str: + # Substitute issue links. + # 1) issue number w/o markdown link + entry = re.sub( + r"([^[])#([0-9]{4,})", + r"\1[#\2](https://github.com/ClickHouse/ClickHouse/issues/\2)", + self.entry, + ) + # 2) issue URL w/o markdown link + # including #issuecomment-1 or #event-12 + entry = re.sub( + r"([^(])(https://github.com/ClickHouse/ClickHouse/issues/([0-9]{4,})[-#a-z0-9]*)", + r"\1[#\3](\2)", + entry, + ) + # It's possible that we face a secondary rate limit. 
+ # In this case we should sleep until we get it + while True: + try: + user_name = self.user.name if self.user.name else self.user.login + break + except UnknownObjectException: + user_name = self.user.login + break + except RateLimitExceededException: + gh.sleep_on_rate_limit() + return ( + f"* {entry} [#{self.number}]({self.html_url}) " + f"([{user_name}]({self.user.html_url}))." + ) + + # Sort PR descriptions by numbers + def __eq__(self, other: Any) -> bool: + if not isinstance(self, type(other)): + raise NotImplementedError + return bool(self.number == other.number) + + def __lt__(self, other: "Description") -> bool: + return self.number < other.number + + +def get_descriptions(prs: PullRequests) -> Dict[str, List[Description]]: + descriptions = {} # type: Dict[str, List[Description]] + repos = {} # type: Dict[str, Repository] + for pr in prs: + # See https://github.com/PyGithub/PyGithub/issues/2202, + # obj._rawData doesn't spend additional API requests + # We'll save some requests + # pylint: disable=protected-access + repo_name = pr._rawData["base"]["repo"]["full_name"] + # pylint: enable=protected-access + if repo_name not in repos: + repos[repo_name] = pr.base.repo + in_changelog = False + merge_commit = pr.merge_commit_sha + if merge_commit is None: + logging.warning("PR %s does not have merge-commit, skipping", pr.number) + continue + + in_changelog = merge_commit in SHA_IN_CHANGELOG + if in_changelog: + desc = generate_description(pr, repos[repo_name]) + if desc: + if desc.category not in descriptions: + descriptions[desc.category] = [] + descriptions[desc.category].append(desc) + + for descs in descriptions.values(): + descs.sort() + + return descriptions + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Generate a changelog in Markdown format between given tags. " + "It fetches all tags and unshallow the git repository automatically", + ) + parser.add_argument( + "-v", + "--verbose", + action="count", + default=0, + help="set the script verbosity, could be used multiple", + ) + parser.add_argument( + "--debug-helpers", + action="store_true", + help="add debug logging for git_helper and github_helper", + ) + parser.add_argument( + "--output", + type=argparse.FileType("w"), + default="-", + help="output file for changelog", + ) + parser.add_argument( + "--repo", + default="ClickHouse/ClickHouse", + help="a repository to query for pull-requests from GitHub", + ) + parser.add_argument( + "--jobs", + type=int, + default=10, + help="number of jobs to get pull-requests info from GitHub API", + ) + parser.add_argument( + "--gh-user-or-token", + help="user name or GH token to authenticate", + ) + parser.add_argument( + "--gh-password", + help="a password that should be used when user is given", + ) + parser.add_argument( + "--with-testing-tags", + action="store_true", + help="by default '*-testing' tags are ignored, this argument enables them too", + ) + parser.add_argument( + "--from", + dest="from_ref", + help="git ref for a starting point of changelog, by default is calculated " + "automatically to match a previous tag in history", + ) + parser.add_argument( + "to_ref", + metavar="TO_REF", + help="git ref for the changelog end", + ) + args = parser.parse_args() + return args + + +# This function mirrors the PR description checks in ClickhousePullRequestTrigger. +# Returns None if the PR should not be mentioned in changelog. 
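For reference, a quick sketch of what the two `re.sub` patterns in `formatted_entry` above do; the entry text and issue number here are made up for illustration, and the leading `[^[]` / `[^(]` guards are what keep already-linked references from being wrapped twice.

```python
import re

entry = "Respect cgroup memory limits in the cache (fixes #61234)."

# 1) bare issue numbers -> markdown links (same pattern as formatted_entry above)
entry = re.sub(
    r"([^[])#([0-9]{4,})",
    r"\1[#\2](https://github.com/ClickHouse/ClickHouse/issues/\2)",
    entry,
)
# 2) bare issue URLs -> markdown links; the [^(] guard skips URLs that are
# already inside a markdown link, so the result of step 1 is left untouched
entry = re.sub(
    r"([^(])(https://github.com/ClickHouse/ClickHouse/issues/([0-9]{4,})[-#a-z0-9]*)",
    r"\1[#\3](\2)",
    entry,
)

print(entry)
# Respect cgroup memory limits in the cache
#   (fixes [#61234](https://github.com/ClickHouse/ClickHouse/issues/61234)).
```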
+def generate_description(item: PullRequest, repo: Repository) -> Optional[Description]: + backport_number = item.number + if item.head.ref.startswith("backport/"): + branch_parts = item.head.ref.split("/") + if len(branch_parts) == 3: + try: + item = gh.get_pull_cached(repo, int(branch_parts[-1])) + except Exception as e: + logging.warning("unable to get backpoted PR, exception: %s", e) + else: + logging.warning( + "The branch %s doesn't match backport template, using PR %s as is", + item.head.ref, + item.number, + ) + description = item.body + # Don't skip empty lines because they delimit parts of description + lines = [x.strip() for x in (description.split("\n") if description else [])] + lines = [re.sub(r"\s+", " ", ln) for ln in lines] + + category = "" + entry = "" + + if lines: + i = 0 + while i < len(lines): + if re.match(r"(?i)^[#>*_ ]*change\s*log\s*category", lines[i]): + i += 1 + if i >= len(lines): + break + # Can have one empty line between header and the category itself. + # Filter it out. + if not lines[i]: + i += 1 + if i >= len(lines): + break + category = re.sub(r"^[-*\s]*", "", lines[i]) + i += 1 + elif re.match( + r"(?i)^[#>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i] + ): + i += 1 + # Can have one empty line between header and the entry itself. + # Filter it out. + if i < len(lines) and not lines[i]: + i += 1 + # All following lines until empty one are the changelog entry. + entry_lines = [] + while i < len(lines) and lines[i]: + entry_lines.append(lines[i]) + i += 1 + entry = " ".join(entry_lines) + else: + i += 1 + + # Remove excessive bullets from the entry. + if re.match(r"^[\-\*] ", entry): + entry = entry[2:] + + # Better style. + if re.match(r"^[a-z]", entry): + entry = entry.capitalize() + + if not category: + # Shouldn't happen, because description check in CI should catch such PRs. + # Fall through, so that it shows up in output and the user can fix it. + category = "NO CL CATEGORY" + + # Filter out documentations changelog before not-for-changelog + if re.match( + r"(?i)doc", + category, + ): + return None + + # Filter out the PR categories that are not for changelog. + if re.search( + r"(?i)((non|in|not|un)[-\s]*significant)|" + r"(not[ ]*for[ ]*changelog)|" + r"(changelog[ ]*entry[ ]*is[ ]*not[ ]*required)", + category, + ): + category = "NOT FOR CHANGELOG / INSIGNIFICANT" + entry = item.title + + # Normalize bug fixes + if re.match( + r"(?i)bug\Wfix", + category, + ): + category = "Bug Fix (user-visible misbehavior in an official stable release)" + + if backport_number != item.number: + entry = f"Backported in #{backport_number}: {entry}" + + if not entry: + # Shouldn't happen, because description check in CI should catch such PRs. + category = "NO CL ENTRY" + entry = "NO CL ENTRY: '" + item.title + "'" + + entry = entry.strip() + if entry[-1] != ".": + entry += "." 
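To make the header parsing above easier to follow, here is a trimmed-down, standalone walk-through against a made-up PR body; the real function above additionally handles backports, capitalization, and the not-for-changelog filters.

```python
import re

sample_body = """
### Changelog category (leave one):
- Improvement

### Changelog entry (a user-readable short description of the changes):
Support the `_time` virtual column in file-like storages.
"""

lines = [re.sub(r"\s+", " ", ln.strip()) for ln in sample_body.split("\n")]

category = ""
entry = ""
i = 0
while i < len(lines):
    if re.match(r"(?i)^[#>*_ ]*change\s*log\s*category", lines[i]):
        # category is on the next non-empty line, possibly prefixed with a bullet
        i += 1
        if i < len(lines) and not lines[i]:
            i += 1
        if i < len(lines):
            category = re.sub(r"^[-*\s]*", "", lines[i])
        i += 1
    elif re.match(r"(?i)^[#>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i]):
        # entry is every following line until the first empty one
        i += 1
        if i < len(lines) and not lines[i]:
            i += 1
        entry_lines = []
        while i < len(lines) and lines[i]:
            entry_lines.append(lines[i])
            i += 1
        entry = " ".join(entry_lines)
    else:
        i += 1

print(category)  # Improvement
print(entry)     # Support the `_time` virtual column in file-like storages.
```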
+ + for c in categories_preferred_order: + if ratio(category.lower(), c.lower()) >= 90: + category = c + break + + return Description(item.number, item.user, item.html_url, entry, category) + + +def write_changelog( + fd: TextIO, descriptions: Dict[str, List[Description]], year: int +) -> None: + to_commit = runner(f"git rev-parse {TO_REF}^{{}}")[:11] + from_commit = runner(f"git rev-parse {FROM_REF}^{{}}")[:11] + fd.write( + f"---\nsidebar_position: 1\nsidebar_label: {year}\n---\n\n" + f"# {year} Changelog\n\n" + f"### ClickHouse release {TO_REF} ({to_commit}) FIXME " + f"as compared to {FROM_REF} ({from_commit})\n\n" + ) + + seen_categories = [] # type: List[str] + for category in categories_preferred_order: + if category in descriptions: + seen_categories.append(category) + fd.write(f"#### {category}\n") + for desc in descriptions[category]: + fd.write(f"{desc.formatted_entry}\n") + + fd.write("\n") + + for category in sorted(descriptions): + if category not in seen_categories: + fd.write(f"#### {category}\n\n") + for desc in descriptions[category]: + fd.write(f"{desc.formatted_entry}\n") + + fd.write("\n") + + +def check_refs(from_ref: Optional[str], to_ref: str, with_testing_tags: bool) -> None: + global FROM_REF, TO_REF + TO_REF = to_ref + + # Check TO_REF + runner.run(f"git rev-parse {TO_REF}") + + # Check from_ref + if from_ref is not None: + runner.run(f"git rev-parse {FROM_REF}") + FROM_REF = from_ref + return + + # Get the cmake/autogenerated_versions.txt from FROM_REF to read the version + # If the previous tag is greater than version in the FROM_REF, + # then we need to add it to tags_to_exclude + temp_cmake = "tests/ci/tmp/autogenerated_versions.txt" + cmake_version = get_abs_path(temp_cmake) + cmake_version.write_text(runner(f"git show {TO_REF}:{FILE_WITH_VERSION_PATH}")) + to_ref_version = get_version_from_repo(cmake_version) + # Get all tags pointing to TO_REF + excluded_tags = runner.run(f"git tag --points-at '{TO_REF}^{{}}'").split("\n") + logging.info("All tags pointing to %s:\n%s", TO_REF, excluded_tags) + if not with_testing_tags: + excluded_tags.append("*-testing") + while not from_ref: + exclude = " ".join([f"--exclude='{tag}'" for tag in excluded_tags]) + from_ref_tag = runner(f"git describe --abbrev=0 --tags {exclude} '{TO_REF}'") + from_ref_version = get_version_from_tag(from_ref_tag) + if from_ref_version <= to_ref_version: + from_ref = from_ref_tag + break + excluded_tags.append(from_ref_tag) + + cmake_version.unlink() + FROM_REF = from_ref + + +def set_sha_in_changelog(): + global SHA_IN_CHANGELOG + SHA_IN_CHANGELOG = runner.run( + f"git log --format=format:%H {FROM_REF}..{TO_REF}" + ).split("\n") + + +def get_year(prs: PullRequests) -> int: + if not prs: + return date.today().year + return max(pr.created_at.year for pr in prs) + + +def main(): + log_levels = [logging.WARN, logging.INFO, logging.DEBUG] + args = parse_args() + logging.basicConfig( + format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d]:\n%(message)s", + level=log_levels[min(args.verbose, 2)], + ) + if args.debug_helpers: + logging.getLogger("github_helper").setLevel(logging.DEBUG) + logging.getLogger("git_helper").setLevel(logging.DEBUG) + + # Get the full repo + if is_shallow(): + logging.info("Unshallow repository") + runner.run("git fetch --unshallow", stderr=DEVNULL) + logging.info("Fetching all tags") + runner.run("git fetch --tags", stderr=DEVNULL) + + check_refs(args.from_ref, args.to_ref, args.with_testing_tags) + set_sha_in_changelog() + + logging.info("Using %s..%s as 
changelog interval", FROM_REF, TO_REF) + + # use merge-base commit as a starting point, if used ref in another branch + base_commit = runner.run(f"git merge-base '{FROM_REF}^{{}}' '{TO_REF}^{{}}'") + # Get starting and ending dates for gathering PRs + # Add one day after and before to mitigate TZ possible issues + # `tag^{}` format gives commit ref when we have annotated tags + # format %cs gives a committer date, works better for cherry-picked commits + from_date = runner.run(f"git log -1 --format=format:%cs '{base_commit}'") + to_date = runner.run(f"git log -1 --format=format:%cs '{TO_REF}^{{}}'") + merged = ( + date.fromisoformat(from_date) - timedelta(1), + date.fromisoformat(to_date) + timedelta(1), + ) + + # Get all PRs for the given time frame + global gh + gh = GitHub( + args.gh_user_or_token, + args.gh_password, + create_cache_dir=False, + per_page=100, + pool_size=args.jobs, + ) + temp_path = Path(TEMP_PATH) + gh_cache = GitHubCache(gh.cache_path, temp_path, S3Helper()) + gh_cache.download() + query = f"type:pr repo:{args.repo} is:merged" + prs = gh.get_pulls_from_search( + query=query, merged=merged, sort="created", progress_func=tqdm.tqdm + ) + + descriptions = get_descriptions(prs) + changelog_year = get_year(prs) + + write_changelog(args.output, descriptions, changelog_year) + gh_cache.upload() + + +if __name__ == "__main__": + main() diff --git a/tests/ci/cherry_pick.py b/tests/ci/cherry_pick.py index 629464d0422..459be12ada0 100644 --- a/tests/ci/cherry_pick.py +++ b/tests/ci/cherry_pick.py @@ -532,9 +532,9 @@ class Backport: for br in branches: br.process(self.dry_run) - for br in branches: - if br.backported: - self.mark_pr_backported(pr) + if all(br.backported for br in branches): + # And check it after the running + self.mark_pr_backported(pr) def mark_pr_backported(self, pr: PullRequest) -> None: if self.dry_run: diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 775e1f98b34..6ab1eb8bac4 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -280,6 +280,7 @@ builds_job_config = JobConfig( "./packages", "./docker/packager/packager", "./rust", + "./tests/ci/version_helper.py", # FIXME: This is a WA to rebuild the CH and recreate the Performance.tar.zst artifact # when there are changes in performance test scripts. 
# Due to the current design of the perf test we need to rebuild CH when the performance test changes, @@ -685,9 +686,6 @@ class CIConfig: return result def get_job_parents(self, check_name: str) -> List[str]: - if check_name in self.builds_report_config: - return self.builds_report_config[check_name].builds - res = [] check_name = normalize_string(check_name) for config in ( @@ -903,10 +901,38 @@ CI_CONFIG = CIConfig( ), CILabels.CI_SET_REQUIRED: LabelConfig(run_jobs=REQUIRED_CHECKS), CILabels.CI_SET_NORMAL_BUILDS: LabelConfig( - run_jobs=[JobNames.STYLE_CHECK, JobNames.BUILD_CHECK] + run_jobs=[ + JobNames.STYLE_CHECK, + JobNames.BUILD_CHECK, + Build.PACKAGE_RELEASE, + Build.PACKAGE_AARCH64, + Build.PACKAGE_ASAN, + Build.PACKAGE_UBSAN, + Build.PACKAGE_TSAN, + Build.PACKAGE_MSAN, + Build.PACKAGE_DEBUG, + Build.BINARY_RELEASE, + Build.PACKAGE_RELEASE_COVERAGE, + Build.FUZZERS, + ] ), CILabels.CI_SET_SPECIAL_BUILDS: LabelConfig( - run_jobs=[JobNames.STYLE_CHECK, JobNames.BUILD_CHECK_SPECIAL] + run_jobs=[ + JobNames.STYLE_CHECK, + JobNames.BUILD_CHECK_SPECIAL, + Build.BINARY_TIDY, + Build.BINARY_DARWIN, + Build.BINARY_AARCH64, + Build.BINARY_AARCH64_V80COMPAT, + Build.BINARY_FREEBSD, + Build.BINARY_DARWIN_AARCH64, + Build.BINARY_PPC64LE, + Build.BINARY_RISCV64, + Build.BINARY_S390X, + Build.BINARY_LOONGARCH64, + Build.BINARY_AMD64_COMPAT, + Build.BINARY_AMD64_MUSL, + ] ), CILabels.CI_SET_NON_REQUIRED: LabelConfig( run_jobs=[job for job in JobNames if job not in REQUIRED_CHECKS] diff --git a/tests/ci/functional_test_check.py b/tests/ci/functional_test_check.py index 5bb46d7ec2f..9678efd8631 100644 --- a/tests/ci/functional_test_check.py +++ b/tests/ci/functional_test_check.py @@ -309,9 +309,6 @@ def main(): state, description, test_results, additional_logs = process_results( result_path, server_log_path ) - # FIXME (alesapin) - if "azure" in check_name: - state = "success" else: print( "This is validate bugfix or flaky check run, but no changes test to run - skip with success" diff --git a/tests/ci/git_helper.py b/tests/ci/git_helper.py index 8ec90dd7b2d..6b66bc44d10 100644 --- a/tests/ci/git_helper.py +++ b/tests/ci/git_helper.py @@ -7,7 +7,7 @@ import os.path as p import re import subprocess import tempfile -from typing import Any, List, Optional +from typing import Any, List, Literal, Optional logger = logging.getLogger(__name__) @@ -15,7 +15,9 @@ logger = logging.getLogger(__name__) # \A and \Z match only start and end of the whole string RELEASE_BRANCH_REGEXP = r"\A\d+[.]\d+\Z" TAG_REGEXP = ( - r"\Av\d{2}[.][1-9]\d*[.][1-9]\d*[.][1-9]\d*-(testing|prestable|stable|lts)\Z" + r"\Av\d{2}" # First two digits of major part + r"([.][1-9]\d*){3}" # minor.patch.tweak parts + r"-(new|testing|prestable|stable|lts)\Z" # suffix with a version type ) SHA_REGEXP = re.compile(r"\A([0-9]|[a-f]){40}\Z") @@ -122,17 +124,35 @@ class Git: _tag_pattern = re.compile(TAG_REGEXP) def __init__(self, ignore_no_tags: bool = False): + """ + new_tag is used for special v24.1.1.1-new tags where the previous version is moved to the release branch + * 66666666666 Some another commit with version 24.8.1.xxxxx-testing, tweak is counted from new_tag = v24.8.1.1-new + | * 55555555555 (tag: v24.7.1.123123123-stable, branch: 24.7) tweak counted from new_tag = v24.7.1.1-new + |/ + * 44444444444 (tag: v24.8.1.1-new) + | * 33333333333 (tag: v24.6.1.123123123-stable, branch: 24.6) tweak counted from new_tag = v24.6.1.1-new + |/ + * 22222222222 (tag: v24.7.1.1-new) + | * 11111111111 (tag: v24.5.1.123123123-stable, branch: 24.5) 
tweak counted from new_tag = v24.4.1.2088-stable + |/ + * 00000000000 (tag: v24.6.1.1-new) + * 6d4b31322d1 (tag: v24.4.1.2088-stable) + * 2c5c589a882 (tag: v24.3.1.2672-lts) + * 891689a4150 (tag: v24.2.1.2248-stable) + * 5a024dfc093 (tag: v24.1.1.2048-stable) + * a2faa65b080 (tag: v23.12.1.1368-stable) + * 05bc8ef1e02 (tag: v23.11.1.2711-stable) + """ self.root = git_runner.cwd self._ignore_no_tags = ignore_no_tags self.run = git_runner.run self.latest_tag = "" self.new_tag = "" - self.new_branch = "" self.branch = "" self.sha = "" self.sha_short = "" - self.description = "shallow-checkout" - self.commits_since_tag = 0 + self.commits_since_latest = 0 + self.commits_since_new = 0 self.update() def update(self): @@ -155,10 +175,20 @@ class Git: stderr = subprocess.DEVNULL if suppress_stderr else None self.latest_tag = self.run("git describe --tags --abbrev=0", stderr=stderr) # Format should be: {latest_tag}-{commits_since_tag}-g{sha_short} - self.description = self.run("git describe --tags --long") - self.commits_since_tag = int( + self.commits_since_latest = int( self.run(f"git rev-list {self.latest_tag}..HEAD --count") ) + if self.latest_tag.endswith("-new"): + # We won't change the behaviour of the the "latest_tag" + # So here we set "new_tag" to the previous tag in the graph, that will allow + # getting alternative "tweak" + self.new_tag = self.run( + f"git describe --tags --abbrev=0 --exclude='{self.latest_tag}'", + stderr=stderr, + ) + self.commits_since_new = int( + self.run(f"git rev-list {self.new_tag}..HEAD --count") + ) @staticmethod def check_tag(value: str) -> None: @@ -187,19 +217,34 @@ class Git: @property def tweak(self) -> int: - if not self.latest_tag.endswith("-testing"): + return self._tweak("latest") + + @property + def tweak_to_new(self) -> int: + return self._tweak("new") + + def _tweak(self, tag_type: Literal["latest", "new"]) -> int: + """Accepts latest or new as a tag_type and returns the tweak number to it""" + if tag_type == "latest": + commits = self.commits_since_latest + tag = self.latest_tag + else: + commits = self.commits_since_new + tag = self.new_tag + + if not tag.endswith("-testing"): # When we are on the tag, we still need to have tweak=1 to not # break cmake with versions like 12.13.14.0 - if not self.commits_since_tag: - # We are in a tagged commit. The tweak should match the - # current version's value - version = self.latest_tag.split("-", maxsplit=1)[0] - try: - return int(version.split(".")[-1]) - except ValueError: - # There are no tags, or a wrong tag. Return default - return TWEAK - return self.commits_since_tag + if commits: + return commits + # We are in a tagged commit or shallow checkout. The tweak should match the + # current version's value + version = tag.split("-", maxsplit=1)[0] + try: + return int(version.split(".")[-1]) + except ValueError: + # There are no tags (shallow checkout), or a wrong tag. 
Return default + return TWEAK - version = self.latest_tag.split("-", maxsplit=1)[0] - return int(version.split(".")[-1]) + self.commits_since_tag + version = tag.split("-", maxsplit=1)[0] + return int(version.split(".")[-1]) + commits diff --git a/tests/ci/github_helper.py b/tests/ci/github_helper.py index eb0f6c24527..b6407c5d531 100644 --- a/tests/ci/github_helper.py +++ b/tests/ci/github_helper.py @@ -6,7 +6,7 @@ from datetime import date, datetime, timedelta from os import path as p from pathlib import Path from time import sleep -from typing import List, Optional, Tuple, Union +from typing import Any, Callable, List, Optional, Tuple, Union import github import requests @@ -49,38 +49,43 @@ class GitHub(github.Github): """Wrapper around search method with throttling and splitting by date. We split only by the first""" - splittable = False + splittable_arg = "" + splittable_value = [] for arg, value in kwargs.items(): if arg in ["closed", "created", "merged", "updated"]: if hasattr(value, "__iter__") and not isinstance(value, str): - assert [True for v in value if isinstance(v, (date, datetime))] + assert all(True for v in value if isinstance(v, (date, datetime))) assert len(value) == 2 kwargs[arg] = f"{value[0].isoformat()}..{value[1].isoformat()}" - if not splittable: + if not splittable_arg: # We split only by the first met splittable argument - preserved_arg = arg - preserved_value = value middle_value = value[0] + (value[1] - value[0]) / 2 - splittable = middle_value not in value + if middle_value in value: + # When the middle value in itareble value, we can't use it + # to split by dates later + continue + splittable_arg = arg + splittable_value = value continue assert isinstance(value, (date, datetime, str)) inter_result = [] # type: Issues + exception = RateLimitExceededException(0) for i in range(self.retries): try: logger.debug("Search issues, args=%s, kwargs=%s", args, kwargs) result = super().search_issues(*args, **kwargs) - if result.totalCount == 1000 and splittable: + if result.totalCount == 1000 and splittable_arg: # The hard limit is 1000. If it's splittable, then we make # two subrequests requests with less time frames logger.debug( "The search result contain exactly 1000 results, " "splitting %s=%s by middle point %s", - preserved_arg, - kwargs[preserved_arg], + splittable_arg, + kwargs[splittable_arg], middle_value, ) - kwargs[preserved_arg] = [preserved_value[0], middle_value] + kwargs[splittable_arg] = [splittable_value[0], middle_value] inter_result.extend(self.search_issues(*args, **kwargs)) if isinstance(middle_value, date): # When middle_value is a date, 2022-01-01..2022-01-03 @@ -88,9 +93,10 @@ class GitHub(github.Github): # 2022-01-02..2022-01-03, so we have results for # 2022-01-02 twicely. We split it to # 2022-01-01..2022-01-02 and 2022-01-03..2022-01-03. 
- # 2022-01-01..2022-01-02 aren't split, see splittable + # 2022-01-01..2022-01-02 aren't split, see splittable_arg + # definition above for kwargs.items middle_value += timedelta(days=1) - kwargs[preserved_arg] = [middle_value, preserved_value[1]] + kwargs[splittable_arg] = [middle_value, splittable_value[1]] inter_result.extend(self.search_issues(*args, **kwargs)) return inter_result @@ -104,12 +110,15 @@ class GitHub(github.Github): raise exception # pylint: enable=signature-differs - def get_pulls_from_search(self, *args, **kwargs) -> PullRequests: # type: ignore + def get_pulls_from_search(self, *args: Any, **kwargs: Any) -> PullRequests: """The search api returns actually issues, so we need to fetch PullRequests""" issues = self.search_issues(*args, **kwargs) repos = {} prs = [] # type: PullRequests - for issue in issues: + progress_func = kwargs.pop( + "progress_func", lambda x: x + ) # type: Callable[[Issues], Issues] + for issue in progress_func(issues): # See https://github.com/PyGithub/PyGithub/issues/2202, # obj._rawData doesn't spend additional API requests # pylint: disable=protected-access diff --git a/tests/ci/pr_info.py b/tests/ci/pr_info.py index ccf5dc23121..dda5b30f1e3 100644 --- a/tests/ci/pr_info.py +++ b/tests/ci/pr_info.py @@ -59,7 +59,7 @@ def get_pr_for_commit(sha, ref): data = response.json() our_prs = [] # type: List[Dict] if len(data) > 1: - print("Got more than one pr for commit", sha) + logging.warning("Got more than one pr for commit %s", sha) for pr in data: # We need to check if the PR is created in our repo, because # https://github.com/kaynewu/ClickHouse/pull/2 @@ -71,13 +71,20 @@ def get_pr_for_commit(sha, ref): if pr["head"]["ref"] in ref: return pr our_prs.append(pr) - print( - f"Cannot find PR with required ref {ref}, sha {sha} - returning first one" + logging.warning( + "Cannot find PR with required ref %s, sha %s - returning first one", + ref, + sha, ) first_pr = our_prs[0] return first_pr except Exception as ex: - print(f"Cannot fetch PR info from commit {ref}, {sha}", ex) + logging.error( + "Cannot fetch PR info from commit ref %s, sha %s, exception: %s", + ref, + sha, + ex, + ) return None @@ -259,12 +266,12 @@ class PRInfo: self.diff_urls.append( self.compare_url( pull_request["base"]["repo"]["default_branch"], - pull_request["head"]["label"], + pull_request["head"]["sha"], ) ) self.diff_urls.append( self.compare_url( - pull_request["head"]["label"], + pull_request["head"]["sha"], pull_request["base"]["repo"]["default_branch"], ) ) @@ -279,7 +286,7 @@ class PRInfo: # itself, but as well files changed since we branched out self.diff_urls.append( self.compare_url( - pull_request["head"]["label"], + pull_request["head"]["sha"], pull_request["base"]["repo"]["default_branch"], ) ) @@ -289,8 +296,10 @@ class PRInfo: else: # assume this is a dispatch self.event_type = EventType.DISPATCH - print("event.json does not match pull_request or push:") - print(json.dumps(github_event, sort_keys=True, indent=4)) + logging.warning( + "event.json does not match pull_request or push:\n%s", + json.dumps(github_event, sort_keys=True, indent=4), + ) self.sha = os.getenv( "GITHUB_SHA", "0000000000000000000000000000000000000000" ) @@ -330,7 +339,7 @@ class PRInfo: return self.event_type == EventType.DISPATCH def compare_pr_url(self, pr_object: dict) -> str: - return self.compare_url(pr_object["base"]["label"], pr_object["head"]["label"]) + return self.compare_url(pr_object["base"]["sha"], pr_object["head"]["sha"]) @staticmethod def compare_url(first: str, second: str) 
-> str: @@ -357,7 +366,7 @@ class PRInfo: diff_object = PatchSet(response.text) self.changed_files.update({f.path for f in diff_object}) self.changed_files_requested = True - print(f"Fetched info about {len(self.changed_files)} changed files") + logging.info("Fetched info about %s changed files", len(self.changed_files)) def get_dict(self): return { diff --git a/tests/ci/release.py b/tests/ci/release.py index 2ae82177c67..7490ce0a373 100755 --- a/tests/ci/release.py +++ b/tests/ci/release.py @@ -94,6 +94,7 @@ class Release: self._version = get_version_from_repo(git=self._git) self.release_version = self.version self._release_branch = "" + self._version_new_tag = None # type: Optional[ClickHouseVersion] self._rollback_stack = [] # type: List[str] def run( @@ -180,7 +181,8 @@ class Release: ) raise - self.check_commit_release_ready() + if self.release_type == self.PATCH: + self.check_commit_release_ready() def do( self, check_dirty: bool, check_run_from_master: bool, check_branch: bool @@ -328,10 +330,16 @@ class Release: self.check_no_tags_after() # Create release branch self.read_version() - with self._create_branch(self.release_branch, self.release_commit): - with self._checkout(self.release_branch, True): - with self._bump_release_branch(): - yield + assert self._version_new_tag is not None + with self._create_tag( + self._version_new_tag.describe, + self.release_commit, + f"Initial commit for release {self._version_new_tag.major}.{self._version_new_tag.minor}", + ): + with self._create_branch(self.release_branch, self.release_commit): + with self._checkout(self.release_branch, True): + with self._bump_release_branch(): + yield @contextmanager def patch_release(self): @@ -444,6 +452,11 @@ class Release: self.version.with_description(VersionType.TESTING) self._update_cmake_contributors(self.version) self._commit_cmake_contributors(self.version) + # Create a version-new tag + self._version_new_tag = self.version.copy() + self._version_new_tag.tweak = 1 + self._version_new_tag.with_description(VersionType.NEW) + with self._push(helper_branch): body_file = get_abs_path(".github/PULL_REQUEST_TEMPLATE.md") # The following command is rolled back by deleting branch in self._push @@ -458,10 +471,10 @@ class Release: @contextmanager def _checkout(self, ref: str, with_checkout_back: bool = False) -> Iterator[None]: + self._git.update() orig_ref = self._git.branch or self._git.sha - need_rollback = False + rollback_cmd = "" if ref not in (self._git.branch, self._git.sha): - need_rollback = True self.run(f"git checkout {ref}") # checkout is not put into rollback_stack intentionally rollback_cmd = f"git checkout {orig_ref}" @@ -474,7 +487,7 @@ class Release: self.run(f"git reset --hard; git checkout -f {orig_ref}") raise # Normal flow when we need to checkout back - if with_checkout_back and need_rollback: + if with_checkout_back and rollback_cmd: self.run(rollback_cmd) @contextmanager @@ -510,9 +523,9 @@ class Release: @contextmanager def _create_gh_release(self, as_prerelease: bool) -> Iterator[None]: - with self._create_tag(): + tag = self.release_version.describe + with self._create_tag(tag, self.release_commit): # Preserve tag if version is changed - tag = self.release_version.describe prerelease = "" if as_prerelease: prerelease = "--prerelease" @@ -534,13 +547,13 @@ class Release: raise @contextmanager - def _create_tag(self): - tag = self.release_version.describe - self.run( - f"git tag -a -m 'Release {tag}' '{tag}' {self.release_commit}", - dry_run=self.dry_run, - ) - rollback_cmd = 
f"{self.dry_run_prefix}git tag -d '{tag}'" + def _create_tag( + self, tag: str, commit: str, tag_message: str = "" + ) -> Iterator[None]: + tag_message = tag_message or "Release {tag}" + # Create tag even in dry-run + self.run(f"git tag -a -m '{tag_message}' '{tag}' {commit}") + rollback_cmd = f"git tag -d '{tag}'" self._rollback_stack.append(rollback_cmd) try: with self._push(tag): diff --git a/tests/ci/test_git.py b/tests/ci/test_git.py index 3aedd8a8dea..60cd95b6869 100644 --- a/tests/ci/test_git.py +++ b/tests/ci/test_git.py @@ -1,10 +1,11 @@ #!/usr/bin/env python -from unittest.mock import patch import os.path as p import unittest +from dataclasses import dataclass +from unittest.mock import patch -from git_helper import Git, Runner, CWD +from git_helper import CWD, Git, Runner, git_runner class TestRunner(unittest.TestCase): @@ -35,8 +36,10 @@ class TestRunner(unittest.TestCase): class TestGit(unittest.TestCase): def setUp(self): """we use dummy git object""" + # get the git_runner's cwd to set it properly before the Runner is patched + _ = git_runner.cwd run_patcher = patch("git_helper.Runner.run", return_value="") - self.run_mock = run_patcher.start() + run_mock = run_patcher.start() self.addCleanup(run_patcher.stop) update_patcher = patch("git_helper.Git.update") update_mock = update_patcher.start() @@ -44,15 +47,13 @@ class TestGit(unittest.TestCase): self.git = Git() update_mock.assert_called_once() self.git.run("test") - self.run_mock.assert_called_once() - self.git.new_branch = "NEW_BRANCH_NAME" - self.git.new_tag = "v21.12.333.22222-stable" + run_mock.assert_called_once() self.git.branch = "old_branch" self.git.sha = "" self.git.sha_short = "" self.git.latest_tag = "" - self.git.description = "" - self.git.commits_since_tag = 0 + self.git.commits_since_latest = 0 + self.git.commits_since_new = 0 def test_tags(self): self.git.new_tag = "v21.12.333.22222-stable" @@ -71,11 +72,30 @@ class TestGit(unittest.TestCase): setattr(self.git, tag_attr, tag) def test_tweak(self): - self.git.commits_since_tag = 0 - self.assertEqual(self.git.tweak, 1) - self.git.commits_since_tag = 2 - self.assertEqual(self.git.tweak, 2) - self.git.latest_tag = "v21.12.333.22222-testing" - self.assertEqual(self.git.tweak, 22224) - self.git.commits_since_tag = 0 - self.assertEqual(self.git.tweak, 22222) + # tweak for the latest tag + @dataclass + class TestCase: + tag: str + commits: int + tweak: int + + cases = ( + TestCase("", 0, 1), + TestCase("", 2, 2), + TestCase("v21.12.333.22222-stable", 0, 22222), + TestCase("v21.12.333.22222-stable", 2, 2), + TestCase("v21.12.333.22222-testing", 0, 22222), + TestCase("v21.12.333.22222-testing", 2, 22224), + ) + for tag, commits, tweak in ( + ("latest_tag", "commits_since_latest", "tweak"), + ("new_tag", "commits_since_new", "tweak_to_new"), + ): + for tc in cases: + setattr(self.git, tag, tc.tag) + setattr(self.git, commits, tc.commits) + self.assertEqual( + getattr(self.git, tweak), + tc.tweak, + f"Wrong tweak for tag {tc.tag} and commits {tc.commits} of {tag}", + ) diff --git a/tests/ci/test_version.py b/tests/ci/test_version.py index 978edcc093e..c4f12091ec0 100644 --- a/tests/ci/test_version.py +++ b/tests/ci/test_version.py @@ -2,8 +2,13 @@ import unittest from argparse import ArgumentTypeError +from dataclasses import dataclass +from pathlib import Path import version_helper as vh +from git_helper import Git + +CHV = vh.ClickHouseVersion class TestFunctions(unittest.TestCase): @@ -32,3 +37,55 @@ class TestFunctions(unittest.TestCase): for error_case in 
error_cases: with self.assertRaises(ArgumentTypeError): version = vh.version_arg(error_case[0]) + + def test_get_version_from_repo(self): + @dataclass + class TestCase: + latest_tag: str + commits_since_latest: int + new_tag: str + commits_since_new: int + expected: CHV + + cases = ( + TestCase( + "v24.6.1.1-new", + 15, + "v24.4.1.2088-stable", + 415, + CHV(24, 5, 1, 54487, None, 415), + ), + TestCase( + "v24.6.1.1-testing", + 15, + "v24.4.1.2088-stable", + 415, + CHV(24, 5, 1, 54487, None, 16), + ), + TestCase( + "v24.6.1.1-stable", + 15, + "v24.4.1.2088-stable", + 415, + CHV(24, 5, 1, 54487, None, 15), + ), + TestCase( + "v24.5.1.1-stable", + 15, + "v24.4.1.2088-stable", + 415, + CHV(24, 5, 1, 54487, None, 15), + ), + ) + git = Git(True) + for tc in cases: + git.latest_tag = tc.latest_tag + git.commits_since_latest = tc.commits_since_latest + git.new_tag = tc.new_tag + git.commits_since_new = tc.commits_since_new + self.assertEqual( + vh.get_version_from_repo( + Path("tests/ci/tests/autogenerated_versions.txt"), git + ), + tc.expected, + ) diff --git a/tests/ci/tests/autogenerated_versions.txt b/tests/ci/tests/autogenerated_versions.txt new file mode 100644 index 00000000000..10028bf50c8 --- /dev/null +++ b/tests/ci/tests/autogenerated_versions.txt @@ -0,0 +1,12 @@ +# This variables autochanged by tests/ci/version_helper.py: + +# NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION, +# only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes. +SET(VERSION_REVISION 54487) +SET(VERSION_MAJOR 24) +SET(VERSION_MINOR 5) +SET(VERSION_PATCH 1) +SET(VERSION_GITHASH 70a1d3a63d47f0be077d67b8deb907230fc7cfb0) +SET(VERSION_DESCRIBE v24.5.1.1-testing) +SET(VERSION_STRING 24.5.1.1) +# end of autochange diff --git a/tests/ci/tmp/.gitignore b/tests/ci/tmp/.gitignore new file mode 100644 index 00000000000..72e8ffc0db8 --- /dev/null +++ b/tests/ci/tmp/.gitignore @@ -0,0 +1 @@ +* diff --git a/tests/ci/version_helper.py b/tests/ci/version_helper.py index 800bfcf52c3..50263f6ebb6 100755 --- a/tests/ci/version_helper.py +++ b/tests/ci/version_helper.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import logging -import os.path as p from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, ArgumentTypeError +from pathlib import Path from typing import Any, Dict, Iterable, List, Literal, Optional, Set, Tuple, Union from git_helper import TWEAK, Git, get_tags, git_runner, removeprefix @@ -22,7 +22,7 @@ VERSIONS = Dict[str, Union[int, str]] VERSIONS_TEMPLATE = """# This variables autochanged by tests/ci/version_helper.py: -# NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION, +# NOTE: VERSION_REVISION has nothing common with DBMS_TCP_PROTOCOL_VERSION, # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes. 
SET(VERSION_REVISION {revision}) SET(VERSION_MAJOR {major}) @@ -47,7 +47,7 @@ class ClickHouseVersion: patch: Union[int, str], revision: Union[int, str], git: Optional[Git], - tweak: Optional[str] = None, + tweak: Optional[Union[int, str]] = None, ): self._major = int(major) self._minor = int(minor) @@ -95,7 +95,7 @@ class ClickHouseVersion: if self._git is not None: self._git.update() return ClickHouseVersion( - self.major, self.minor, self.patch, self.revision, self._git, "1" + self.major, self.minor, self.patch, self.revision, self._git, 1 ) @property @@ -114,6 +114,10 @@ class ClickHouseVersion: def tweak(self) -> int: return self._tweak + @tweak.setter + def tweak(self, tweak: int) -> None: + self._tweak = tweak + @property def revision(self) -> int: return self._revision @@ -172,7 +176,7 @@ class ClickHouseVersion: self.patch, self.revision, self._git, - str(self.tweak), + self.tweak, ) try: copy.with_description(self.description) @@ -190,7 +194,9 @@ class ClickHouseVersion: and self.tweak == other.tweak ) - def __lt__(self, other: "ClickHouseVersion") -> bool: + def __lt__(self, other: Any) -> bool: + if not isinstance(self, type(other)): + return NotImplemented for part in ("major", "minor", "patch", "tweak"): if getattr(self, part) < getattr(other, part): return True @@ -220,10 +226,11 @@ ClickHouseVersions = List[ClickHouseVersion] class VersionType: LTS = "lts" + NEW = "new" PRESTABLE = "prestable" STABLE = "stable" TESTING = "testing" - VALID = (TESTING, PRESTABLE, STABLE, LTS) + VALID = (NEW, TESTING, PRESTABLE, STABLE, LTS) def validate_version(version: str) -> None: @@ -234,43 +241,56 @@ def validate_version(version: str) -> None: int(part) -def get_abs_path(path: str) -> str: - return p.abspath(p.join(git_runner.cwd, path)) +def get_abs_path(path: Union[Path, str]) -> Path: + return (Path(git_runner.cwd) / path).absolute() -def read_versions(versions_path: str = FILE_WITH_VERSION_PATH) -> VERSIONS: +def read_versions(versions_path: Union[Path, str] = FILE_WITH_VERSION_PATH) -> VERSIONS: versions = {} - path_to_file = get_abs_path(versions_path) - with open(path_to_file, "r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line.startswith("SET("): - continue + for line in get_abs_path(versions_path).read_text(encoding="utf-8").splitlines(): + line = line.strip() + if not line.startswith("SET("): + continue - value = 0 # type: Union[int, str] - name, value = line[4:-1].split(maxsplit=1) - name = removeprefix(name, "VERSION_").lower() - try: - value = int(value) - except ValueError: - pass - versions[name] = value + value = 0 # type: Union[int, str] + name, value = line[4:-1].split(maxsplit=1) + name = removeprefix(name, "VERSION_").lower() + try: + value = int(value) + except ValueError: + pass + versions[name] = value return versions def get_version_from_repo( - versions_path: str = FILE_WITH_VERSION_PATH, + versions_path: Union[Path, str] = FILE_WITH_VERSION_PATH, git: Optional[Git] = None, ) -> ClickHouseVersion: + """Get a ClickHouseVersion from FILE_WITH_VERSION_PATH. 
When the `git` parameter is + present, a proper `tweak` version part is calculated for case if the latest tag has + a `new` type and greater than version in `FILE_WITH_VERSION_PATH`""" versions = read_versions(versions_path) - return ClickHouseVersion( + cmake_version = ClickHouseVersion( versions["major"], versions["minor"], versions["patch"], versions["revision"], git, ) + # Since 24.5 we have tags like v24.6.1.1-new, and we must check if the release + # branch already has it's own commit. It's necessary for a proper tweak version + if git is not None and git.latest_tag: + version_from_tag = get_version_from_tag(git.latest_tag) + if ( + version_from_tag.description == VersionType.NEW + and cmake_version < version_from_tag + ): + # We are in a new release branch without existing release. + # We should change the tweak version to a `tweak_to_new` + cmake_version.tweak = git.tweak_to_new + return cmake_version def get_version_from_string( @@ -350,15 +370,15 @@ def get_supported_versions( def update_cmake_version( version: ClickHouseVersion, - versions_path: str = FILE_WITH_VERSION_PATH, + versions_path: Union[Path, str] = FILE_WITH_VERSION_PATH, ) -> None: - path_to_file = get_abs_path(versions_path) - with open(path_to_file, "w", encoding="utf-8") as f: - f.write(VERSIONS_TEMPLATE.format_map(version.as_dict())) + get_abs_path(versions_path).write_text( + VERSIONS_TEMPLATE.format_map(version.as_dict()), encoding="utf-8" + ) def update_contributors( - relative_contributors_path: str = GENERATED_CONTRIBUTORS, + relative_contributors_path: Union[Path, str] = GENERATED_CONTRIBUTORS, force: bool = False, raise_error: bool = False, ) -> None: @@ -378,13 +398,11 @@ def update_contributors( ) contributors = [f' "{c}",' for c in contributors] - executer = p.relpath(p.realpath(__file__), git_runner.cwd) + executer = Path(__file__).relative_to(git_runner.cwd) content = CONTRIBUTORS_TEMPLATE.format( executer=executer, contributors="\n".join(contributors) ) - contributors_path = get_abs_path(relative_contributors_path) - with open(contributors_path, "w", encoding="utf-8") as cfd: - cfd.write(content) + get_abs_path(relative_contributors_path).write_text(content, encoding="utf-8") def update_version_local(version, version_type="testing"): diff --git a/tests/integration/helpers/test_tools.py b/tests/integration/helpers/test_tools.py index 2afbae340be..1c8c5c33a13 100644 --- a/tests/integration/helpers/test_tools.py +++ b/tests/integration/helpers/test_tools.py @@ -139,12 +139,18 @@ def assert_logs_contain_with_retry(instance, substring, retry_count=20, sleep_ti def exec_query_with_retry( - instance, query, retry_count=40, sleep_time=0.5, silent=False, settings={} + instance, + query, + retry_count=40, + sleep_time=0.5, + silent=False, + settings={}, + timeout=30, ): exception = None for cnt in range(retry_count): try: - res = instance.query(query, timeout=30, settings=settings) + res = instance.query(query, timeout=timeout, settings=settings) if not silent: logging.debug(f"Result of {query} on {cnt} try is {res}") break diff --git a/tests/integration/test_named_collections/configs/config.d/named_collections_with_zookeeper.xml b/tests/integration/test_named_collections/configs/config.d/named_collections_with_zookeeper.xml new file mode 100644 index 00000000000..2d7946d1587 --- /dev/null +++ b/tests/integration/test_named_collections/configs/config.d/named_collections_with_zookeeper.xml @@ -0,0 +1,12 @@ + + + zookeeper + /named_collections_path/ + 5000 + + + + value1 + + + diff --git 
a/tests/integration/test_named_collections/test.py b/tests/integration/test_named_collections/test.py index cbb8c94c701..dbc502236c0 100644 --- a/tests/integration/test_named_collections/test.py +++ b/tests/integration/test_named_collections/test.py @@ -9,6 +9,8 @@ NAMED_COLLECTIONS_CONFIG = os.path.join( SCRIPT_DIR, "./configs/config.d/named_collections.xml" ) +ZK_PATH = "/named_collections_path" + @pytest.fixture(scope="module") def cluster(): @@ -24,6 +26,28 @@ def cluster(): ], stay_alive=True, ) + cluster.add_instance( + "node_with_keeper", + main_configs=[ + "configs/config.d/named_collections_with_zookeeper.xml", + ], + user_configs=[ + "configs/users.d/users.xml", + ], + stay_alive=True, + with_zookeeper=True, + ) + cluster.add_instance( + "node_with_keeper_2", + main_configs=[ + "configs/config.d/named_collections_with_zookeeper.xml", + ], + user_configs=[ + "configs/users.d/users.xml", + ], + stay_alive=True, + with_zookeeper=True, + ) cluster.add_instance( "node_only_named_collection_control", main_configs=[ @@ -447,8 +471,16 @@ def test_config_reload(cluster): ) -def test_sql_commands(cluster): - node = cluster.instances["node"] +@pytest.mark.parametrize("with_keeper", [False, True]) +def test_sql_commands(cluster, with_keeper): + zk = None + node = None + if with_keeper: + node = cluster.instances["node_with_keeper"] + zk = cluster.get_kazoo_client("zoo1") + else: + node = cluster.instances["node"] + assert "1" == node.query("select count() from system.named_collections").strip() node.query("CREATE NAMED COLLECTION collection2 AS key1=1, key2='value2'") @@ -479,6 +511,14 @@ def test_sql_commands(cluster): "select collection['key2'] from system.named_collections where name = 'collection2'" ).strip() ) + if zk is not None: + children = zk.get_children(ZK_PATH) + assert 1 == len(children) + assert "collection2.sql" in children + assert ( + b"CREATE NAMED COLLECTION collection2 AS key1 = 1, key2 = 'value2'" + in zk.get(ZK_PATH + "/collection2.sql")[0] + ) check_created() node.restart_clickhouse() @@ -508,6 +548,15 @@ def test_sql_commands(cluster): ).strip() ) + if zk is not None: + children = zk.get_children(ZK_PATH) + assert 1 == len(children) + assert "collection2.sql" in children + assert ( + b"CREATE NAMED COLLECTION collection2 AS key1 = 4, key2 = 'value2', key3 = 'value3'" + in zk.get(ZK_PATH + "/collection2.sql")[0] + ) + check_altered() node.restart_clickhouse() check_altered() @@ -522,6 +571,15 @@ def test_sql_commands(cluster): ).strip() ) + if zk is not None: + children = zk.get_children(ZK_PATH) + assert 1 == len(children) + assert "collection2.sql" in children + assert ( + b"CREATE NAMED COLLECTION collection2 AS key1 = 4, key3 = 'value3'" + in zk.get(ZK_PATH + "/collection2.sql")[0] + ) + check_deleted() node.restart_clickhouse() check_deleted() @@ -529,6 +587,7 @@ def test_sql_commands(cluster): node.query( "ALTER NAMED COLLECTION collection2 SET key3=3, key4='value4' DELETE key1" ) + time.sleep(2) def check_altered_and_deleted(): assert ( @@ -552,6 +611,15 @@ def test_sql_commands(cluster): ).strip() ) + if zk is not None: + children = zk.get_children(ZK_PATH) + assert 1 == len(children) + assert "collection2.sql" in children + assert ( + b"CREATE NAMED COLLECTION collection2 AS key3 = 3, key4 = 'value4'" + in zk.get(ZK_PATH + "/collection2.sql")[0] + ) + check_altered_and_deleted() node.restart_clickhouse() check_altered_and_deleted() @@ -564,7 +632,132 @@ def test_sql_commands(cluster): "collection1" == node.query("select name from 
system.named_collections").strip() ) + if zk is not None: + children = zk.get_children(ZK_PATH) + assert 0 == len(children) check_dropped() node.restart_clickhouse() check_dropped() + + +def test_keeper_storage(cluster): + node1 = cluster.instances["node_with_keeper"] + node2 = cluster.instances["node_with_keeper_2"] + zk = cluster.get_kazoo_client("zoo1") + + assert "1" == node1.query("select count() from system.named_collections").strip() + assert "1" == node2.query("select count() from system.named_collections").strip() + + node1.query("CREATE NAMED COLLECTION collection2 AS key1=1, key2='value2'") + + def check_created(node): + assert ( + "collection1\ncollection2" + == node.query("select name from system.named_collections").strip() + ) + + assert ( + "['key1','key2']" + == node.query( + "select mapKeys(collection) from system.named_collections where name = 'collection2'" + ).strip() + ) + + assert ( + "1" + == node.query( + "select collection['key1'] from system.named_collections where name = 'collection2'" + ).strip() + ) + + assert ( + "value2" + == node.query( + "select collection['key2'] from system.named_collections where name = 'collection2'" + ).strip() + ) + + children = zk.get_children(ZK_PATH) + assert 1 == len(children) + assert "collection2.sql" in children + assert ( + b"CREATE NAMED COLLECTION collection2 AS key1 = 1, key2 = 'value2'" + in zk.get(ZK_PATH + "/collection2.sql")[0] + ) + + check_created(node1) + check_created(node2) + + node1.restart_clickhouse() + node2.restart_clickhouse() + + check_created(node1) + check_created(node2) + + node2.query("ALTER NAMED COLLECTION collection2 SET key1=4, key3='value3'") + + time.sleep(5) + + def check_altered(node): + assert ( + "['key1','key2','key3']" + == node.query( + "select mapKeys(collection) from system.named_collections where name = 'collection2'" + ).strip() + ) + + assert ( + "4" + == node.query( + "select collection['key1'] from system.named_collections where name = 'collection2'" + ).strip() + ) + + assert ( + "value3" + == node.query( + "select collection['key3'] from system.named_collections where name = 'collection2'" + ).strip() + ) + + if zk is not None: + children = zk.get_children(ZK_PATH) + assert 1 == len(children) + assert "collection2.sql" in children + assert ( + b"CREATE NAMED COLLECTION collection2 AS key1 = 4, key2 = 'value2', key3 = 'value3'" + in zk.get(ZK_PATH + "/collection2.sql")[0] + ) + + check_altered(node2) + check_altered(node1) + + node1.restart_clickhouse() + node2.restart_clickhouse() + + check_altered(node1) + check_altered(node2) + + node1.query("DROP NAMED COLLECTION collection2") + + time.sleep(5) + + def check_dropped(node): + assert "1" == node.query("select count() from system.named_collections").strip() + assert ( + "collection1" + == node.query("select name from system.named_collections").strip() + ) + if zk is not None: + children = zk.get_children(ZK_PATH) + assert 0 == len(children) + + check_dropped(node1) + check_dropped(node2) + + node1.restart_clickhouse() + node2.restart_clickhouse() + + check_dropped(node1) + check_dropped(node2) diff --git a/tests/integration/test_replicated_database/configs/config2.xml b/tests/integration/test_replicated_database/configs/config2.xml new file mode 100644 index 00000000000..727461697ca --- /dev/null +++ b/tests/integration/test_replicated_database/configs/config2.xml @@ -0,0 +1,10 @@ + + 10 + 1 + + 10 + + 50 + 42 + group + diff --git a/tests/integration/test_replicated_database/test.py 
b/tests/integration/test_replicated_database/test.py index fd1bfc75227..ea569939c1c 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -61,7 +61,7 @@ all_nodes = [ bad_settings_node = cluster.add_instance( "bad_settings_node", - main_configs=["configs/config.xml"], + main_configs=["configs/config2.xml"], user_configs=["configs/inconsistent_settings.xml"], with_zookeeper=True, macros={"shard": 1, "replica": 4}, @@ -1522,3 +1522,24 @@ def test_auto_recovery(started_cluster): assert "42\n" == bad_settings_node.query("SELECT * FROM auto_recovery.t2") assert "137\n" == bad_settings_node.query("SELECT * FROM auto_recovery.t1") + + +def test_all_groups_cluster(started_cluster): + dummy_node.query("DROP DATABASE IF EXISTS db_cluster") + bad_settings_node.query("DROP DATABASE IF EXISTS db_cluster") + dummy_node.query( + "CREATE DATABASE db_cluster ENGINE = Replicated('/clickhouse/databases/all_groups_cluster', 'shard1', 'replica1');" + ) + bad_settings_node.query( + "CREATE DATABASE db_cluster ENGINE = Replicated('/clickhouse/databases/all_groups_cluster', 'shard1', 'replica2');" + ) + + assert "dummy_node\n" == dummy_node.query( + "select host_name from system.clusters where name='db_cluster' order by host_name" + ) + assert "bad_settings_node\n" == bad_settings_node.query( + "select host_name from system.clusters where name='db_cluster' order by host_name" + ) + assert "bad_settings_node\ndummy_node\n" == bad_settings_node.query( + "select host_name from system.clusters where name='all_groups.db_cluster' order by host_name" + ) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index f836c58ce30..d986c1f9746 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -790,6 +790,25 @@ def test_read_subcolumns(cluster): assert res == "42\tcont/test_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n" +def test_read_subcolumn_time(cluster): + node = cluster.instances["node"] + storage_account_url = cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"] + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_subcolumn_time.tsv', " + f"'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto'," + f" 'a UInt32') select (42)", + ) + + res = node.query( + f"select a, dateDiff('minute', _time, now()) < 59 from azureBlobStorage('{storage_account_url}', 'cont', 'test_subcolumn_time.tsv'," + f" 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto'," + f" 'a UInt32')" + ) + + assert res == "42\t1\n" + + def test_read_from_not_existing_container(cluster): node = cluster.instances["node"] query = ( diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 44c0223e677..47d8f44c0b7 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -999,6 +999,20 @@ def test_read_subcolumns(started_cluster): assert res == "42\ttest_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n" +def test_read_subcolumn_time(started_cluster): + node = started_cluster.instances["node1"] + + node.query( + f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumn_time.tsv', auto, 'a UInt32') select (42)" + ) 
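The query on the next lines (and its S3/Azure counterparts elsewhere in this patch) exercises the new `_time` virtual column by asserting that the just-written file's modification time is well under an hour old. Schematically, each of these tests follows the same pattern; `node` and the table-function SQL below are placeholders, not code from the patch.

```python
def check_time_virtual_column(node, table_function_sql: str) -> None:
    # The test wrote the file moments earlier, so its _time
    # (last modification time) must be less than 59 minutes old.
    res = node.query(
        f"select a, dateDiff('minute', _time, now()) < 59 from {table_function_sql}"
    )
    assert res == "42\t1\n"
```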
+ + res = node.query( + f"select a, dateDiff('minute', _time, now()) < 59 from hdfs('hdfs://hdfs1:9000/test_subcolumn_time.tsv', auto, 'a UInt32')" + ) + + assert res == "42\t1\n" + + def test_union_schema_inference_mode(started_cluster): node = started_cluster.instances["node1"] diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 09b27fff1e8..b2ebd12ce00 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -1133,6 +1133,7 @@ def test_seekable_formats(started_cluster): exec_query_with_retry( instance, f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(1000000) settings s3_truncate_on_insert=1", + timeout=100, ) result = instance.query(f"SELECT count() FROM {table_function}") @@ -1142,6 +1143,7 @@ def test_seekable_formats(started_cluster): exec_query_with_retry( instance, f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(1500000) settings s3_truncate_on_insert=1", + timeout=100, ) result = instance.query( @@ -1169,6 +1171,7 @@ def test_seekable_formats_url(started_cluster): exec_query_with_retry( instance, f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(1500000) settings s3_truncate_on_insert=1", + timeout=100, ) result = instance.query(f"SELECT count() FROM {table_function}") @@ -1178,6 +1181,7 @@ def test_seekable_formats_url(started_cluster): exec_query_with_retry( instance, f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(1500000) settings s3_truncate_on_insert=1", + timeout=100, ) table_function = f"url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_parquet', 'Parquet', 'a Int32, b String')" @@ -2149,6 +2153,21 @@ def test_read_subcolumns(started_cluster): ) +def test_read_subcolumn_time(started_cluster): + bucket = started_cluster.minio_bucket + instance = started_cluster.instances["dummy"] + + instance.query( + f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumn_time.tsv', auto, 'a UInt32') select (42)" + ) + + res = instance.query( + f"select a, dateDiff('minute', _time, now()) < 59 from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumn_time.tsv', auto, 'a UInt32')" + ) + + assert res == "42\t1\n" + + def test_filtering_by_file_or_path(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] diff --git a/tests/performance/least_greatest.xml b/tests/performance/least_greatest.xml new file mode 100644 index 00000000000..522bcd9004d --- /dev/null +++ b/tests/performance/least_greatest.xml @@ -0,0 +1,10 @@ + + CREATE TABLE test (id Int32, x1 Nullable(Int32), x2 Nullable(Float32)) ENGINE = MergeTree() ORDER BY id + + INSERT INTO test SELECT number, number+1, number + 2 FROM numbers(1000000) + + SELECT COUNT(1) FROM test WHERE least(x1, x2) > 1 + SELECT COUNT(1) FROM test WHERE GREATEST(x1, x2) > 1 + + DROP TABLE IF EXISTS test + \ No newline at end of file diff --git a/tests/queries/0_stateless/00636_partition_key_parts_pruning.sh b/tests/queries/0_stateless/00636_partition_key_parts_pruning.sh index 7ec4d99f028..f2b4cae8bb0 100755 --- a/tests/queries/0_stateless/00636_partition_key_parts_pruning.sh +++ b/tests/queries/0_stateless/00636_partition_key_parts_pruning.sh @@ -11,9 +11,11 @@ ${CLICKHOUSE_CLIENT} --query="CREATE TABLE 
single_col_partition_key(x UInt32) EN ${CLICKHOUSE_CLIENT} --query="INSERT INTO single_col_partition_key VALUES (1), (2), (3), (4), (11), (12), (20)" -${CLICKHOUSE_CLIENT} --query="SELECT count() FROM single_col_partition_key WHERE x < 3 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g' -${CLICKHOUSE_CLIENT} --query="SELECT count() FROM single_col_partition_key WHERE x >= 11 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g' -${CLICKHOUSE_CLIENT} --query="SELECT count() FROM single_col_partition_key WHERE x = 20 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g' +DISABLE_COUNT_OPTIMIZATION="SETTINGS optimize_trivial_count_query = 0, optimize_use_implicit_projections = 0" + +${CLICKHOUSE_CLIENT} --query="SELECT count() FROM single_col_partition_key WHERE x < 3 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g' +${CLICKHOUSE_CLIENT} --query="SELECT count() FROM single_col_partition_key WHERE x >= 11 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g' +${CLICKHOUSE_CLIENT} --query="SELECT count() FROM single_col_partition_key WHERE x = 20 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g' ${CLICKHOUSE_CLIENT} --query="DROP TABLE single_col_partition_key" @@ -31,14 +33,14 @@ ${CLICKHOUSE_CLIENT} --query="INSERT INTO composite_partition_key VALUES \ ${CLICKHOUSE_CLIENT} --query="INSERT INTO composite_partition_key VALUES \ (301, 20, 3), (302, 21, 3), (303, 22, 3)" -${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a > 400 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g' -${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE b = 11 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g' -${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE c = 4 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g' +${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a > 400 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g' +${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE b = 11 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g' +${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE c = 4 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g' -${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a < 200 AND c = 2 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g' -${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a = 301 AND b < 20 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g' -${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE b >= 12 AND c = 2 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g' +${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a < 200 AND c = 2 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g' +${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a = 301 AND b < 20 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g' +${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key 
WHERE b >= 12 AND c = 2 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g' -${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a = 301 AND b = 21 AND c = 3 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g' +${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a = 301 AND b = 21 AND c = 3 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g' ${CLICKHOUSE_CLIENT} --query="DROP TABLE composite_partition_key" diff --git a/tests/queries/0_stateless/00653_verification_monotonic_data_load.sh b/tests/queries/0_stateless/00653_verification_monotonic_data_load.sh index e10b2f86145..7766d7720e1 100755 --- a/tests/queries/0_stateless/00653_verification_monotonic_data_load.sh +++ b/tests/queries/0_stateless/00653_verification_monotonic_data_load.sh @@ -45,6 +45,7 @@ ${CLICKHOUSE_CLIENT} --query="INSERT INTO enum_test_table VALUES ('hello'), ('wo ${CLICKHOUSE_CLIENT} --query="INSERT INTO date_test_table VALUES (1), (2), (2), (256), (257), (257);" CLICKHOUSE_CLIENT=$(echo ${CLICKHOUSE_CLIENT} | sed 's/'"--send_logs_level=${CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL}"'/--send_logs_level=debug/g') +CLICKHOUSE_CLIENT="${CLICKHOUSE_CLIENT} --optimize_use_implicit_projections 0" ${CLICKHOUSE_CLIENT} --query="SELECT count() FROM string_test_table WHERE toUInt64(val) == 0;" 2>&1 |grep -q "3 marks to read from 1 ranges" && echo "no monotonic int case: String -> UInt64" ${CLICKHOUSE_CLIENT} --query="SELECT count() FROM fixed_string_test_table WHERE toUInt64(val) == 0;" 2>&1 |grep -q "3 marks to read from 1 ranges" && echo "no monotonic int case: FixedString -> UInt64" diff --git a/tests/queries/0_stateless/01710_projection_pk_trivial_count.reference b/tests/queries/0_stateless/01710_projection_pk_trivial_count.reference new file mode 100644 index 00000000000..43316772467 --- /dev/null +++ b/tests/queries/0_stateless/01710_projection_pk_trivial_count.reference @@ -0,0 +1,3 @@ + ReadFromMergeTree (default.x) + ReadFromPreparedSource (Optimized trivial count) +5 diff --git a/tests/queries/0_stateless/01710_projection_pk_trivial_count.sql b/tests/queries/0_stateless/01710_projection_pk_trivial_count.sql new file mode 100644 index 00000000000..ce9eadf06b2 --- /dev/null +++ b/tests/queries/0_stateless/01710_projection_pk_trivial_count.sql @@ -0,0 +1,8 @@ +drop table if exists x; + +create table x (i int) engine MergeTree order by i settings index_granularity = 3; +insert into x select * from numbers(10); +select * from (explain select count() from x where (i >= 3 and i <= 6) or i = 7) where explain like '%ReadFromPreparedSource%' or explain like '%ReadFromMergeTree%'; +select count() from x where (i >= 3 and i <= 6) or i = 7; + +drop table x; diff --git a/tests/queries/0_stateless/01822_union_and_constans_error.reference b/tests/queries/0_stateless/01822_union_and_constans_error.reference index d00491fd7e5..e69de29bb2d 100644 --- a/tests/queries/0_stateless/01822_union_and_constans_error.reference +++ b/tests/queries/0_stateless/01822_union_and_constans_error.reference @@ -1 +0,0 @@ -1 diff --git a/tests/queries/0_stateless/01822_union_and_constans_error.sql b/tests/queries/0_stateless/01822_union_and_constans_error.sql index 38b7df700cd..9017e8769eb 100644 --- a/tests/queries/0_stateless/01822_union_and_constans_error.sql +++ b/tests/queries/0_stateless/01822_union_and_constans_error.sql @@ -15,6 +15,6 @@ SELECT isNull(t0.c0) OR COUNT('\n?pVa') FROM t0 GROUP BY t0.c0 
HAVING isNull(isNull(t0.c0)) -SETTINGS aggregate_functions_null_for_empty = 1, enable_optimize_predicate_expression = 0; +SETTINGS aggregate_functions_null_for_empty = 1, enable_optimize_predicate_expression = 0 format Null; drop table if exists t0; diff --git a/tests/queries/0_stateless/02156_storage_merge_prewhere.reference b/tests/queries/0_stateless/02156_storage_merge_prewhere.reference index 86a36a9392c..8a18c609ede 100644 --- a/tests/queries/0_stateless/02156_storage_merge_prewhere.reference +++ b/tests/queries/0_stateless/02156_storage_merge_prewhere.reference @@ -1,6 +1,3 @@ - Prewhere info - Prewhere filter - Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) Prewhere info Prewhere filter Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) @@ -8,8 +5,15 @@ Prewhere filter Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) 2 - Filter column: and(equals(k, 3), notEmpty(v)) (removed) + Filter column: and(equals(k, 3), notEmpty(v)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) 2 - Filter column: and(equals(k, 3), notEmpty(v)) (removed) - Filter column: and(equals(k, 3), notEmpty(v)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) 2 diff --git a/tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.reference b/tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.reference new file mode 100644 index 00000000000..20c58c33770 --- /dev/null +++ b/tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.reference @@ -0,0 +1 @@ +59900 1000 1396 diff --git a/tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.sql b/tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.sql new file mode 100644 index 00000000000..fc18c97cb6e --- /dev/null +++ b/tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.sql @@ -0,0 +1,7 @@ +create table merge_kek_1 (x UInt32, y UInt32) engine = MergeTree order by x; +create table merge_kek_2 (x UInt32, y UInt32) engine = MergeTree order by x; + +insert into merge_kek_1 select number, number from numbers(100); +insert into merge_kek_2 select number + 500, number + 500 from numbers(1e6); + +select sum(x), min(x + x), max(x + x) from merge(currentDatabase(), '^merge_kek_.$') where x > 200 and y in (select 500 + number * 2 from numbers(100)) settings max_threads=2; diff --git a/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh b/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh index caa600298ce..b100f96befa 100755 --- a/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh +++ b/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh @@ -24,6 +24,7 @@ function check_refcnt_for_table() local log_file log_file=$(mktemp "$CUR_DIR/clickhouse-tests.XXXXXX.log") local args=( + --allow_repeated_settings --format Null --max_threads 1 --max_block_size 1 diff --git a/tests/queries/0_stateless/02477_logical_expressions_optimizer_low_cardinality.reference b/tests/queries/0_stateless/02477_logical_expressions_optimizer_low_cardinality.reference index 649b037fafa..11c178ac0d0 100644 --- a/tests/queries/0_stateless/02477_logical_expressions_optimizer_low_cardinality.reference +++ b/tests/queries/0_stateless/02477_logical_expressions_optimizer_low_cardinality.reference @@ -10,11 +10,15 
@@ QUERY id: 0 JOIN TREE TABLE id: 3, alias: __table1, table_name: default.t_logical_expressions_optimizer_low_cardinality WHERE - FUNCTION id: 4, function_name: in, function_type: ordinary, result_type: UInt8 + FUNCTION id: 4, function_name: or, function_type: ordinary, result_type: UInt8 ARGUMENTS LIST id: 5, nodes: 2 - COLUMN id: 2, column_name: a, result_type: LowCardinality(String), source_id: 3 - CONSTANT id: 6, constant_value: Tuple_(\'x\', \'y\'), constant_value_type: Tuple(String, String) + FUNCTION id: 6, function_name: in, function_type: ordinary, result_type: LowCardinality(UInt8) + ARGUMENTS + LIST id: 7, nodes: 2 + COLUMN id: 2, column_name: a, result_type: LowCardinality(String), source_id: 3 + CONSTANT id: 8, constant_value: Tuple_(\'x\', \'y\'), constant_value_type: Tuple(String, String) + CONSTANT id: 9, constant_value: UInt64_0, constant_value_type: UInt8 SETTINGS allow_experimental_analyzer=1 SELECT a FROM t_logical_expressions_optimizer_low_cardinality @@ -28,11 +32,15 @@ QUERY id: 0 JOIN TREE TABLE id: 3, alias: __table1, table_name: default.t_logical_expressions_optimizer_low_cardinality WHERE - FUNCTION id: 4, function_name: in, function_type: ordinary, result_type: UInt8 + FUNCTION id: 4, function_name: or, function_type: ordinary, result_type: UInt8 ARGUMENTS LIST id: 5, nodes: 2 - COLUMN id: 2, column_name: a, result_type: LowCardinality(String), source_id: 3 - CONSTANT id: 6, constant_value: Tuple_(\'x\', \'y\'), constant_value_type: Tuple(String, String) + FUNCTION id: 6, function_name: in, function_type: ordinary, result_type: LowCardinality(UInt8) + ARGUMENTS + LIST id: 7, nodes: 2 + COLUMN id: 2, column_name: a, result_type: LowCardinality(String), source_id: 3 + CONSTANT id: 8, constant_value: Tuple_(\'x\', \'y\'), constant_value_type: Tuple(String, String) + CONSTANT id: 9, constant_value: UInt64_0, constant_value_type: UInt8 SETTINGS allow_experimental_analyzer=1 SELECT a FROM t_logical_expressions_optimizer_low_cardinality @@ -46,11 +54,15 @@ QUERY id: 0 JOIN TREE TABLE id: 3, alias: __table1, table_name: default.t_logical_expressions_optimizer_low_cardinality WHERE - FUNCTION id: 4, function_name: notIn, function_type: ordinary, result_type: UInt8 + FUNCTION id: 4, function_name: _CAST, function_type: ordinary, result_type: UInt8 ARGUMENTS LIST id: 5, nodes: 2 - COLUMN id: 2, column_name: a, result_type: LowCardinality(String), source_id: 3 - CONSTANT id: 6, constant_value: Tuple_(\'x\', \'y\'), constant_value_type: Tuple(String, String) + FUNCTION id: 6, function_name: notIn, function_type: ordinary, result_type: LowCardinality(UInt8) + ARGUMENTS + LIST id: 7, nodes: 2 + COLUMN id: 2, column_name: a, result_type: LowCardinality(String), source_id: 3 + CONSTANT id: 8, constant_value: Tuple_(\'x\', \'y\'), constant_value_type: Tuple(String, String) + CONSTANT id: 9, constant_value: \'UInt8\', constant_value_type: String SETTINGS allow_experimental_analyzer=1 SELECT a FROM t_logical_expressions_optimizer_low_cardinality @@ -64,11 +76,15 @@ QUERY id: 0 JOIN TREE TABLE id: 3, alias: __table1, table_name: default.t_logical_expressions_optimizer_low_cardinality WHERE - FUNCTION id: 4, function_name: notIn, function_type: ordinary, result_type: UInt8 + FUNCTION id: 4, function_name: _CAST, function_type: ordinary, result_type: UInt8 ARGUMENTS LIST id: 5, nodes: 2 - COLUMN id: 2, column_name: a, result_type: LowCardinality(String), source_id: 3 - CONSTANT id: 6, constant_value: Tuple_(\'x\', \'y\'), constant_value_type: Tuple(String, String) + FUNCTION id: 6, 
function_name: notIn, function_type: ordinary, result_type: LowCardinality(UInt8) + ARGUMENTS + LIST id: 7, nodes: 2 + COLUMN id: 2, column_name: a, result_type: LowCardinality(String), source_id: 3 + CONSTANT id: 8, constant_value: Tuple_(\'x\', \'y\'), constant_value_type: Tuple(String, String) + CONSTANT id: 9, constant_value: \'UInt8\', constant_value_type: String SETTINGS allow_experimental_analyzer=1 SELECT a FROM t_logical_expressions_optimizer_low_cardinality diff --git a/tests/queries/0_stateless/02488_zero_copy_detached_parts_drop_table.sh b/tests/queries/0_stateless/02488_zero_copy_detached_parts_drop_table.sh index b01f16e1cad..60cec5caea3 100755 --- a/tests/queries/0_stateless/02488_zero_copy_detached_parts_drop_table.sh +++ b/tests/queries/0_stateless/02488_zero_copy_detached_parts_drop_table.sh @@ -19,7 +19,7 @@ $CLICKHOUSE_CLIENT -q "select throwIf(substring('$path', 1, 1) != '/', 'Path is rm -f $path/count.txt $CLICKHOUSE_CLIENT -q "detach table rmt2 sync" -$CLICKHOUSE_CLIENT --send_logs_level='fatal' -q "attach table rmt2" +$CLICKHOUSE_CLIENT --allow_repeated_settings --send_logs_level='fatal' -q "attach table rmt2" $CLICKHOUSE_CLIENT -q "select reason, name from system.detached_parts where database='$CLICKHOUSE_DATABASE' and table='rmt2'" diff --git a/tests/queries/0_stateless/02494_query_cache_nested_query_bug.sh b/tests/queries/0_stateless/02494_query_cache_nested_query_bug.sh index 6bc3d03ac66..24803ed7420 100755 --- a/tests/queries/0_stateless/02494_query_cache_nested_query_bug.sh +++ b/tests/queries/0_stateless/02494_query_cache_nested_query_bug.sh @@ -20,12 +20,12 @@ SETTINGS_ANALYZER="SETTINGS use_query_cache=1, max_threads=1, allow_experimental # Verify that the first query does two aggregations and the second query zero aggregations. Since query cache is currently not integrated # with EXPLAIN PLAN, we need to check the logs. -${CLICKHOUSE_CLIENT} --send_logs_level=trace --query "SELECT count(a) / (SELECT sum(a) FROM tab) FROM tab $SETTINGS_NO_ANALYZER" 2>&1 | grep "Aggregated. " | wc -l -${CLICKHOUSE_CLIENT} --send_logs_level=trace --query "SELECT count(a) / (SELECT sum(a) FROM tab) FROM tab $SETTINGS_NO_ANALYZER" 2>&1 | grep "Aggregated. " | wc -l +${CLICKHOUSE_CLIENT} --allow_repeated_settings --send_logs_level=trace --query "SELECT count(a) / (SELECT sum(a) FROM tab) FROM tab $SETTINGS_NO_ANALYZER" 2>&1 | grep "Aggregated. " | wc -l +${CLICKHOUSE_CLIENT} --allow_repeated_settings --send_logs_level=trace --query "SELECT count(a) / (SELECT sum(a) FROM tab) FROM tab $SETTINGS_NO_ANALYZER" 2>&1 | grep "Aggregated. " | wc -l ${CLICKHOUSE_CLIENT} --query "SYSTEM DROP QUERY CACHE" -${CLICKHOUSE_CLIENT} --send_logs_level=trace --query "SELECT count(a) / (SELECT sum(a) FROM tab) FROM tab $SETTINGS_ANALYZER" 2>&1 | grep "Aggregated. " | wc -l -${CLICKHOUSE_CLIENT} --send_logs_level=trace --query "SELECT count(a) / (SELECT sum(a) FROM tab) FROM tab $SETTINGS_ANALYZER" 2>&1 | grep "Aggregated. " | wc -l +${CLICKHOUSE_CLIENT} --allow_repeated_settings --send_logs_level=trace --query "SELECT count(a) / (SELECT sum(a) FROM tab) FROM tab $SETTINGS_ANALYZER" 2>&1 | grep "Aggregated. " | wc -l +${CLICKHOUSE_CLIENT} --allow_repeated_settings --send_logs_level=trace --query "SELECT count(a) / (SELECT sum(a) FROM tab) FROM tab $SETTINGS_ANALYZER" 2>&1 | grep "Aggregated. 
" | wc -l ${CLICKHOUSE_CLIENT} --query "SYSTEM DROP QUERY CACHE" diff --git a/tests/queries/0_stateless/02530_dictionaries_update_field.reference b/tests/queries/0_stateless/02530_dictionaries_update_field.reference index 88c910e0313..4d5a7447a49 100644 --- a/tests/queries/0_stateless/02530_dictionaries_update_field.reference +++ b/tests/queries/0_stateless/02530_dictionaries_update_field.reference @@ -10,7 +10,7 @@ SELECT key, value FROM dict_flat ORDER BY key ASC; 2 Second INSERT INTO table_for_update_field_dictionary VALUES (2, 'SecondUpdated', now()); INSERT INTO table_for_update_field_dictionary VALUES (3, 'Third', now()); -SELECT sleepEachRow(1) FROM numbers(10) SETTINGS function_sleep_max_microseconds_per_block = 10000000 FORMAT Null; +SELECT sleepEachRow(1) FROM numbers(20) SETTINGS function_sleep_max_microseconds_per_block = 20000000 FORMAT Null; SELECT key, value FROM dict_flat ORDER BY key ASC; 1 First 2 SecondUpdated @@ -27,7 +27,7 @@ SELECT key, value FROM dict_flat_custom ORDER BY key ASC; 2 Second INSERT INTO table_for_update_field_dictionary VALUES (2, 'SecondUpdated', now()); INSERT INTO table_for_update_field_dictionary VALUES (3, 'Third', now()); -SELECT sleepEachRow(1) FROM numbers(10) SETTINGS function_sleep_max_microseconds_per_block = 10000000 FORMAT Null; +SELECT sleepEachRow(1) FROM numbers(20) SETTINGS function_sleep_max_microseconds_per_block = 20000000 FORMAT Null; SELECT key, value FROM dict_flat_custom ORDER BY key ASC; 1 First 2 SecondUpdated @@ -44,7 +44,7 @@ SELECT key, value FROM dict_hashed ORDER BY key ASC; 2 Second INSERT INTO table_for_update_field_dictionary VALUES (2, 'SecondUpdated', now()); INSERT INTO table_for_update_field_dictionary VALUES (3, 'Third', now()); -SELECT sleepEachRow(1) FROM numbers(10) SETTINGS function_sleep_max_microseconds_per_block = 10000000 FORMAT Null; +SELECT sleepEachRow(1) FROM numbers(20) SETTINGS function_sleep_max_microseconds_per_block = 20000000 FORMAT Null; SELECT key, value FROM dict_hashed ORDER BY key ASC; 1 First 2 SecondUpdated @@ -61,7 +61,7 @@ SELECT key, value FROM dict_hashed_custom ORDER BY key ASC; 2 Second INSERT INTO table_for_update_field_dictionary VALUES (2, 'SecondUpdated', now()); INSERT INTO table_for_update_field_dictionary VALUES (3, 'Third', now()); -SELECT sleepEachRow(1) FROM numbers(10) SETTINGS function_sleep_max_microseconds_per_block = 10000000 FORMAT Null; +SELECT sleepEachRow(1) FROM numbers(20) SETTINGS function_sleep_max_microseconds_per_block = 20000000 FORMAT Null; SELECT key, value FROM dict_hashed_custom ORDER BY key ASC; 1 First 2 SecondUpdated @@ -78,7 +78,7 @@ SELECT key, value FROM dict_complex_key_hashed ORDER BY key ASC; 2 Second INSERT INTO table_for_update_field_dictionary VALUES (2, 'SecondUpdated', now()); INSERT INTO table_for_update_field_dictionary VALUES (3, 'Third', now()); -SELECT sleepEachRow(1) FROM numbers(10) SETTINGS function_sleep_max_microseconds_per_block = 10000000 FORMAT Null; +SELECT sleepEachRow(1) FROM numbers(20) SETTINGS function_sleep_max_microseconds_per_block = 20000000 FORMAT Null; SELECT key, value FROM dict_complex_key_hashed ORDER BY key ASC; 1 First 2 SecondUpdated @@ -95,7 +95,7 @@ SELECT key, value FROM dict_complex_key_hashed_custom ORDER BY key ASC; 2 Second INSERT INTO table_for_update_field_dictionary VALUES (2, 'SecondUpdated', now()); INSERT INTO table_for_update_field_dictionary VALUES (3, 'Third', now()); -SELECT sleepEachRow(1) FROM numbers(10) SETTINGS function_sleep_max_microseconds_per_block = 10000000 FORMAT Null; +SELECT 
sleepEachRow(1) FROM numbers(20) SETTINGS function_sleep_max_microseconds_per_block = 20000000 FORMAT Null; SELECT key, value FROM dict_complex_key_hashed_custom ORDER BY key ASC; 1 First 2 SecondUpdated diff --git a/tests/queries/0_stateless/02530_dictionaries_update_field.sh b/tests/queries/0_stateless/02530_dictionaries_update_field.sh index 35881bdf896..9ced78a1196 100755 --- a/tests/queries/0_stateless/02530_dictionaries_update_field.sh +++ b/tests/queries/0_stateless/02530_dictionaries_update_field.sh @@ -60,7 +60,7 @@ for layout in "${layouts[@]}"; do INSERT INTO table_for_update_field_dictionary VALUES (2, 'SecondUpdated', now()); INSERT INTO table_for_update_field_dictionary VALUES (3, 'Third', now()); - SELECT sleepEachRow(1) FROM numbers(10) SETTINGS function_sleep_max_microseconds_per_block = 10000000 FORMAT Null; + SELECT sleepEachRow(1) FROM numbers(20) SETTINGS function_sleep_max_microseconds_per_block = 20000000 FORMAT Null; SELECT key, value FROM $dictionary_name ORDER BY key ASC; -- { echoOff } diff --git a/tests/queries/0_stateless/02535_max_parallel_replicas_custom_key.sh b/tests/queries/0_stateless/02535_max_parallel_replicas_custom_key.sh index 63644a51f8f..dccb680be42 100755 --- a/tests/queries/0_stateless/02535_max_parallel_replicas_custom_key.sh +++ b/tests/queries/0_stateless/02535_max_parallel_replicas_custom_key.sh @@ -41,6 +41,6 @@ run_count_with_custom_key "y" run_count_with_custom_key "cityHash64(y)" run_count_with_custom_key "cityHash64(y) + 1" -$CLICKHOUSE_CLIENT --query="SELECT count() FROM cluster(test_cluster_one_shard_three_replicas_localhost, currentDatabase(), 02535_custom_key) as t1 JOIN 02535_custom_key USING y" --parallel_replicas_custom_key="y" --send_logs_level="trace" 2>&1 | grep -Fac "JOINs are not supported with" +$CLICKHOUSE_CLIENT --query="SELECT count() FROM cluster(test_cluster_one_shard_three_replicas_localhost, currentDatabase(), 02535_custom_key) as t1 JOIN 02535_custom_key USING y" --allow_repeated_settings --parallel_replicas_custom_key="y" --send_logs_level="trace" 2>&1 | grep -Fac "JOINs are not supported with" $CLICKHOUSE_CLIENT --query="DROP TABLE 02535_custom_key" diff --git a/tests/queries/0_stateless/02892_orc_filter_pushdown.reference b/tests/queries/0_stateless/02892_orc_filter_pushdown.reference index e6c2e9b2b57..903d42bf492 100644 --- a/tests/queries/0_stateless/02892_orc_filter_pushdown.reference +++ b/tests/queries/0_stateless/02892_orc_filter_pushdown.reference @@ -206,7 +206,7 @@ select count(), sum(number) from file('02892.orc', ORC, 'number UInt64, negative select count(), min(negative_or_null), max(negative_or_null) from file('02892.orc', ORC, 'number UInt64, negative_or_null Int64') where (negative_or_null < -500); 596 -1099 -501 select count(), sum(number) from file('02892.orc', ORC, 'number UInt64, negative_or_null Int64') where indexHint(negative_or_null is null); -1000 499500 +0 0 select count(), min(negative_or_null), max(negative_or_null) from file('02892.orc', ORC, 'number UInt64, negative_or_null Int64') where (negative_or_null is null); 0 0 0 select count(), sum(number) from file('02892.orc', ORC, 'number UInt64, negative_or_null Int64') where indexHint(negative_or_null in (0, -1, -10, -100, -1000)); diff --git a/tests/queries/0_stateless/02896_cyclic_aliases_crash.reference b/tests/queries/0_stateless/02896_cyclic_aliases_crash.reference index caf11f5c15a..e537236478d 100644 --- a/tests/queries/0_stateless/02896_cyclic_aliases_crash.reference +++ 
b/tests/queries/0_stateless/02896_cyclic_aliases_crash.reference @@ -1,2 +1,3 @@ 1 2 3 1 5 +300 diff --git a/tests/queries/0_stateless/02896_cyclic_aliases_crash.sql b/tests/queries/0_stateless/02896_cyclic_aliases_crash.sql index 5fb628eeb67..5440872e052 100644 --- a/tests/queries/0_stateless/02896_cyclic_aliases_crash.sql +++ b/tests/queries/0_stateless/02896_cyclic_aliases_crash.sql @@ -30,3 +30,7 @@ WHERE (time_stamp_utc >= toDateTime('2024-04-25 00:00:00')) AND (time_stamp_utc GROUP BY time_stamp_utc ORDER BY Impressions DESC LIMIT 1000; + +drop table test_table; +create table test_table engine MergeTree order by sum as select 100 as sum union all select 200 as sum; +select sum as sum from (select sum(sum) as sum from test_table); diff --git a/tests/queries/0_stateless/02908_many_requests_to_system_replicas.sh b/tests/queries/0_stateless/02908_many_requests_to_system_replicas.sh index 144831a2cdc..a247c99a818 100755 --- a/tests/queries/0_stateless/02908_many_requests_to_system_replicas.sh +++ b/tests/queries/0_stateless/02908_many_requests_to_system_replicas.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: long, zookeeper, no-parallel, no-fasttest +# Tags: long, zookeeper, no-parallel, no-fasttest, no-asan CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/02918_optimize_count_for_merge_tables.reference b/tests/queries/0_stateless/02918_optimize_count_for_merge_tables.reference index 786a6b3bf25..7278018f1d6 100644 --- a/tests/queries/0_stateless/02918_optimize_count_for_merge_tables.reference +++ b/tests/queries/0_stateless/02918_optimize_count_for_merge_tables.reference @@ -7,6 +7,9 @@ Expression ((Projection + Before ORDER BY)) Aggregating Expression (Before GROUP BY) ReadFromMerge - ReadFromMergeTree (default.mt1) - ReadFromMergeTree (default.mt2) - ReadFromStorage (TinyLog) + Expression + ReadFromMergeTree (default.mt1) + Expression + ReadFromMergeTree (default.mt2) + Expression + ReadFromStorage (TinyLog) diff --git a/tests/queries/0_stateless/02922_deduplication_with_zero_copy.sh b/tests/queries/0_stateless/02922_deduplication_with_zero_copy.sh index bb013dccb65..dad4e6747e0 100755 --- a/tests/queries/0_stateless/02922_deduplication_with_zero_copy.sh +++ b/tests/queries/0_stateless/02922_deduplication_with_zero_copy.sh @@ -58,9 +58,9 @@ function filter_temporary_locks() function insert_duplicates() { - $CLICKHOUSE_CLIENT -q "insert into r1 values(1);" --send_logs_level="error" & + $CLICKHOUSE_CLIENT -q "insert into r1 values(1);" --allow_repeated_settings --send_logs_level="error" & - $CLICKHOUSE_CLIENT -q "insert into r2 values(1);" --send_logs_level="error" + $CLICKHOUSE_CLIENT -q "insert into r2 values(1);" --allow_repeated_settings --send_logs_level="error" wait @@ -137,8 +137,8 @@ function list_keeper_nodes() { list_keeper_nodes "${table_shared_id}" -$CLICKHOUSE_CLIENT -nm -q "drop table r1;" --send_logs_level="error" & -$CLICKHOUSE_CLIENT -nm -q "drop table r2;" --send_logs_level="error" & +$CLICKHOUSE_CLIENT -nm -q "drop table r1;" --allow_repeated_settings --send_logs_level="error" & +$CLICKHOUSE_CLIENT -nm -q "drop table r2;" --allow_repeated_settings --send_logs_level="error" & wait list_keeper_nodes "${table_shared_id}" diff --git a/tests/queries/0_stateless/02941_variant_type_1.sh b/tests/queries/0_stateless/02941_variant_type_1.sh index 22ca909a26e..723de45eaad 100755 --- a/tests/queries/0_stateless/02941_variant_type_1.sh +++ b/tests/queries/0_stateless/02941_variant_type_1.sh @@ -7,7 
+7,7 @@ CLICKHOUSE_LOG_COMMENT= # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --allow_suspicious_variant_types=1 --index_granularity_bytes=10485760 --index_granularity=8192" +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --allow_suspicious_variant_types=1" function test1_insert() { @@ -115,11 +115,11 @@ run 0 $CH_CLIENT -q "drop table test;" echo "MergeTree compact" -$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000, index_granularity_bytes=10485760, index_granularity=8192;" run 1 $CH_CLIENT -q "drop table test;" echo "MergeTree wide" -$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, index_granularity_bytes=10485760, index_granularity=8192;" run 1 $CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/02941_variant_type_2.sh b/tests/queries/0_stateless/02941_variant_type_2.sh index 91ba0285bd8..f43cd2bb0d6 100755 --- a/tests/queries/0_stateless/02941_variant_type_2.sh +++ b/tests/queries/0_stateless/02941_variant_type_2.sh @@ -7,7 +7,7 @@ CLICKHOUSE_LOG_COMMENT= # shellcheck source=../shell_config.sh . 
"$CUR_DIR"/../shell_config.sh -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --allow_suspicious_variant_types=1 --index_granularity_bytes=10485760 --index_granularity=8192" +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --allow_suspicious_variant_types=1" function test4_insert() { @@ -61,11 +61,11 @@ run 0 $CH_CLIENT -q "drop table test;" echo "MergeTree compact" -$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000, index_granularity_bytes=10485760, index_granularity=8192;" run 1 $CH_CLIENT -q "drop table test;" echo "MergeTree wide" -$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, index_granularity_bytes=10485760, index_granularity=8192;" run 1 $CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/02941_variant_type_3.sh b/tests/queries/0_stateless/02941_variant_type_3.sh index 8a039a02d6d..f4b2b304f56 100755 --- a/tests/queries/0_stateless/02941_variant_type_3.sh +++ b/tests/queries/0_stateless/02941_variant_type_3.sh @@ -7,7 +7,7 @@ CLICKHOUSE_LOG_COMMENT= # shellcheck source=../shell_config.sh . 
"$CUR_DIR"/../shell_config.sh -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --allow_suspicious_variant_types=1 --index_granularity_bytes=10485760 --index_granularity=8192 " +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --allow_suspicious_variant_types=1" function test5_insert() { @@ -63,11 +63,11 @@ run 0 $CH_CLIENT -q "drop table test;" echo "MergeTree compact" -$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000, index_granularity_bytes=10485760, index_granularity=8192;" run 1 $CH_CLIENT -q "drop table test;" echo "MergeTree wide" -$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, index_granularity_bytes=10485760, index_granularity=8192;" run 1 $CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/02941_variant_type_4.sh b/tests/queries/0_stateless/02941_variant_type_4.sh index e38db8fda54..f9a16847864 100755 --- a/tests/queries/0_stateless/02941_variant_type_4.sh +++ b/tests/queries/0_stateless/02941_variant_type_4.sh @@ -7,7 +7,8 @@ CLICKHOUSE_LOG_COMMENT= # shellcheck source=../shell_config.sh . 
"$CUR_DIR"/../shell_config.sh -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --allow_suspicious_variant_types=1 --index_granularity_bytes=10485760 --index_granularity=8192 " + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --allow_suspicious_variant_types=1" function test6_insert() { @@ -57,11 +58,11 @@ run 0 $CH_CLIENT -q "drop table test;" echo "MergeTree compact" -$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000, index_granularity_bytes=10485760, index_granularity=8192;" run 1 $CH_CLIENT -q "drop table test;" echo "MergeTree wide" -$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, index_granularity_bytes=10485760, index_granularity=8192;" run 1 $CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.expect b/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.expect deleted file mode 100755 index de15a199132..00000000000 --- a/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.expect +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/expect -f - -set basedir [file dirname $argv0] -set basename [file tail $argv0] -if {[info exists env(CLICKHOUSE_TMP)]} { - set CLICKHOUSE_TMP $env(CLICKHOUSE_TMP) -} else { - set CLICKHOUSE_TMP "." -} -exp_internal -f $CLICKHOUSE_TMP/$basename.debuglog 0 - -log_user 0 -set timeout 60 -match_max 100000 -set stty_init "rows 25 cols 120" - -expect_after { - -i $any_spawn_id eof { exp_continue } - -i $any_spawn_id timeout { exit 1 } -} - -spawn clickhouse-local -expect ":) " - -# Trivial SELECT with LIMIT from system.zeros shows progress bar. -send "SELECT * FROM system.zeros LIMIT 10000000 FORMAT Null SETTINGS max_execution_speed = 1000000, timeout_before_checking_execution_speed = 0, max_block_size = 128\r" -expect "Progress: " -expect "█" -send "\3" -expect "Query was cancelled." -expect ":) " - -send "SELECT * FROM system.zeros_mt LIMIT 10000000 FORMAT Null SETTINGS max_execution_speed = 1000000, timeout_before_checking_execution_speed = 0, max_block_size = 128\r" -expect "Progress: " -expect "█" -send "\3" -expect "Query was cancelled." -expect ":) " - -# As well as from generateRandom -send "SELECT * FROM generateRandom() LIMIT 10000000 FORMAT Null SETTINGS max_execution_speed = 1000000, timeout_before_checking_execution_speed = 0, max_block_size = 128\r" -expect "Progress: " -expect "█" -send "\3" -expect "Query was cancelled." 
-expect ":) " - -send "exit\r" -expect eof diff --git a/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.reference b/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.reference index e69de29bb2d..6ca5ae94f9a 100644 --- a/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.reference +++ b/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.reference @@ -0,0 +1,3 @@ +Matched +Matched +Matched diff --git a/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.sh b/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.sh new file mode 100755 index 00000000000..500a12587a2 --- /dev/null +++ b/tests/queries/0_stateless/03023_zeros_generate_random_with_limit_progress_bar.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# Tags: no-random-settings + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +function run_with_progress_and_match_total_rows() +{ + CURL_RESPONSE=$(echo "$1" | \ + ${CLICKHOUSE_CURL} -vsS "${CLICKHOUSE_URL}&wait_end_of_query=1&max_block_size=1&send_progress_in_http_headers=1&http_headers_progress_interval_ms=0&output_format_parallel_formatting=0" --data-binary @- 2>&1) + + echo "$CURL_RESPONSE" | grep -q '"total_rows_to_read":"100"' && echo "Matched" || echo "Expected total_rows_to_read not found: ${CURL_RESPONSE}" +} + +run_with_progress_and_match_total_rows 'SELECT * FROM system.zeros LIMIT 100' +run_with_progress_and_match_total_rows 'SELECT * FROM system.zeros_mt LIMIT 100' +run_with_progress_and_match_total_rows "SELECT * FROM generateRandom('number UInt64') LIMIT 100" diff --git a/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.sh b/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.sh index 7c1ac41cfdc..887b2ed94d7 100755 --- a/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.sh +++ b/tests/queries/0_stateless/03037_dynamic_merges_1_horizontal.sh @@ -8,7 +8,7 @@ CLICKHOUSE_LOG_COMMENT= . 
"$CUR_DIR"/../shell_config.sh -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --index_granularity_bytes 10485760 --merge_max_block_size 8192 --merge_max_block_size_bytes=10485760 --index_granularity 8192" +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1" function test() { @@ -41,12 +41,12 @@ function test() $CH_CLIENT -q "drop table if exists test;" echo "MergeTree compact" -$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000, vertical_merge_algorithm_min_columns_to_activate=10;" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000, vertical_merge_algorithm_min_columns_to_activate=10, index_granularity_bytes=10485760, index_granularity=8192, merge_max_block_size=8192, merge_max_block_size_bytes=10485760;" test $CH_CLIENT -q "drop table test;" echo "MergeTree wide" -$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_columns_to_activate=10;" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_columns_to_activate=10, index_granularity_bytes=10485760, index_granularity=8192, merge_max_block_size=8192, merge_max_block_size_bytes=10485760;" test $CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.sh b/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.sh index 927ceac72b5..371ae87c2ef 100755 --- a/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.sh +++ b/tests/queries/0_stateless/03037_dynamic_merges_1_vertical.sh @@ -9,7 +9,7 @@ CLICKHOUSE_LOG_COMMENT= -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1 --index_granularity_bytes 10485760 --merge_max_block_size 8192 --merge_max_block_size_bytes=10485760 --index_granularity 8192" +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_dynamic_type=1" function test() { echo "test" @@ -41,11 +41,11 @@ function test() $CH_CLIENT -q "drop table if exists test;" echo "MergeTree compact" -$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1;" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1, index_granularity_bytes=10485760, index_granularity=8192, merge_max_block_size=8192, merge_max_block_size_bytes=10485760;" test $CH_CLIENT -q "drop table test;" echo "MergeTree wide" -$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1;" +$CH_CLIENT -q "create table test (id UInt64, d Dynamic(max_types=3)) engine=MergeTree order by id settings min_rows_for_wide_part=1, 
min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1, index_granularity_bytes=10485760, index_granularity=8192, merge_max_block_size=8192, merge_max_block_size_bytes=10485760;" test $CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/03039_dynamic_versioned_collapsing_merge_tree.sh b/tests/queries/0_stateless/03039_dynamic_versioned_collapsing_merge_tree.sh index ca313307a6d..60248f4453a 100755 --- a/tests/queries/0_stateless/03039_dynamic_versioned_collapsing_merge_tree.sh +++ b/tests/queries/0_stateless/03039_dynamic_versioned_collapsing_merge_tree.sh @@ -7,6 +7,7 @@ CLICKHOUSE_LOG_COMMENT= # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh + # Fix some settings to avoid timeouts because of some settings randomization CH_CLIENT="$CLICKHOUSE_CLIENT --allow_merge_tree_settings --allow_experimental_dynamic_type=1 --index_granularity_bytes 10485760 --index_granularity 8128 --merge_max_block_size 8128" @@ -32,7 +33,7 @@ echo "MergeTree wide + horizontal merge" test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1" echo "MergeTree compact + vertical merge" -test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" +test "min_rows_for_wide_part=100000000000, min_bytes_for_wide_part=1000000000000, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1;" echo "MergeTree wide + vertical merge" -test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1" +test "min_rows_for_wide_part=1, min_bytes_for_wide_part=1, vertical_merge_algorithm_min_rows_to_activate=1, vertical_merge_algorithm_min_columns_to_activate=1;" diff --git a/tests/queries/0_stateless/03142_alter_comment_parameterized_view.reference b/tests/queries/0_stateless/03142_alter_comment_parameterized_view.reference new file mode 100644 index 00000000000..9b93c75ea56 --- /dev/null +++ b/tests/queries/0_stateless/03142_alter_comment_parameterized_view.reference @@ -0,0 +1 @@ +CREATE VIEW default.test_table_comment AS (SELECT toString({date_from:String})) COMMENT \'test comment\' diff --git a/tests/queries/0_stateless/03142_alter_comment_parameterized_view.sql b/tests/queries/0_stateless/03142_alter_comment_parameterized_view.sql new file mode 100644 index 00000000000..98318e99e4a --- /dev/null +++ b/tests/queries/0_stateless/03142_alter_comment_parameterized_view.sql @@ -0,0 +1,5 @@ +DROP TABLE IF EXISTS test_table_comment; +CREATE VIEW test_table_comment AS SELECT toString({date_from:String}); +ALTER TABLE test_table_comment MODIFY COMMENT 'test comment'; +SELECT create_table_query FROM system.tables WHERE name = 'test_table_comment' AND database = currentDatabase(); +DROP TABLE test_table_comment; diff --git a/tests/queries/0_stateless/03143_prewhere_profile_events.sh b/tests/queries/0_stateless/03143_prewhere_profile_events.sh index 863fcc1fe01..00daa0fe7cc 100755 --- a/tests/queries/0_stateless/03143_prewhere_profile_events.sh +++ b/tests/queries/0_stateless/03143_prewhere_profile_events.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-random-merge-tree-settings +# Tags: no-random-settings, no-random-merge-tree-settings CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -8,7 +8,7 @@ CURDIR=$(cd "$(dirname 
"${BASH_SOURCE[0]}")" && pwd) ${CLICKHOUSE_CLIENT} -nq " DROP TABLE IF EXISTS t; - CREATE TABLE t(a UInt32, b UInt32, c UInt32, d UInt32) ENGINE=MergeTree ORDER BY a SETTINGS min_bytes_for_wide_part=1, min_rows_for_wide_part=1; + CREATE TABLE t(a UInt32, b UInt32, c UInt32, d UInt32) ENGINE=MergeTree ORDER BY a SETTINGS min_bytes_for_wide_part=0, min_rows_for_wide_part=0; INSERT INTO t SELECT number, number, number, number FROM numbers_mt(1e7); diff --git a/tests/queries/0_stateless/03155_analyzer_interpolate.reference b/tests/queries/0_stateless/03155_analyzer_interpolate.reference index 791aaa5b2a2..eade3b45d26 100644 --- a/tests/queries/0_stateless/03155_analyzer_interpolate.reference +++ b/tests/queries/0_stateless/03155_analyzer_interpolate.reference @@ -11,3 +11,8 @@ 5 [5] 5.5 [5] 7 [7] +2 +100500 +18 +26 +34 diff --git a/tests/queries/0_stateless/03155_analyzer_interpolate.sql b/tests/queries/0_stateless/03155_analyzer_interpolate.sql index b3c1d233f47..30423cb86ff 100644 --- a/tests/queries/0_stateless/03155_analyzer_interpolate.sql +++ b/tests/queries/0_stateless/03155_analyzer_interpolate.sql @@ -10,3 +10,6 @@ SELECT n, number+5 AS inter FROM ( -- { serverError NOT_AN_AGGREGATE } SELECT toFloat32(number % 10) AS n, number, number*2 AS mn FROM numbers(10) WHERE number % 3 = 1 ) GROUP BY n, inter ORDER BY n WITH FILL FROM 0 TO 5.51 STEP 0.5 INTERPOLATE (inter AS mn * 2); + +-- https://github.com/ClickHouse/ClickHouse/issues/64636 +select sum(number) as s from remote('127.0.0.{1,2}', numbers(10)) where (intDiv(number, 2) as key) != 1 group by key order by key with fill interpolate (s as 100500); diff --git a/tests/queries/0_stateless/03164_adapting_parquet_reader_output_size.reference b/tests/queries/0_stateless/03164_adapting_parquet_reader_output_size.reference new file mode 100644 index 00000000000..ef9b07ba955 --- /dev/null +++ b/tests/queries/0_stateless/03164_adapting_parquet_reader_output_size.reference @@ -0,0 +1,4 @@ +65409 +16 +128 +2363 diff --git a/tests/queries/0_stateless/03164_adapting_parquet_reader_output_size.sql b/tests/queries/0_stateless/03164_adapting_parquet_reader_output_size.sql new file mode 100644 index 00000000000..fa098b64702 --- /dev/null +++ b/tests/queries/0_stateless/03164_adapting_parquet_reader_output_size.sql @@ -0,0 +1,25 @@ +-- Tags: no-fasttest, no-parallel, no-random-settings + +set max_insert_threads=1; + +DROP TABLE IF EXISTS test_parquet; +CREATE TABLE test_parquet (col1 String, col2 String, col3 String, col4 String, col5 String, col6 String, col7 String) ENGINE=File(Parquet); +INSERT INTO test_parquet SELECT rand(),rand(),rand(),rand(),rand(),rand(),rand() FROM numbers(100000); +SELECT max(blockSize()) FROM test_parquet; + +DROP TABLE IF EXISTS test_parquet; +CREATE TABLE test_parquet (col1 String, col2 String, col3 String, col4 String, col5 String, col6 String, col7 String) ENGINE=File(Parquet) settings input_format_parquet_max_block_size=16; +INSERT INTO test_parquet SELECT rand(),rand(),rand(),rand(),rand(),rand(),rand() FROM numbers(100000); +SELECT max(blockSize()) FROM test_parquet; + +DROP TABLE IF EXISTS test_parquet; +CREATE TABLE test_parquet (col1 String, col2 String, col3 String, col4 String, col5 String, col6 String, col7 String) ENGINE=File(Parquet) settings input_format_parquet_prefer_block_bytes=30; +INSERT INTO test_parquet SELECT rand(),rand(),rand(),rand(),rand(),rand(),rand() FROM numbers(100000); +SELECT max(blockSize()) FROM test_parquet; + +DROP TABLE IF EXISTS test_parquet; +CREATE TABLE test_parquet (col1 String, 
col2 String, col3 String, col4 String, col5 String, col6 String, col7 String) ENGINE=File(Parquet) settings input_format_parquet_prefer_block_bytes=30720; +INSERT INTO test_parquet SELECT rand(),rand(),rand(),rand(),rand(),rand(),rand() FROM numbers(100000); +SELECT max(blockSize()) FROM test_parquet; + +DROP TABLE IF EXISTS test_parquet; \ No newline at end of file diff --git a/tests/queries/0_stateless/03164_early_constant_folding_analyzer.reference b/tests/queries/0_stateless/03164_early_constant_folding_analyzer.reference new file mode 100644 index 00000000000..227b118bb7f --- /dev/null +++ b/tests/queries/0_stateless/03164_early_constant_folding_analyzer.reference @@ -0,0 +1 @@ +ReadFromPreparedSource (Optimized trivial count) diff --git a/tests/queries/0_stateless/03164_early_constant_folding_analyzer.sql b/tests/queries/0_stateless/03164_early_constant_folding_analyzer.sql new file mode 100644 index 00000000000..dbffbc1af71 --- /dev/null +++ b/tests/queries/0_stateless/03164_early_constant_folding_analyzer.sql @@ -0,0 +1,30 @@ +CREATE TABLE checks +( + `pull_request_number` UInt32, + `commit_sha` LowCardinality(String), + `check_name` LowCardinality(String), + `check_status` LowCardinality(String), + `check_duration_ms` UInt64, + `check_start_time` DateTime, + `test_name` LowCardinality(String), + `test_status` LowCardinality(String), + `test_duration_ms` UInt64, + `report_url` String, + `pull_request_url` String, + `commit_url` String, + `task_url` String, + `base_ref` String, + `base_repo` String, + `head_ref` String, + `head_repo` String, + `test_context_raw` String, + `instance_type` LowCardinality(String), + `instance_id` String, + `date` Date MATERIALIZED toDate(check_start_time) +) +ENGINE = MergeTree ORDER BY (date, pull_request_number, commit_sha, check_name, test_name, check_start_time); + +insert into checks select * from generateRandom() limit 1; + + +select trimLeft(explain) from (explain SELECT count(1) FROM checks WHERE test_name IS NOT NULL) where explain like '%ReadFromPreparedSource%' SETTINGS allow_experimental_analyzer = 1, allow_experimental_parallel_reading_from_replicas = 0; diff --git a/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.sql b/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.sql index fee30af0245..bae98bd1eb6 100644 --- a/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.sql +++ b/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.sql @@ -127,7 +127,9 @@ CREATE TABLE 03165_token_ft INDEX idx_message message TYPE full_text() GRANULARITY 1 ) ENGINE = MergeTree -ORDER BY id; +ORDER BY id +-- Full text index works only with full parts. 
+SETTINGS min_bytes_for_full_part_storage=0; INSERT INTO 03165_token_ft VALUES(1, 'Service is not ready'); diff --git a/tests/queries/0_stateless/03166_optimize_row_order_during_insert.sql b/tests/queries/0_stateless/03166_optimize_row_order_during_insert.sql index bb2f5e94d05..5fc71598e47 100644 --- a/tests/queries/0_stateless/03166_optimize_row_order_during_insert.sql +++ b/tests/queries/0_stateless/03166_optimize_row_order_during_insert.sql @@ -14,7 +14,7 @@ CREATE TABLE tab ( event Int8 ) ENGINE = MergeTree ORDER BY name -SETTINGS allow_experimental_optimized_row_order = true; +SETTINGS optimize_row_order = true; INSERT INTO tab VALUES ('Igor', 3), ('Egor', 1), ('Egor', 2), ('Igor', 2), ('Igor', 1); SELECT * FROM tab ORDER BY name SETTINGS max_threads=1; @@ -34,7 +34,7 @@ CREATE TABLE tab ( flag String ) ENGINE = MergeTree ORDER BY () -SETTINGS allow_experimental_optimized_row_order = True; +SETTINGS optimize_row_order = True; INSERT INTO tab VALUES ('Bob', 4, 100, '1'), ('Nikita', 2, 54, '1'), ('Nikita', 1, 228, '1'), ('Alex', 4, 83, '1'), ('Alex', 4, 134, '1'), ('Alex', 1, 65, '0'), ('Alex', 4, 134, '1'), ('Bob', 2, 53, '0'), ('Alex', 4, 83, '0'), ('Alex', 1, 63, '1'), ('Bob', 2, 53, '1'), ('Alex', 4, 192, '1'), ('Alex', 2, 128, '1'), ('Nikita', 2, 148, '0'), ('Bob', 4, 177, '0'), ('Nikita', 1, 173, '0'), ('Alex', 1, 239, '0'), ('Alex', 1, 63, '0'), ('Alex', 2, 224, '1'), ('Bob', 4, 177, '0'), ('Alex', 2, 128, '1'), ('Alex', 4, 134, '0'), ('Alex', 4, 83, '1'), ('Bob', 4, 100, '0'), ('Nikita', 2, 54, '1'), ('Alex', 1, 239, '1'), ('Bob', 2, 187, '1'), ('Alex', 1, 65, '1'), ('Bob', 2, 53, '1'), ('Alex', 2, 224, '0'), ('Alex', 4, 192, '0'), ('Nikita', 1, 173, '1'), ('Nikita', 2, 148, '1'), ('Bob', 2, 187, '1'), ('Nikita', 2, 208, '1'), ('Nikita', 2, 208, '0'), ('Nikita', 1, 228, '0'), ('Nikita', 2, 148, '0'); SELECT * FROM tab SETTINGS max_threads=1; @@ -58,7 +58,7 @@ CREATE TABLE tab ( flag Nullable(Int32) ) ENGINE = MergeTree ORDER BY (flag, money) -SETTINGS allow_experimental_optimized_row_order = True, allow_nullable_key = True; +SETTINGS optimize_row_order = True, allow_nullable_key = True; INSERT INTO tab VALUES ('AB', 0, 42, Null), ('AB', 0, 42, Null), ('A', 1, 42, Null), ('AB', 1, 9.81, 0), ('B', 0, 42, Null), ('B', -1, 3.14, Null), ('B', 1, 2.7, 1), ('B', 0, 42, 1), ('A', 1, 42, 1), ('B', 1, 42, Null), ('B', 0, 2.7, 1), ('A', 0, 2.7, 1), ('B', 2, 3.14, Null), ('A', 0, 3.14, Null), ('A', 1, 2.7, 1), ('A', 1, 42, Null); SELECT * FROM tab ORDER BY (flag, money) SETTINGS max_threads=1; @@ -89,7 +89,7 @@ CREATE TABLE tab ( tuple_column Tuple(UInt256) ) ENGINE = MergeTree() ORDER BY (fixed_str, event_date) -SETTINGS allow_experimental_optimized_row_order = True; +SETTINGS optimize_row_order = True; INSERT INTO tab VALUES ('A', '2020-01-01', [0.0, 1.1], 10, 'some string', {'key':'value'}, (123)), ('A', '2020-01-01', [0.0, 1.1], NULL, 'example', {}, (26)), ('A', '2020-01-01', [2.2, 1.1], 1, 'some other string', {'key2':'value2'}, (5)), ('A', '2020-01-02', [0.0, 1.1], 10, 'some string', {'key':'value'}, (123)), ('A', '2020-01-02', [0.0, 2.2], 10, 'example', {}, (26)), ('A', '2020-01-02', [2.2, 1.1], 1, 'some other string', {'key2':'value2'}, (5)), ('B', '2020-01-04', [0.0, 1.1], 10, 'some string', {'key':'value'}, (123)), ('B', '2020-01-04', [0.0, 2.2], Null, 'example', {}, (26)), ('B', '2020-01-04', [2.2, 1.1], 1, 'some string', {'key2':'value2'}, (5)), ('B', '2020-01-05', [0.0, 1.1], 10, 'some string', {'key':'value'}, (123)), ('B', '2020-01-05', [0.0, 2.2], Null, 'example', {}, (26)), 
('B', '2020-01-05', [2.2, 1.1], 1, 'some other string', {'key':'value'}, (5)), ('C', '2020-01-04', [0.0, 1.1], 10, 'some string', {'key':'value'}, (5)), ('C', '2020-01-04', [0.0, 2.2], Null, 'example', {}, (26)), ('C', '2020-01-04', [2.2, 1.1], 1, 'some other string', {'key2':'value2'}, (5)); diff --git a/tests/queries/0_stateless/03166_skip_indexes_vertical_merge_1.reference b/tests/queries/0_stateless/03166_skip_indexes_vertical_merge_1.reference new file mode 100644 index 00000000000..86f79bea4ba --- /dev/null +++ b/tests/queries/0_stateless/03166_skip_indexes_vertical_merge_1.reference @@ -0,0 +1,33 @@ +200 +Expression ((Project names + Projection)) + Aggregating + Expression (Before GROUP BY) + Filter ((WHERE + Change column names to column identifiers)) + ReadFromMergeTree (default.t_ind_merge_1) + Indexes: + PrimaryKey + Condition: true + Parts: 2/2 + Granules: 32/32 + Skip + Name: idx_b + Description: minmax GRANULARITY 1 + Parts: 2/2 + Granules: 4/32 +200 +Expression ((Project names + Projection)) + Aggregating + Expression (Before GROUP BY) + Filter ((WHERE + Change column names to column identifiers)) + ReadFromMergeTree (default.t_ind_merge_1) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 32/32 + Skip + Name: idx_b + Description: minmax GRANULARITY 1 + Parts: 1/1 + Granules: 4/32 +4 1 3 diff --git a/tests/queries/0_stateless/03166_skip_indexes_vertical_merge_1.sql b/tests/queries/0_stateless/03166_skip_indexes_vertical_merge_1.sql new file mode 100644 index 00000000000..d3e3b38a3cb --- /dev/null +++ b/tests/queries/0_stateless/03166_skip_indexes_vertical_merge_1.sql @@ -0,0 +1,39 @@ +DROP TABLE IF EXISTS t_ind_merge_1; + +SET allow_experimental_analyzer = 1; + +CREATE TABLE t_ind_merge_1 (a UInt64, b UInt64, c UInt64, d UInt64, INDEX idx_b b TYPE minmax) +ENGINE = MergeTree +ORDER BY a SETTINGS + index_granularity = 64, + merge_max_block_size = 8192, + vertical_merge_algorithm_min_rows_to_activate = 1, + vertical_merge_algorithm_min_columns_to_activate = 1, + min_bytes_for_wide_part = 0, + min_bytes_for_full_part_storage = 0; + +INSERT INTO t_ind_merge_1 SELECT number, number, rand(), rand() FROM numbers(1000); +INSERT INTO t_ind_merge_1 SELECT number, number, rand(), rand() FROM numbers(1000); + +SELECT count() FROM t_ind_merge_1 WHERE b < 100 SETTINGS force_data_skipping_indices = 'idx_b'; +EXPLAIN indexes = 1 SELECT count() FROM t_ind_merge_1 WHERE b < 100; + +OPTIMIZE TABLE t_ind_merge_1 FINAL; + +SELECT count() FROM t_ind_merge_1 WHERE b < 100 SETTINGS force_data_skipping_indices = 'idx_b'; +EXPLAIN indexes = 1 SELECT count() FROM t_ind_merge_1 WHERE b < 100; + +SYSTEM FLUSH LOGS; + +WITH + (SELECT uuid FROM system.tables WHERE database = currentDatabase() AND table = 't_ind_merge_1') AS uuid, + extractAllGroupsVertical(message, 'containing (\\d+) columns \((\\d+) merged, (\\d+) gathered\)')[1] AS groups +SELECT + groups[1] AS total, + groups[2] AS merged, + groups[3] AS gathered +FROM system.text_log +WHERE ((query_id = uuid || '::all_1_2_1') OR (query_id = currentDatabase() || '.t_ind_merge_1::all_1_2_1')) AND notEmpty(groups) +ORDER BY event_time_microseconds; + +DROP TABLE t_ind_merge_1; diff --git a/tests/queries/0_stateless/03166_skip_indexes_vertical_merge_2.reference b/tests/queries/0_stateless/03166_skip_indexes_vertical_merge_2.reference new file mode 100644 index 00000000000..4c2f01294a4 --- /dev/null +++ b/tests/queries/0_stateless/03166_skip_indexes_vertical_merge_2.reference @@ -0,0 +1 @@ +6 3 3 diff --git 
a/tests/queries/0_stateless/03166_skip_indexes_vertical_merge_2.sql b/tests/queries/0_stateless/03166_skip_indexes_vertical_merge_2.sql new file mode 100644 index 00000000000..b749e0c84b0 --- /dev/null +++ b/tests/queries/0_stateless/03166_skip_indexes_vertical_merge_2.sql @@ -0,0 +1,42 @@ +DROP TABLE IF EXISTS t_ind_merge_2; + +CREATE TABLE t_ind_merge_2 ( + a UInt64, + b UInt64, + c UInt64, + d UInt64, + e UInt64, + f UInt64, + INDEX idx_a a TYPE minmax, + INDEX idx_b b TYPE minmax, + INDEX idx_cd c * d TYPE minmax, + INDEX idx_d1 d TYPE minmax, + INDEX idx_d2 d + 7 TYPE set(3), + INDEX idx_e e * 3 TYPE set(3)) +ENGINE = MergeTree +ORDER BY a SETTINGS + index_granularity = 64, + vertical_merge_algorithm_min_rows_to_activate = 1, + vertical_merge_algorithm_min_columns_to_activate = 1, + min_bytes_for_wide_part = 0, + min_bytes_for_full_part_storage = 0; + +INSERT INTO t_ind_merge_2 SELECT number, number, rand(), rand(), rand(), rand() FROM numbers(1000); +INSERT INTO t_ind_merge_2 SELECT number, number, rand(), rand(), rand(), rand() FROM numbers(1000); + +OPTIMIZE TABLE t_ind_merge_2 FINAL; +SYSTEM FLUSH LOGS; + +--- merged: a, c, d; gathered: b, e, f +WITH + (SELECT uuid FROM system.tables WHERE database = currentDatabase() AND table = 't_ind_merge_2') AS uuid, + extractAllGroupsVertical(message, 'containing (\\d+) columns \((\\d+) merged, (\\d+) gathered\)')[1] AS groups +SELECT + groups[1] AS total, + groups[2] AS merged, + groups[3] AS gathered +FROM system.text_log +WHERE ((query_id = uuid || '::all_1_2_1') OR (query_id = currentDatabase() || '.t_ind_merge_2::all_1_2_1')) AND notEmpty(groups) +ORDER BY event_time_microseconds; + +DROP TABLE t_ind_merge_2; diff --git a/tests/queries/0_stateless/03168_cld2_tsan.reference b/tests/queries/0_stateless/03168_cld2_tsan.reference new file mode 100644 index 00000000000..6c3cafd4a6d --- /dev/null +++ b/tests/queries/0_stateless/03168_cld2_tsan.reference @@ -0,0 +1,2 @@ +{'ja':0.62,'fr':0.36} +{'ja':0.62,'fr':0.36} diff --git a/tests/queries/0_stateless/03168_cld2_tsan.sql b/tests/queries/0_stateless/03168_cld2_tsan.sql new file mode 100644 index 00000000000..701a781c472 --- /dev/null +++ b/tests/queries/0_stateless/03168_cld2_tsan.sql @@ -0,0 +1,10 @@ +-- Tags: no-fasttest +-- Tag no-fasttest: depends on cld2 + +-- https://github.com/ClickHouse/ClickHouse/issues/64931 +SELECT detectLanguageMixed(materialize('二兎を追う者は一兎をも得ず二兎を追う者は一兎をも得ず A vaincre sans peril, on triomphe sans gloire.')) +GROUP BY + GROUPING SETS ( + ('a', toUInt256(1)), + (stringToH3(toFixedString(toFixedString('85283473ffffff', 14), 14)))) +SETTINGS allow_experimental_nlp_functions = 1; diff --git a/tests/queries/0_stateless/03168_fuzz_multiIf_short_circuit.reference b/tests/queries/0_stateless/03168_fuzz_multiIf_short_circuit.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03168_fuzz_multiIf_short_circuit.sql b/tests/queries/0_stateless/03168_fuzz_multiIf_short_circuit.sql new file mode 100644 index 00000000000..4e4cc291e9b --- /dev/null +++ b/tests/queries/0_stateless/03168_fuzz_multiIf_short_circuit.sql @@ -0,0 +1,6 @@ +-- https://github.com/ClickHouse/ClickHouse/issues/64946 +SELECT + multiIf((number % toLowCardinality(toNullable(toUInt128(2)))) = (number % toNullable(2)), toInt8(1), (number % materialize(toLowCardinality(3))) = toUInt128(toNullable(0)), toInt8(materialize(materialize(2))), toInt64(toUInt128(3))) +FROM system.numbers +LIMIT 44857 +FORMAT Null; diff --git 
a/tests/queries/0_stateless/03169_cache_complex_dict_short_circuit_bug.reference b/tests/queries/0_stateless/03169_cache_complex_dict_short_circuit_bug.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03169_cache_complex_dict_short_circuit_bug.sql b/tests/queries/0_stateless/03169_cache_complex_dict_short_circuit_bug.sql new file mode 100644 index 00000000000..8463d13d251 --- /dev/null +++ b/tests/queries/0_stateless/03169_cache_complex_dict_short_circuit_bug.sql @@ -0,0 +1,31 @@ +DROP TABLE IF EXISTS complex_key_simple_attributes_source_short_circuit_table; +DROP DICTIONARY IF EXISTS cache_dictionary_complex_key_simple_attributes_short_circuit; + +CREATE TABLE complex_key_simple_attributes_source_short_circuit_table +( + id UInt64, + id_key String, + value_first String, + value_second String +) + ENGINE = TinyLog; + +INSERT INTO complex_key_simple_attributes_source_short_circuit_table VALUES(0, 'id_key_0', 'value_0', 'value_second_0'); + +CREATE DICTIONARY cache_dictionary_complex_key_simple_attributes_short_circuit +( + `id` UInt64, + `id_key` String, + `value_first` String DEFAULT 'value_first_default', + `value_second` String DEFAULT 'value_second_default' +) +PRIMARY KEY id, id_key +SOURCE(CLICKHOUSE(TABLE 'complex_key_simple_attributes_source_short_circuit_table')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(COMPLEX_KEY_CACHE(SIZE_IN_CELLS 10)); + +SELECT dictGetOrDefault('cache_dictionary_complex_key_simple_attributes_short_circuit', 'value_first', (number, concat(toString(number))), toString(materialize('default'))) AS value_first FROM system.numbers LIMIT 20 FORMAT Null; +SELECT dictGetOrDefault('cache_dictionary_complex_key_simple_attributes_short_circuit', 'value_first', (number, concat(toString(number))), toString(materialize('default'))) AS value_first FROM system.numbers LIMIT 20 FORMAT Null; + +DROP TABLE IF EXISTS complex_key_simple_attributes_source_short_circuit_table; +DROP DICTIONARY IF EXISTS cache_dictionary_complex_key_simple_attributes_short_circuit; diff --git a/tests/queries/0_stateless/03169_time_virtual_column.reference b/tests/queries/0_stateless/03169_time_virtual_column.reference new file mode 100644 index 00000000000..4482956b706 --- /dev/null +++ b/tests/queries/0_stateless/03169_time_virtual_column.reference @@ -0,0 +1 @@ +4 1 diff --git a/tests/queries/0_stateless/03169_time_virtual_column.sh b/tests/queries/0_stateless/03169_time_virtual_column.sh new file mode 100755 index 00000000000..fef1de8c6f2 --- /dev/null +++ b/tests/queries/0_stateless/03169_time_virtual_column.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +echo "1,2" > $CLICKHOUSE_TEST_UNIQUE_NAME.csv +sleep 1 +$CLICKHOUSE_LOCAL -nm -q " +select _size, (dateDiff('millisecond', _time, now()) < 600000 AND dateDiff('millisecond', _time, now()) > 0) from file('$CLICKHOUSE_TEST_UNIQUE_NAME.csv'); +" +rm $CLICKHOUSE_TEST_UNIQUE_NAME.csv diff --git a/tests/queries/0_stateless/03170_float_schema_inference_small_block.reference b/tests/queries/0_stateless/03170_float_schema_inference_small_block.reference new file mode 100644 index 00000000000..9ee16da8728 --- /dev/null +++ b/tests/queries/0_stateless/03170_float_schema_inference_small_block.reference @@ -0,0 +1,15 @@ +Int64 +x Nullable(Int64) +x Nullable(Int64) +x Nullable(Int64) +Float64 +x Nullable(Float64) +x Nullable(Float64) +x Nullable(Float64) +x Nullable(Float64) +Float64.explicit File +x Nullable(Float64) +Float64.pipe +x Nullable(Float64) +Float64.default max_read_buffer_size +x Nullable(Float64) diff --git a/tests/queries/0_stateless/03170_float_schema_inference_small_block.sh b/tests/queries/0_stateless/03170_float_schema_inference_small_block.sh new file mode 100755 index 00000000000..88f9bfad7ed --- /dev/null +++ b/tests/queries/0_stateless/03170_float_schema_inference_small_block.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# do not fallback to float always +echo "Int64" +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : 1}' +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : +1}' +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : -1}' + +echo "Float64" +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : 1.1}' +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : +1.1}' +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : 1.111}' +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : +1.111}' + +# this is requried due to previously clickhouse-local does not interprets +# --max_read_buffer_size for fds [1] +# +# [1]: https://github.com/ClickHouse/ClickHouse/pull/64532 +echo "Float64.explicit File" +tmp_path=$(mktemp "$CUR_DIR/03170_float_schema_inference_small_block.json.XXXXXX") +trap 'rm -f $tmp_path' EXIT +cat > "$tmp_path" <<<'{"x" : 1.111}' +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' --file "$tmp_path" + +echo "Float64.pipe" +echo '{"x" : 1.1}' | $CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' +echo "Float64.default max_read_buffer_size" +echo '{"x" : 1.1}' | $CLICKHOUSE_LOCAL --storage_file_read_method read --input-format JSONEachRow 'desc "table"' diff --git a/tests/queries/1_stateful/00166_explain_estimate.sql b/tests/queries/1_stateful/00166_explain_estimate.sql index c4071271736..136433c16ee 100644 --- a/tests/queries/1_stateful/00166_explain_estimate.sql +++ b/tests/queries/1_stateful/00166_explain_estimate.sql @@ -1,6 +1,8 @@ -- Tags: 
no-replicated-database -- Tag no-replicated-database: Requires investigation +SET optimize_use_implicit_projections = 0; + EXPLAIN ESTIMATE SELECT count() FROM test.hits WHERE CounterID = 29103473; EXPLAIN ESTIMATE SELECT count() FROM test.hits WHERE CounterID != 29103473; EXPLAIN ESTIMATE SELECT count() FROM test.hits WHERE CounterID > 29103473; diff --git a/tests/queries/1_stateful/00175_counting_resources_in_subqueries.sql b/tests/queries/1_stateful/00175_counting_resources_in_subqueries.sql index 63eca96414f..5733bf6dd18 100644 --- a/tests/queries/1_stateful/00175_counting_resources_in_subqueries.sql +++ b/tests/queries/1_stateful/00175_counting_resources_in_subqueries.sql @@ -1,3 +1,5 @@ +SET optimize_use_implicit_projections = 0; + -- the work for scalar subquery is properly accounted: SET max_rows_to_read = 1000000; SELECT 1 = (SELECT count() FROM test.hits WHERE NOT ignore(AdvEngineID)); -- { serverError TOO_MANY_ROWS } diff --git a/utils/changelog/changelog.py b/utils/changelog/changelog.py index 314461a6b3a..b79e4139bcc 100755 --- a/utils/changelog/changelog.py +++ b/utils/changelog/changelog.py @@ -1,427 +1,15 @@ #!/usr/bin/env python3 # In our CI this script runs in style-test containers -import argparse -import logging -import os -import os.path as p -import re -from datetime import date, timedelta -from subprocess import DEVNULL, CalledProcessError -from typing import Dict, List, Optional, TextIO +# The main script is moved to tests/ci/changelog.py +# It depends too heavily on the CI scripts to be kept here +# Here's only a wrapper around it for the people who are used to it -from github.GithubException import RateLimitExceededException, UnknownObjectException -from github.NamedUser import NamedUser -from thefuzz.fuzz import ratio # type: ignore - -from git_helper import git_runner as runner -from git_helper import is_shallow -from github_helper import GitHub, PullRequest, PullRequests, Repository - -# This array gives the preferred category order, and is also used to -# normalize category names. -# Categories are used in .github/PULL_REQUEST_TEMPLATE.md, keep comments there -# updated accordingly -categories_preferred_order = ( - "Backward Incompatible Change", - "New Feature", - "Performance Improvement", - "Improvement", - "Critical Bug Fix", - "Bug Fix", - "Build/Testing/Packaging Improvement", - "Other", -) - -FROM_REF = "" -TO_REF = "" -SHA_IN_CHANGELOG = [] # type: List[str] -gh = GitHub(create_cache_dir=False) -CACHE_PATH = p.join(p.dirname(p.realpath(__file__)), "gh_cache") - - -class Description: - def __init__( - self, number: int, user: NamedUser, html_url: str, entry: str, category: str - ): - self.number = number - self.html_url = html_url - self.user = gh.get_user_cached(user._rawData["login"]) # type: ignore - self.entry = entry - self.category = category - - @property - def formatted_entry(self) -> str: - # Substitute issue links. - # 1) issue number w/o markdown link - entry = re.sub( - r"([^[])#([0-9]{4,})", - r"\1[#\2](https://github.com/ClickHouse/ClickHouse/issues/\2)", - self.entry, - ) - # 2) issue URL w/o markdown link - # including #issuecomment-1 or #event-12 - entry = re.sub( - r"([^(])(https://github.com/ClickHouse/ClickHouse/issues/([0-9]{4,})[-#a-z0-9]*)", - r"\1[#\3](\2)", - entry, - ) - # It's possible that we face a secondary rate limit. 
- # In this case we should sleep until we get it - while True: - try: - user_name = self.user.name if self.user.name else self.user.login - break - except UnknownObjectException: - user_name = self.user.login - break - except RateLimitExceededException: - gh.sleep_on_rate_limit() - return ( - f"* {entry} [#{self.number}]({self.html_url}) " - f"([{user_name}]({self.user.html_url}))." - ) - - # Sort PR descriptions by numbers - def __eq__(self, other) -> bool: - if not isinstance(self, type(other)): - return NotImplemented - return self.number == other.number - - def __lt__(self, other: "Description") -> bool: - return self.number < other.number - - -def get_descriptions(prs: PullRequests) -> Dict[str, List[Description]]: - descriptions = {} # type: Dict[str, List[Description]] - repos = {} # type: Dict[str, Repository] - for pr in prs: - # See https://github.com/PyGithub/PyGithub/issues/2202, - # obj._rawData doesn't spend additional API requests - # We'll save some requests - # pylint: disable=protected-access - repo_name = pr._rawData["base"]["repo"]["full_name"] - # pylint: enable=protected-access - if repo_name not in repos: - repos[repo_name] = pr.base.repo - in_changelog = False - merge_commit = pr.merge_commit_sha - if merge_commit is None: - logging.warning("PR %s does not have merge-commit, skipping", pr.number) - continue - - in_changelog = merge_commit in SHA_IN_CHANGELOG - if in_changelog: - desc = generate_description(pr, repos[repo_name]) - if desc: - if desc.category not in descriptions: - descriptions[desc.category] = [] - descriptions[desc.category].append(desc) - - for descs in descriptions.values(): - descs.sort() - - return descriptions - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description="Generate a changelog in Markdown format between given tags. " - "It fetches all tags and unshallow the git repository automatically", - ) - parser.add_argument( - "-v", - "--verbose", - action="count", - default=0, - help="set the script verbosity, could be used multiple", - ) - parser.add_argument( - "--debug-helpers", - action="store_true", - help="add debug logging for git_helper and github_helper", - ) - parser.add_argument( - "--output", - type=argparse.FileType("w"), - default="-", - help="output file for changelog", - ) - parser.add_argument( - "--repo", - default="ClickHouse/ClickHouse", - help="a repository to query for pull-requests from GitHub", - ) - parser.add_argument( - "--jobs", - type=int, - default=10, - help="number of jobs to get pull-requests info from GitHub API", - ) - parser.add_argument( - "--gh-user-or-token", - help="user name or GH token to authenticate", - ) - parser.add_argument( - "--gh-password", - help="a password that should be used when user is given", - ) - parser.add_argument( - "--with-testing-tags", - action="store_true", - help="by default '*-testing' tags are ignored, this argument enables them too", - ) - parser.add_argument( - "--from", - dest="from_ref", - help="git ref for a starting point of changelog, by default is calculated " - "automatically to match a previous tag in history", - ) - parser.add_argument( - "to_ref", - metavar="TO_REF", - help="git ref for the changelog end", - ) - args = parser.parse_args() - return args - - -# This function mirrors the PR description checks in ClickhousePullRequestTrigger. -# Returns None if the PR should not be mentioned in changelog. 
-def generate_description(item: PullRequest, repo: Repository) -> Optional[Description]: - backport_number = item.number - if item.head.ref.startswith("backport/"): - branch_parts = item.head.ref.split("/") - if len(branch_parts) == 3: - try: - item = gh.get_pull_cached(repo, int(branch_parts[-1])) - except Exception as e: - logging.warning("unable to get backpoted PR, exception: %s", e) - else: - logging.warning( - "The branch %s doesn't match backport template, using PR %s as is", - item.head.ref, - item.number, - ) - description = item.body - # Don't skip empty lines because they delimit parts of description - lines = [x.strip() for x in (description.split("\n") if description else [])] - lines = [re.sub(r"\s+", " ", ln) for ln in lines] - - category = "" - entry = "" - - if lines: - i = 0 - while i < len(lines): - if re.match(r"(?i)^[#>*_ ]*change\s*log\s*category", lines[i]): - i += 1 - if i >= len(lines): - break - # Can have one empty line between header and the category itself. - # Filter it out. - if not lines[i]: - i += 1 - if i >= len(lines): - break - category = re.sub(r"^[-*\s]*", "", lines[i]) - i += 1 - elif re.match( - r"(?i)^[#>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i] - ): - i += 1 - # Can have one empty line between header and the entry itself. - # Filter it out. - if i < len(lines) and not lines[i]: - i += 1 - # All following lines until empty one are the changelog entry. - entry_lines = [] - while i < len(lines) and lines[i]: - entry_lines.append(lines[i]) - i += 1 - entry = " ".join(entry_lines) - else: - i += 1 - - # Remove excessive bullets from the entry. - if re.match(r"^[\-\*] ", entry): - entry = entry[2:] - - # Better style. - if re.match(r"^[a-z]", entry): - entry = entry.capitalize() - - if not category: - # Shouldn't happen, because description check in CI should catch such PRs. - # Fall through, so that it shows up in output and the user can fix it. - category = "NO CL CATEGORY" - - # Filter out the PR categories that are not for changelog. - if re.match( - r"(?i)((non|in|not|un)[-\s]*significant)|(not[ ]*for[ ]*changelog)", - category, - ): - category = "NOT FOR CHANGELOG / INSIGNIFICANT" - return Description(item.number, item.user, item.html_url, item.title, category) - - # Normalize bug fixes - if re.match( - r"(?i)bug\Wfix", - category, - ): - category = "Bug Fix (user-visible misbehavior in an official stable release)" - - # Filter out documentations changelog - if re.match( - r"(?i)doc", - category, - ): - return None - - if backport_number != item.number: - entry = f"Backported in #{backport_number}: {entry}" - - if not entry: - # Shouldn't happen, because description check in CI should catch such PRs. - category = "NO CL ENTRY" - entry = "NO CL ENTRY: '" + item.title + "'" - - entry = entry.strip() - if entry[-1] != ".": - entry += "." 
- - for c in categories_preferred_order: - if ratio(category.lower(), c.lower()) >= 90: - category = c - break - - return Description(item.number, item.user, item.html_url, entry, category) - - -def write_changelog( - fd: TextIO, descriptions: Dict[str, List[Description]], year: int -) -> None: - to_commit = runner(f"git rev-parse {TO_REF}^{{}}")[:11] - from_commit = runner(f"git rev-parse {FROM_REF}^{{}}")[:11] - fd.write( - f"---\nsidebar_position: 1\nsidebar_label: {year}\n---\n\n" - f"# {year} Changelog\n\n" - f"### ClickHouse release {TO_REF} ({to_commit}) FIXME " - f"as compared to {FROM_REF} ({from_commit})\n\n" - ) - - seen_categories = [] # type: List[str] - for category in categories_preferred_order: - if category in descriptions: - seen_categories.append(category) - fd.write(f"#### {category}\n") - for desc in descriptions[category]: - fd.write(f"{desc.formatted_entry}\n") - - fd.write("\n") - - for category in sorted(descriptions): - if category not in seen_categories: - fd.write(f"#### {category}\n\n") - for desc in descriptions[category]: - fd.write(f"{desc.formatted_entry}\n") - - fd.write("\n") - - -def check_refs(from_ref: Optional[str], to_ref: str, with_testing_tags: bool): - global FROM_REF, TO_REF - TO_REF = to_ref - - # Check TO_REF - runner.run(f"git rev-parse {TO_REF}") - - # Check from_ref - if from_ref is None: - # Get all tags pointing to TO_REF - tags = runner.run(f"git tag --points-at '{TO_REF}^{{}}'").split("\n") - logging.info("All tags pointing to %s:\n%s", TO_REF, tags) - if not with_testing_tags: - tags.append("*-testing") - exclude = " ".join([f"--exclude='{tag}'" for tag in tags]) - cmd = f"git describe --abbrev=0 --tags {exclude} '{TO_REF}'" - FROM_REF = runner.run(cmd) - else: - runner.run(f"git rev-parse {FROM_REF}") - FROM_REF = from_ref - - -def set_sha_in_changelog(): - global SHA_IN_CHANGELOG - SHA_IN_CHANGELOG = runner.run( - f"git log --format=format:%H {FROM_REF}..{TO_REF}" - ).split("\n") - - -def get_year(prs: PullRequests) -> int: - if not prs: - return date.today().year - return max(pr.created_at.year for pr in prs) - - -def main(): - log_levels = [logging.WARN, logging.INFO, logging.DEBUG] - args = parse_args() - logging.basicConfig( - format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d]:\n%(message)s", - level=log_levels[min(args.verbose, 2)], - ) - if args.debug_helpers: - logging.getLogger("github_helper").setLevel(logging.DEBUG) - logging.getLogger("git_helper").setLevel(logging.DEBUG) - # Create a cache directory - if not p.isdir(CACHE_PATH): - os.mkdir(CACHE_PATH, 0o700) - - # Get the full repo - if is_shallow(): - logging.info("Unshallow repository") - runner.run("git fetch --unshallow", stderr=DEVNULL) - logging.info("Fetching all tags") - runner.run("git fetch --tags", stderr=DEVNULL) - - check_refs(args.from_ref, args.to_ref, args.with_testing_tags) - set_sha_in_changelog() - - logging.info("Using %s..%s as changelog interval", FROM_REF, TO_REF) - - # use merge-base commit as a starting point, if used ref in another branch - base_commit = runner.run(f"git merge-base '{FROM_REF}^{{}}' '{TO_REF}^{{}}'") - # Get starting and ending dates for gathering PRs - # Add one day after and before to mitigate TZ possible issues - # `tag^{}` format gives commit ref when we have annotated tags - # format %cs gives a committer date, works better for cherry-picked commits - from_date = runner.run(f"git log -1 --format=format:%cs '{base_commit}'") - to_date = runner.run(f"git log -1 --format=format:%cs '{TO_REF}^{{}}'") - merged = ( - 
date.fromisoformat(from_date) - timedelta(1), - date.fromisoformat(to_date) + timedelta(1), - ) - - # Get all PRs for the given time frame - global gh - gh = GitHub( - args.gh_user_or_token, - args.gh_password, - create_cache_dir=False, - per_page=100, - pool_size=args.jobs, - ) - gh.cache_path = CACHE_PATH - query = f"type:pr repo:{args.repo} is:merged" - prs = gh.get_pulls_from_search(query=query, merged=merged, sort="created") - - descriptions = get_descriptions(prs) - changelog_year = get_year(prs) - - write_changelog(args.output, descriptions, changelog_year) +import subprocess +import sys +from pathlib import Path +SCRIPT_PATH = (Path(__file__).parents[2] / "tests/ci/changelog.py").absolute() if __name__ == "__main__": - main() + subprocess.check_call(["python3", SCRIPT_PATH, *sys.argv[1:]]) diff --git a/utils/changelog/git_helper.py b/utils/changelog/git_helper.py deleted file mode 120000 index 03b05a7eddd..00000000000 --- a/utils/changelog/git_helper.py +++ /dev/null @@ -1 +0,0 @@ -../../tests/ci/git_helper.py \ No newline at end of file diff --git a/utils/changelog/github_helper.py b/utils/changelog/github_helper.py deleted file mode 120000 index 2d44dfe8000..00000000000 --- a/utils/changelog/github_helper.py +++ /dev/null @@ -1 +0,0 @@ -../../tests/ci/github_helper.py \ No newline at end of file diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 21e0ec3e40d..84682689934 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -222,6 +222,7 @@ DatabaseOrdinaryThreadsActive DateTime DateTimes DbCL +deallocated Decrypted Deduplicate Deduplication @@ -293,6 +294,7 @@ FilesystemMainPathUsedBytes FilesystemMainPathUsedINodes FixedString FlameGraph +flameGraph Flink ForEach FreeBSD diff --git a/utils/check-style/check-style b/utils/check-style/check-style index 5c05907e9dd..db491c67f2c 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -240,8 +240,22 @@ for test_case in "${tests_with_replicated_merge_tree[@]}"; do esac done -# All the submodules should be from https://github.com/ -find $ROOT_PATH -name '.gitmodules' | while read i; do grep -F 'url = ' $i | grep -v -F 'https://github.com/' && echo 'All the submodules should be from https://github.com/'; done +# All submodules should be from https://github.com/ +git config --file "$ROOT_PATH/.gitmodules" --get-regexp 'submodule\..+\.url' | \ +while read -r line; do + name=${line#submodule.}; name=${name%.url*} + url=${line#* } + [[ "$url" != 'https://github.com/'* ]] && echo "All submodules should be from https://github.com/, submodule '$name' has '$url'" +done + +# All submodules should be of this form: [submodule "contrib/libxyz"] (for consistency, the submodule name doesn't matter too much) +# - restrict the check to top-level .gitmodules file +git config --file "$ROOT_PATH/.gitmodules" --get-regexp 'submodule\..+\.path' | \ +while read -r line; do + name=${line#submodule.}; name=${name%.path*} + path=${line#* } + [ "$name" != "$path" ] && echo "Submodule name '$name' is not equal to its path '$path'" +done # There shouldn't be any code snippets under GPL or LGPL find $ROOT_PATH/{src,base,programs} -name '*.h' -or -name '*.cpp' 2>/dev/null | xargs grep -i -F 'General Public License' && echo "There shouldn't be any code snippets under GPL or LGPL" diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index f7d84cce4b1..2f96daf4887 100644 
--- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,4 +1,5 @@ v24.5.1.1763-stable 2024-06-01 +v24.4.2.141-stable 2024-06-07 v24.4.1.2088-stable 2024-05-01 v24.3.3.102-lts 2024-05-01 v24.3.2.23-lts 2024-04-03
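
For reference, a minimal sketch of how the new submodule-name consistency check in utils/check-style/check-style processes a single .gitmodules entry; the submodule name 'foo' and path 'contrib/foo' below are hypothetical and only illustrate the shell parameter expansions, they are not taken from the patch.

#!/usr/bin/env bash
# `git config --file .gitmodules --get-regexp 'submodule\..+\.path'` prints one line
# per submodule in the form: submodule.<name>.path <path>
line='submodule.foo.path contrib/foo'  # hypothetical sample line
name=${line#submodule.}                # strip the 'submodule.' prefix -> 'foo.path contrib/foo'
name=${name%.path*}                    # strip the '.path <path>' suffix -> 'foo'
path=${line#* }                        # keep everything after the first space -> 'contrib/foo'
# The check expects the section name to equal the path; here they differ, so it reports:
[ "$name" != "$path" ] && echo "Submodule name '$name' is not equal to its path '$path'"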