diff --git a/.github/workflows/backport_branches.yml b/.github/workflows/backport_branches.yml index 6b05f1fe9f4..51670087ffe 100644 --- a/.github/workflows/backport_branches.yml +++ b/.github/workflows/backport_branches.yml @@ -11,7 +11,7 @@ on: # yamllint disable-line rule:truthy - 'backport/**' jobs: RunConfig: - runs-on: [self-hosted, style-checker] + runs-on: [self-hosted, style-checker-aarch64] outputs: data: ${{ steps.runconfig.outputs.CI_DATA }} steps: diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 209995bfbdd..7cb5455ed73 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -11,7 +11,7 @@ on: # yamllint disable-line rule:truthy - 'master' jobs: RunConfig: - runs-on: [self-hosted, style-checker] + runs-on: [self-hosted, style-checker-aarch64] outputs: data: ${{ steps.runconfig.outputs.CI_DATA }} steps: @@ -327,6 +327,7 @@ jobs: run_command: | python3 build_report_check.py "$CHECK_NAME" MarkReleaseReady: + if: ${{ ! (contains(needs.*.result, 'skipped') || contains(needs.*.result, 'failure')) }} needs: - BuilderBinDarwin - BuilderBinDarwinAarch64 @@ -374,14 +375,6 @@ jobs: test_name: Stateless tests (release) runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatelessTestCoverage: - needs: [RunConfig, BuilderDebReleaseCoverage] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateless tests (coverage) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} FunctionalStatelessTestReleaseDatabaseReplicated: needs: [RunConfig, BuilderDebRelease] if: ${{ !failure() && !cancelled() }} @@ -482,14 +475,6 @@ jobs: test_name: Stateful tests (release) runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatefulTestCoverage: - needs: [RunConfig, BuilderDebReleaseCoverage] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateful tests (coverage) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} FunctionalStatefulTestAarch64: needs: [RunConfig, BuilderDebAarch64] if: ${{ !failure() && !cancelled() }} diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 770e1ec3789..93ac2be19b4 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -14,7 +14,7 @@ jobs: # The task for having a preserved ENV and event.json for later investigation uses: ./.github/workflows/debug.yml RunConfig: - runs-on: [self-hosted, style-checker] + runs-on: [self-hosted, style-checker-aarch64] outputs: data: ${{ steps.runconfig.outputs.CI_DATA }} steps: diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 405e1ec1502..1afcdab938b 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -18,7 +18,7 @@ on: # yamllint disable-line rule:truthy ########################################################################################## jobs: RunConfig: - runs-on: [self-hosted, style-checker] + runs-on: [self-hosted, style-checker-aarch64] outputs: data: ${{ steps.runconfig.outputs.CI_DATA }} steps: @@ -391,14 +391,6 @@ jobs: test_name: Stateless tests (release) runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatelessTestCoverage: - needs: [RunConfig, BuilderDebReleaseCoverage] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateless tests (coverage) - 
runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} FunctionalStatelessTestReleaseDatabaseReplicated: needs: [RunConfig, BuilderDebRelease] if: ${{ !failure() && !cancelled() }} @@ -500,21 +492,9 @@ jobs: if: ${{ !failure() && !cancelled() }} uses: ./.github/workflows/reusable_test.yml with: - test_name: tests bugfix validate check + test_name: Bugfix validation runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} - additional_envs: | - KILL_TIMEOUT=3600 - run_command: | - TEMP_PATH="${TEMP_PATH}/integration" \ - python3 integration_test_check.py "Integration $CHECK_NAME" \ - --validate-bugfix --post-commit-status=file || echo "ignore exit code" - - TEMP_PATH="${TEMP_PATH}/stateless" \ - python3 functional_test_check.py "Stateless $CHECK_NAME" "$KILL_TIMEOUT" \ - --validate-bugfix --post-commit-status=file || echo "ignore exit code" - - python3 bugfix_validate_check.py "${TEMP_PATH}/stateless/functional_commit_status.tsv" "${TEMP_PATH}/integration/integration_commit_status.tsv" ############################################################################################## ############################ FUNCTIONAl STATEFUL TESTS ####################################### ############################################################################################## @@ -526,14 +506,6 @@ jobs: test_name: Stateful tests (release) runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} - FunctionalStatefulTestCoverage: - needs: [RunConfig, BuilderDebReleaseCoverage] - if: ${{ !failure() && !cancelled() }} - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Stateful tests (coverage) - runner_type: func-tester - data: ${{ needs.RunConfig.outputs.data }} FunctionalStatefulTestAarch64: needs: [RunConfig, BuilderDebAarch64] if: ${{ !failure() && !cancelled() }} @@ -915,6 +887,7 @@ jobs: - BuilderSpecialReport - DocsCheck - FastTest + - TestsBugfixCheck - FunctionalStatelessTestDebug - FunctionalStatelessTestRelease - FunctionalStatelessTestReleaseDatabaseReplicated diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 6eb787e6453..57e90d79ebd 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -14,7 +14,7 @@ on: # yamllint disable-line rule:truthy jobs: RunConfig: - runs-on: [self-hosted, style-checker] + runs-on: [self-hosted, style-checker-aarch64] outputs: data: ${{ steps.runconfig.outputs.CI_DATA }} steps: @@ -228,6 +228,7 @@ jobs: run_command: | python3 build_report_check.py "$CHECK_NAME" MarkReleaseReady: + if: ${{ ! 
(contains(needs.*.result, 'skipped') || contains(needs.*.result, 'failure')) }} needs: - BuilderBinDarwin - BuilderBinDarwinAarch64 diff --git a/.gitmessage b/.gitmessage index 200d19e774d..760cfec97a4 100644 --- a/.gitmessage +++ b/.gitmessage @@ -1,6 +1,6 @@ -### CI modificators (add a leading space to apply): +### CI modificators (add a leading space to apply) ### ## To avoid a merge commit in CI: #no_merge_commit @@ -8,13 +8,21 @@ ## To discard CI cache: #no_ci_cache +## To not test (only style check): +#do_not_test + ## To run specified set of tests in CI: #ci_set_ #ci_set_reduced #ci_set_arm +#ci_set_integration ## To run specified job in CI: #job_ #job_stateless_tests_release #job_package_debug #job_integration_tests_asan + +## To run only specified batches for multi-batch job(s) +#batch_2 +#btach_1_2_3 diff --git a/base/base/Decimal_fwd.h b/base/base/Decimal_fwd.h index 589d6224917..beb228cea3c 100644 --- a/base/base/Decimal_fwd.h +++ b/base/base/Decimal_fwd.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace wide { @@ -44,3 +45,8 @@ concept is_over_big_int = || std::is_same_v || std::is_same_v; } + +template <> struct is_signed { static constexpr bool value = true; }; +template <> struct is_signed { static constexpr bool value = true; }; +template <> struct is_signed { static constexpr bool value = true; }; +template <> struct is_signed { static constexpr bool value = true; }; diff --git a/contrib/curl b/contrib/curl index 7161cb17c01..5ce164e0e92 160000 --- a/contrib/curl +++ b/contrib/curl @@ -1 +1 @@ -Subproject commit 7161cb17c01dcff1dc5bf89a18437d9d729f1ecd +Subproject commit 5ce164e0e9290c96eb7d502173426c0a135ec008 diff --git a/contrib/libssh b/contrib/libssh index 2c76332ef56..ed4011b9187 160000 --- a/contrib/libssh +++ b/contrib/libssh @@ -1 +1 @@ -Subproject commit 2c76332ef56d90f55965ab24da6b6dbcbef29c4c +Subproject commit ed4011b91873836713576475a98cd132cd834539 diff --git a/contrib/libssh-cmake/CMakeLists.txt b/contrib/libssh-cmake/CMakeLists.txt index eee3df832fa..7b589718140 100644 --- a/contrib/libssh-cmake/CMakeLists.txt +++ b/contrib/libssh-cmake/CMakeLists.txt @@ -8,24 +8,12 @@ endif() set(LIB_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/libssh") set(LIB_BINARY_DIR "${ClickHouse_BINARY_DIR}/contrib/libssh") -project(libssh VERSION 0.9.7 LANGUAGES C) +# Set CMake variables which are used in libssh_version.h.cmake +project(libssh VERSION 0.9.8 LANGUAGES C) -# global needed variable -set(APPLICATION_NAME ${PROJECT_NAME}) - -# SOVERSION scheme: CURRENT.AGE.REVISION -# If there was an incompatible interface change: -# Increment CURRENT. Set AGE and REVISION to 0 -# If there was a compatible interface change: -# Increment AGE. Set REVISION to 0 -# If the source code was changed, but there were no interface changes: -# Increment REVISION. 
-set(LIBRARY_VERSION "4.8.7") +set(LIBRARY_VERSION "4.8.8") set(LIBRARY_SOVERSION "4") -# Copy library files to a lib sub-directory -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${LIB_BINARY_DIR}/lib") - set(CMAKE_THREAD_PREFER_PTHREADS ON) set(THREADS_PREFER_PTHREAD_FLAG ON) @@ -33,7 +21,87 @@ set(WITH_ZLIB OFF) set(WITH_SYMBOL_VERSIONING OFF) set(WITH_SERVER ON) -include(IncludeSources.cmake) +set(libssh_SRCS + ${LIB_SOURCE_DIR}/src/agent.c + ${LIB_SOURCE_DIR}/src/auth.c + ${LIB_SOURCE_DIR}/src/base64.c + ${LIB_SOURCE_DIR}/src/bignum.c + ${LIB_SOURCE_DIR}/src/buffer.c + ${LIB_SOURCE_DIR}/src/callbacks.c + ${LIB_SOURCE_DIR}/src/channels.c + ${LIB_SOURCE_DIR}/src/client.c + ${LIB_SOURCE_DIR}/src/config.c + ${LIB_SOURCE_DIR}/src/connect.c + ${LIB_SOURCE_DIR}/src/connector.c + ${LIB_SOURCE_DIR}/src/curve25519.c + ${LIB_SOURCE_DIR}/src/dh.c + ${LIB_SOURCE_DIR}/src/ecdh.c + ${LIB_SOURCE_DIR}/src/error.c + ${LIB_SOURCE_DIR}/src/getpass.c + ${LIB_SOURCE_DIR}/src/init.c + ${LIB_SOURCE_DIR}/src/kdf.c + ${LIB_SOURCE_DIR}/src/kex.c + ${LIB_SOURCE_DIR}/src/known_hosts.c + ${LIB_SOURCE_DIR}/src/knownhosts.c + ${LIB_SOURCE_DIR}/src/legacy.c + ${LIB_SOURCE_DIR}/src/log.c + ${LIB_SOURCE_DIR}/src/match.c + ${LIB_SOURCE_DIR}/src/messages.c + ${LIB_SOURCE_DIR}/src/misc.c + ${LIB_SOURCE_DIR}/src/options.c + ${LIB_SOURCE_DIR}/src/packet.c + ${LIB_SOURCE_DIR}/src/packet_cb.c + ${LIB_SOURCE_DIR}/src/packet_crypt.c + ${LIB_SOURCE_DIR}/src/pcap.c + ${LIB_SOURCE_DIR}/src/pki.c + ${LIB_SOURCE_DIR}/src/pki_container_openssh.c + ${LIB_SOURCE_DIR}/src/poll.c + ${LIB_SOURCE_DIR}/src/session.c + ${LIB_SOURCE_DIR}/src/scp.c + ${LIB_SOURCE_DIR}/src/socket.c + ${LIB_SOURCE_DIR}/src/string.c + ${LIB_SOURCE_DIR}/src/threads.c + ${LIB_SOURCE_DIR}/src/wrapper.c + ${LIB_SOURCE_DIR}/src/external/bcrypt_pbkdf.c + ${LIB_SOURCE_DIR}/src/external/blowfish.c + ${LIB_SOURCE_DIR}/src/external/chacha.c + ${LIB_SOURCE_DIR}/src/external/poly1305.c + ${LIB_SOURCE_DIR}/src/chachapoly.c + ${LIB_SOURCE_DIR}/src/config_parser.c + ${LIB_SOURCE_DIR}/src/token.c + ${LIB_SOURCE_DIR}/src/pki_ed25519_common.c + + ${LIB_SOURCE_DIR}/src/threads/noop.c + ${LIB_SOURCE_DIR}/src/threads/pthread.c + + # LIBCRYPT specific + ${libssh_SRCS} + ${LIB_SOURCE_DIR}/src/threads/libcrypto.c + ${LIB_SOURCE_DIR}/src/pki_crypto.c + ${LIB_SOURCE_DIR}/src/ecdh_crypto.c + ${LIB_SOURCE_DIR}/src/libcrypto.c + ${LIB_SOURCE_DIR}/src/dh_crypto.c + + ${LIB_SOURCE_DIR}/src/options.c + ${LIB_SOURCE_DIR}/src/server.c + ${LIB_SOURCE_DIR}/src/bind.c + ${LIB_SOURCE_DIR}/src/bind_config.c +) + +if (NOT (ENABLE_OPENSSL OR ENABLE_OPENSSL_DYNAMIC)) + add_compile_definitions(USE_BORINGSSL=1) +endif() + +configure_file(${LIB_SOURCE_DIR}/include/libssh/libssh_version.h.cmake ${LIB_BINARY_DIR}/include/libssh/libssh_version.h @ONLY) + +add_library(_ssh STATIC ${libssh_SRCS}) +add_library(ch_contrib::ssh ALIAS _ssh) + +target_link_libraries(_ssh PRIVATE OpenSSL::Crypto) + +target_include_directories(_ssh PUBLIC "${LIB_SOURCE_DIR}/include" "${LIB_BINARY_DIR}/include") + +# These headers need to be generated using the native build system on each platform. 
if (OS_LINUX) if (ARCH_AMD64) if (USE_MUSL) @@ -63,7 +131,3 @@ elseif (OS_FREEBSD) else () message(FATAL_ERROR "Platform is not supported") endif() - -configure_file(${LIB_SOURCE_DIR}/include/libssh/libssh_version.h.cmake - ${LIB_BINARY_DIR}/include/libssh/libssh_version.h - @ONLY) diff --git a/contrib/libssh-cmake/IncludeSources.cmake b/contrib/libssh-cmake/IncludeSources.cmake deleted file mode 100644 index 30348d5d7dd..00000000000 --- a/contrib/libssh-cmake/IncludeSources.cmake +++ /dev/null @@ -1,98 +0,0 @@ -set(LIBSSH_LINK_LIBRARIES - ${LIBSSH_LINK_LIBRARIES} - OpenSSL::Crypto -) - -set(libssh_SRCS - ${LIB_SOURCE_DIR}/src/agent.c - ${LIB_SOURCE_DIR}/src/auth.c - ${LIB_SOURCE_DIR}/src/base64.c - ${LIB_SOURCE_DIR}/src/bignum.c - ${LIB_SOURCE_DIR}/src/buffer.c - ${LIB_SOURCE_DIR}/src/callbacks.c - ${LIB_SOURCE_DIR}/src/channels.c - ${LIB_SOURCE_DIR}/src/client.c - ${LIB_SOURCE_DIR}/src/config.c - ${LIB_SOURCE_DIR}/src/connect.c - ${LIB_SOURCE_DIR}/src/connector.c - ${LIB_SOURCE_DIR}/src/curve25519.c - ${LIB_SOURCE_DIR}/src/dh.c - ${LIB_SOURCE_DIR}/src/ecdh.c - ${LIB_SOURCE_DIR}/src/error.c - ${LIB_SOURCE_DIR}/src/getpass.c - ${LIB_SOURCE_DIR}/src/init.c - ${LIB_SOURCE_DIR}/src/kdf.c - ${LIB_SOURCE_DIR}/src/kex.c - ${LIB_SOURCE_DIR}/src/known_hosts.c - ${LIB_SOURCE_DIR}/src/knownhosts.c - ${LIB_SOURCE_DIR}/src/legacy.c - ${LIB_SOURCE_DIR}/src/log.c - ${LIB_SOURCE_DIR}/src/match.c - ${LIB_SOURCE_DIR}/src/messages.c - ${LIB_SOURCE_DIR}/src/misc.c - ${LIB_SOURCE_DIR}/src/options.c - ${LIB_SOURCE_DIR}/src/packet.c - ${LIB_SOURCE_DIR}/src/packet_cb.c - ${LIB_SOURCE_DIR}/src/packet_crypt.c - ${LIB_SOURCE_DIR}/src/pcap.c - ${LIB_SOURCE_DIR}/src/pki.c - ${LIB_SOURCE_DIR}/src/pki_container_openssh.c - ${LIB_SOURCE_DIR}/src/poll.c - ${LIB_SOURCE_DIR}/src/session.c - ${LIB_SOURCE_DIR}/src/scp.c - ${LIB_SOURCE_DIR}/src/socket.c - ${LIB_SOURCE_DIR}/src/string.c - ${LIB_SOURCE_DIR}/src/threads.c - ${LIB_SOURCE_DIR}/src/wrapper.c - ${LIB_SOURCE_DIR}/src/external/bcrypt_pbkdf.c - ${LIB_SOURCE_DIR}/src/external/blowfish.c - ${LIB_SOURCE_DIR}/src/external/chacha.c - ${LIB_SOURCE_DIR}/src/external/poly1305.c - ${LIB_SOURCE_DIR}/src/chachapoly.c - ${LIB_SOURCE_DIR}/src/config_parser.c - ${LIB_SOURCE_DIR}/src/token.c - ${LIB_SOURCE_DIR}/src/pki_ed25519_common.c -) - -set(libssh_SRCS - ${libssh_SRCS} - ${LIB_SOURCE_DIR}/src/threads/noop.c - ${LIB_SOURCE_DIR}/src/threads/pthread.c -) - -# LIBCRYPT specific -set(libssh_SRCS - ${libssh_SRCS} - ${LIB_SOURCE_DIR}/src/threads/libcrypto.c - ${LIB_SOURCE_DIR}/src/pki_crypto.c - ${LIB_SOURCE_DIR}/src/ecdh_crypto.c - ${LIB_SOURCE_DIR}/src/libcrypto.c - ${LIB_SOURCE_DIR}/src/dh_crypto.c -) - -if (NOT (ENABLE_OPENSSL OR ENABLE_OPENSSL_DYNAMIC)) - add_compile_definitions(USE_BORINGSSL=1) -endif() - -set(libssh_SRCS -${libssh_SRCS} -${LIB_SOURCE_DIR}/src/options.c -${LIB_SOURCE_DIR}/src/server.c -${LIB_SOURCE_DIR}/src/bind.c -${LIB_SOURCE_DIR}/src/bind_config.c -) - - -add_library(_ssh STATIC ${libssh_SRCS}) - -target_include_directories(_ssh PRIVATE ${LIB_BINARY_DIR}) -target_include_directories(_ssh PUBLIC "${LIB_SOURCE_DIR}/include" "${LIB_BINARY_DIR}/include") -target_link_libraries(_ssh - PRIVATE ${LIBSSH_LINK_LIBRARIES}) - -add_library(ch_contrib::ssh ALIAS _ssh) - -target_compile_options(_ssh - PRIVATE - ${DEFAULT_C_COMPILE_FLAGS} - -D_GNU_SOURCE) diff --git a/contrib/lz4 b/contrib/lz4 index 92ebf1870b9..ce45a9dbdb0 160000 --- a/contrib/lz4 +++ b/contrib/lz4 @@ -1 +1 @@ -Subproject commit 92ebf1870b9acbefc0e7970409a181954a10ff40 +Subproject commit 
ce45a9dbdb059511a3e9576b19db3e7f1a4f172e diff --git a/docker/keeper/Dockerfile b/docker/keeper/Dockerfile index ab1bc58da1b..d39ca312454 100644 --- a/docker/keeper/Dockerfile +++ b/docker/keeper/Dockerfile @@ -34,7 +34,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="24.1.4.20" +ARG VERSION="24.1.5.6" ARG PACKAGES="clickhouse-keeper" ARG DIRECT_DOWNLOAD_URLS="" diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index 39187781a86..2d07937ad79 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="24.1.4.20" +ARG VERSION="24.1.5.6" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" ARG DIRECT_DOWNLOAD_URLS="" diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index 67ea2656310..d4775b17319 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -27,7 +27,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list ARG REPO_CHANNEL="stable" ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb ${REPO_CHANNEL} main" -ARG VERSION="24.1.4.20" +ARG VERSION="24.1.5.6" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # set non-empty deb_location_url url to create a docker image diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index 9e854dce65a..8858e12c50e 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -190,7 +190,7 @@ function setup_logs_replication echo -e "Creating remote destination table ${table}_${hash} with statement:\n${statement}" >&2 echo "$statement" | clickhouse-client --database_replicated_initial_query_timeout_sec=10 \ - --distributed_ddl_task_timeout=30 \ + --distributed_ddl_task_timeout=30 --distributed_ddl_output_mode=throw_only_active \ "${CONNECTION_ARGS[@]}" || continue echo "Creating table system.${table}_sender" >&2 diff --git a/docker/test/integration/runner/compose/docker_compose_mysql_cluster.yml b/docker/test/integration/runner/compose/docker_compose_mysql_cluster.yml index 73f9e39f0d6..079c451b9d6 100644 --- a/docker/test/integration/runner/compose/docker_compose_mysql_cluster.yml +++ b/docker/test/integration/runner/compose/docker_compose_mysql_cluster.yml @@ -1,7 +1,7 @@ version: '2.3' services: mysql2: - image: mysql:5.7 + image: mysql:8.0 restart: always environment: MYSQL_ROOT_PASSWORD: clickhouse @@ -23,7 +23,7 @@ services: source: ${MYSQL_CLUSTER_LOGS:-} target: /mysql/ mysql3: - image: mysql:5.7 + image: mysql:8.0 restart: always environment: MYSQL_ROOT_PASSWORD: clickhouse @@ -45,7 +45,7 @@ services: source: ${MYSQL_CLUSTER_LOGS:-} target: /mysql/ mysql4: - image: mysql:5.7 + image: mysql:8.0 restart: always environment: MYSQL_ROOT_PASSWORD: clickhouse diff --git a/docs/changelogs/v24.1.5.6-stable.md b/docs/changelogs/v24.1.5.6-stable.md new file mode 100644 index 00000000000..ce46c51e2f4 --- /dev/null +++ b/docs/changelogs/v24.1.5.6-stable.md @@ -0,0 +1,17 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.1.5.6-stable (7f67181ff31) FIXME as compared to 
v24.1.4.20-stable (f59d842b3fa) + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* UniqExactSet read crash fix [#59928](https://github.com/ClickHouse/ClickHouse/pull/59928) ([Maksim Kita](https://github.com/kitaisreal)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* CI: do not reuse builds on release branches [#59798](https://github.com/ClickHouse/ClickHouse/pull/59798) ([Max K.](https://github.com/maxknv)). + diff --git a/docs/en/development/architecture.md b/docs/en/development/architecture.md index cfdd2bbcc41..d3a29c9171b 100644 --- a/docs/en/development/architecture.md +++ b/docs/en/development/architecture.md @@ -166,11 +166,11 @@ For most external applications, we recommend using the HTTP interface because it ## Configuration {#configuration} -ClickHouse Server is based on POCO C++ Libraries and uses `Poco::Util::AbstractConfiguration` to represent it's configuration. Configuration is held by `Poco::Util::ServerApplication` class inherited by `DaemonBase` class, which in turn is inherited by `DB::Server` class, implementing clickhouse-server itself. So config can be accessed by `ServerApplication::config()` method. +ClickHouse Server is based on POCO C++ Libraries and uses `Poco::Util::AbstractConfiguration` to represent its configuration. Configuration is held by `Poco::Util::ServerApplication` class inherited by `DaemonBase` class, which in turn is inherited by `DB::Server` class, implementing clickhouse-server itself. So config can be accessed by `ServerApplication::config()` method. Config is read from multiple files (in XML or YAML format) and merged into single `AbstractConfiguration` by `ConfigProcessor` class. Configuration is loaded at server startup and can be reloaded later if one of config files is updated, removed or added. `ConfigReloader` class is responsible for periodic monitoring of these changes and reload procedure as well. `SYSTEM RELOAD CONFIG` query also triggers config to be reloaded. -For queries and subsystems other than `Server` config is accessible using `Context::getConfigRef()` method. Every subsystem that is capable of reloading it's config without server restart should register itself in reload callback in `Server::main()` method. Note that if newer config has an error, most subsystems will ignore new config, log warning messages and keep working with previously loaded config. Due to the nature of `AbstractConfiguration` it is not possible to pass reference to specific section, so `String config_prefix` is usually used instead. +For queries and subsystems other than `Server` config is accessible using `Context::getConfigRef()` method. Every subsystem that is capable of reloading its config without server restart should register itself in reload callback in `Server::main()` method. Note that if newer config has an error, most subsystems will ignore new config, log warning messages and keep working with previously loaded config. Due to the nature of `AbstractConfiguration` it is not possible to pass reference to specific section, so `String config_prefix` is usually used instead. ## Threads and jobs {#threads-and-jobs} @@ -255,7 +255,7 @@ When we are going to read something from a part in `MergeTree`, we look at `prim When you `INSERT` a bunch of data into `MergeTree`, that bunch is sorted by primary key order and forms a new part. There are background threads that periodically select some parts and merge them into a single sorted part to keep the number of parts relatively low. That’s why it is called `MergeTree`. 
Of course, merging leads to “write amplification”. All parts are immutable: they are only created and deleted, but not modified. When SELECT is executed, it holds a snapshot of the table (a set of parts). After merging, we also keep old parts for some time to make a recovery after failure easier, so if we see that some merged part is probably broken, we can replace it with its source parts. -`MergeTree` is not an LSM tree because it does not contain MEMTABLE and LOG: inserted data is written directly to the filesystem. This behavior makes MergeTree much more suitable to insert data in batches. Therefore frequently inserting small amounts of rows is not ideal for MergeTree. For example, a couple of rows per second is OK, but doing it a thousand times a second is not optimal for MergeTree. However, there is an async insert mode for small inserts to overcome this limitation. We did it this way for simplicity’s sake, and because we are already inserting data in batches in our applications +`MergeTree` is not an LSM tree because it does not contain MEMTABLE and LOG: inserted data is written directly to the filesystem. This behavior makes MergeTree much more suitable to insert data in batches. Therefore, frequently inserting small amounts of rows is not ideal for MergeTree. For example, a couple of rows per second is OK, but doing it a thousand times a second is not optimal for MergeTree. However, there is an async insert mode for small inserts to overcome this limitation. We did it this way for simplicity’s sake, and because we are already inserting data in batches in our applications There are MergeTree engines that are doing additional work during background merges. Examples are `CollapsingMergeTree` and `AggregatingMergeTree`. This could be treated as special support for updates. Keep in mind that these are not real updates because users usually have no control over the time when background merges are executed, and data in a `MergeTree` table is almost always stored in more than one part, not in completely merged form. diff --git a/docs/en/development/build-cross-s390x.md b/docs/en/development/build-cross-s390x.md index b7cda515d77..a4a83c7989b 100644 --- a/docs/en/development/build-cross-s390x.md +++ b/docs/en/development/build-cross-s390x.md @@ -38,7 +38,7 @@ ninja ## Running -Once built, the binary can be run with, eg.: +Once built, the binary can be run with, e.g.: ```bash qemu-s390x-static -L /usr/s390x-linux-gnu ./clickhouse diff --git a/docs/en/development/building_and_benchmarking_deflate_qpl.md b/docs/en/development/building_and_benchmarking_deflate_qpl.md index 4e01b41ab3c..b9d39b8cc2d 100644 --- a/docs/en/development/building_and_benchmarking_deflate_qpl.md +++ b/docs/en/development/building_and_benchmarking_deflate_qpl.md @@ -95,7 +95,7 @@ Complete below three steps mentioned in [Star Schema Benchmark](https://clickhou - Inserting data. Here should use `./benchmark_sample/rawdata_dir/ssb-dbgen/*.tbl` as input data. 
- Converting “star schema” to de-normalized “flat schema” -Set up database with with IAA Deflate codec +Set up database with IAA Deflate codec ``` bash $ cd ./database_dir/deflate @@ -104,7 +104,7 @@ $ [CLICKHOUSE_EXE] client ``` Complete three steps same as lz4 above -Set up database with with ZSTD codec +Set up database with ZSTD codec ``` bash $ cd ./database_dir/zstd diff --git a/docs/en/development/contrib.md b/docs/en/development/contrib.md index 4b296c43db4..bbc5fbeebcb 100644 --- a/docs/en/development/contrib.md +++ b/docs/en/development/contrib.md @@ -13,7 +13,7 @@ ClickHouse utilizes third-party libraries for different purposes, e.g., to conne SELECT library_name, license_type, license_path FROM system.licenses ORDER BY library_name COLLATE 'en'; ``` -(Note that the listed libraries are the ones located in the `contrib/` directory of the ClickHouse repository. Depending on the build options, some of of the libraries may have not been compiled, and as a result, their functionality may not be available at runtime. +Note that the listed libraries are the ones located in the `contrib/` directory of the ClickHouse repository. Depending on the build options, some of the libraries may have not been compiled, and as a result, their functionality may not be available at runtime. [Example](https://play.clickhouse.com/play?user=play#U0VMRUNUIGxpYnJhcnlfbmFtZSwgbGljZW5zZV90eXBlLCBsaWNlbnNlX3BhdGggRlJPTSBzeXN0ZW0ubGljZW5zZXMgT1JERVIgQlkgbGlicmFyeV9uYW1lIENPTExBVEUgJ2VuJw==) diff --git a/docs/en/development/developer-instruction.md b/docs/en/development/developer-instruction.md index 31346c77949..e08096d8042 100644 --- a/docs/en/development/developer-instruction.md +++ b/docs/en/development/developer-instruction.md @@ -7,13 +7,13 @@ description: Prerequisites and an overview of how to build ClickHouse # Getting Started Guide for Building ClickHouse -ClickHouse can be build on Linux, FreeBSD and macOS. If you use Windows, you can still build ClickHouse in a virtual machine running Linux, e.g. [VirtualBox](https://www.virtualbox.org/) with Ubuntu. +ClickHouse can be built on Linux, FreeBSD and macOS. If you use Windows, you can still build ClickHouse in a virtual machine running Linux, e.g. [VirtualBox](https://www.virtualbox.org/) with Ubuntu. ClickHouse requires a 64-bit system to compile and run, 32-bit systems do not work. ## Creating a Repository on GitHub {#creating-a-repository-on-github} -To start developing for ClickHouse you will need a [GitHub](https://www.virtualbox.org/) account. Please also generate a SSH key locally (if you don't have one already) and upload the public key to GitHub as this is a prerequisite for contributing patches. +To start developing for ClickHouse you will need a [GitHub](https://www.virtualbox.org/) account. Please also generate an SSH key locally (if you don't have one already) and upload the public key to GitHub as this is a prerequisite for contributing patches. Next, create a fork of the [ClickHouse repository](https://github.com/ClickHouse/ClickHouse/) in your personal account by clicking the "fork" button in the upper right corner. @@ -37,7 +37,7 @@ git clone git@github.com:your_github_username/ClickHouse.git # replace placehol cd ClickHouse ``` -This command creates a directory `ClickHouse/` containing the source code of ClickHouse. If you specify a custom checkout directory after the URL but it is important that this path does not contain whitespaces as it may lead to problems with the build later on. 
+This command creates a directory `ClickHouse/` containing the source code of ClickHouse. If you specify a custom checkout directory after the URL, but it is important that this path does not contain whitespaces as it may lead to problems with the build later on. The ClickHouse repository uses Git submodules, i.e. references to external repositories (usually 3rd party libraries used by ClickHouse). These are not checked out by default. To do so, you can either @@ -45,7 +45,7 @@ The ClickHouse repository uses Git submodules, i.e. references to external repos - if `git clone` did not check out submodules, run `git submodule update --init --jobs ` (e.g. ` = 12` to parallelize the checkout) to achieve the same as the previous alternative, or -- if `git clone` did not check out submodules and you like to use [sparse](https://github.blog/2020-01-17-bring-your-monorepo-down-to-size-with-sparse-checkout/) and [shallow](https://github.blog/2020-12-21-get-up-to-speed-with-partial-clone-and-shallow-clone/) submodule checkout to omit unneeded files and history in submodules to save space (ca. 5 GB instead of ca. 15 GB), run `./contrib/update-submodules.sh`. Not really recommended as it generally makes working with submodules less convenient and slower. +- if `git clone` did not check out submodules, and you like to use [sparse](https://github.blog/2020-01-17-bring-your-monorepo-down-to-size-with-sparse-checkout/) and [shallow](https://github.blog/2020-12-21-get-up-to-speed-with-partial-clone-and-shallow-clone/) submodule checkout to omit unneeded files and history in submodules to save space (ca. 5 GB instead of ca. 15 GB), run `./contrib/update-submodules.sh`. Not really recommended as it generally makes working with submodules less convenient and slower. You can check the Git status with the command: `git submodule status`. @@ -91,7 +91,7 @@ If you use Arch or Gentoo, you probably know it yourself how to install CMake. ## C++ Compiler {#c-compiler} -Compilers Clang starting from version 15 is supported for building ClickHouse. +Compilers Clang starting from version 16 is supported for building ClickHouse. Clang should be used instead of gcc. Though, our continuous integration (CI) platform runs checks for about a dozen of build combinations. @@ -143,7 +143,7 @@ When a large amount of RAM is available on build machine you should limit the nu On machines with 4GB of RAM, it is recommended to specify 1, for 8GB of RAM `-j 2` is recommended. -If you get the message: `ninja: error: loading 'build.ninja': No such file or directory`, it means that generating a build configuration has failed and you need to inspect the message above. +If you get the message: `ninja: error: loading 'build.ninja': No such file or directory`, it means that generating a build configuration has failed, and you need to inspect the message above. Upon the successful start of the building process, you’ll see the build progress - the number of processed tasks and the total number of tasks. @@ -184,7 +184,7 @@ You can also run your custom-built ClickHouse binary with the config file from t **CLion (recommended)** -If you do not know which IDE to use, we recommend that you use [CLion](https://www.jetbrains.com/clion/). CLion is commercial software but it offers a 30 day free trial. It is also free of charge for students. CLion can be used on both Linux and macOS. +If you do not know which IDE to use, we recommend that you use [CLion](https://www.jetbrains.com/clion/). CLion is commercial software, but it offers a 30 day free trial. 
It is also free of charge for students. CLion can be used on both Linux and macOS. A few things to know when using CLion to develop ClickHouse: diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index be588f1764d..78a27d3ff86 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -2,7 +2,7 @@ Nearest neighborhood search is the problem of finding the M closest points for a given point in an N-dimensional vector space. The most straightforward approach to solve this problem is a brute force search where the distance between all points in the vector space and the -reference point is computed. This method guarantees perfect accuracy but it is usually too slow for practical applications. Thus, nearest +reference point is computed. This method guarantees perfect accuracy, but it is usually too slow for practical applications. Thus, nearest neighborhood search problems are often solved with [approximative algorithms](https://github.com/erikbern/ann-benchmarks). Approximative nearest neighborhood search techniques, in conjunction with [embedding methods](https://cloud.google.com/architecture/overview-extracting-and-serving-feature-embeddings-for-machine-learning) allow to search huge @@ -24,7 +24,7 @@ LIMIT N `vectors` contains N-dimensional values of type [Array](../../../sql-reference/data-types/array.md) or [Tuple](../../../sql-reference/data-types/tuple.md), for example embeddings. Function `Distance` computes the distance between two vectors. -Often, the the Euclidean (L2) distance is chosen as distance function but [other +Often, the Euclidean (L2) distance is chosen as distance function but [other distance functions](/docs/en/sql-reference/functions/distance-functions.md) are also possible. `Point` is the reference point, e.g. `(0.17, 0.33, ...)`, and `N` limits the number of search results. @@ -109,7 +109,7 @@ clickhouse-client --param_vec='hello' --query="SELECT * FROM table_with_ann_inde **Restrictions**: Queries that contain both a `WHERE Distance(vectors, Point) < MaxDistance` and an `ORDER BY Distance(vectors, Point)` clause cannot use ANN indexes. Also, the approximate algorithms used to determine the nearest neighbors require a limit, hence queries -without `LIMIT` clause cannot utilize ANN indexes. Also ANN indexes are only used if the query has a `LIMIT` value smaller than setting +without `LIMIT` clause cannot utilize ANN indexes. Also, ANN indexes are only used if the query has a `LIMIT` value smaller than setting `max_limit_for_ann_queries` (default: 1 million rows). This is a safeguard to prevent large memory allocations by external libraries for approximate neighbor search. @@ -120,9 +120,9 @@ then each indexed block will contain 16384 rows. However, data structures and al provided by external libraries) are inherently row-oriented. They store a compact representation of a set of rows and also return rows for ANN queries. This causes some rather unintuitive differences in the way ANN indexes behave compared to normal skip indexes. -When a user defines a ANN index on a column, ClickHouse internally creates a ANN "sub-index" for each index block. The sub-index is "local" +When a user defines an ANN index on a column, ClickHouse internally creates an ANN "sub-index" for each index block. The sub-index is "local" in the sense that it only knows about the rows of its containing index block. 
In the previous example and assuming that a column has 65536 -rows, we obtain four index blocks (spanning eight granules) and a ANN sub-index for each index block. A sub-index is theoretically able to +rows, we obtain four index blocks (spanning eight granules) and an ANN sub-index for each index block. A sub-index is theoretically able to return the rows with the N closest points within its index block directly. However, since ClickHouse loads data from disk to memory at the granularity of granules, sub-indexes extrapolate matching rows to granule granularity. This is different from regular skip indexes which skip data at the granularity of index blocks. @@ -231,7 +231,7 @@ The Annoy index currently does not work with per-table, non-default `index_granu ## USearch {#usearch} -This type of ANN index is based on the [the USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW +This type of ANN index is based on the [USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW algorithm](https://arxiv.org/abs/1603.09320), i.e., builds a hierarchical graph where each point represents a vector and the edges represent similarity. Such hierarchical structures can be very efficient on large collections. They may often fetch 0.05% or less data from the overall dataset, while still providing 99% recall. This is especially useful when working with high-dimensional vectors, diff --git a/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md b/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md index 0043e1b6748..ba4021d8422 100644 --- a/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md @@ -125,7 +125,7 @@ For each resulting data part ClickHouse saves: 3. The first “cancel” row, if there are more “cancel” rows than “state” rows. 4. None of the rows, in all other cases. -Also when there are at least 2 more “state” rows than “cancel” rows, or at least 2 more “cancel” rows then “state” rows, the merge continues, but ClickHouse treats this situation as a logical error and records it in the server log. This error can occur if the same data were inserted more than once. +Also, when there are at least 2 more “state” rows than “cancel” rows, or at least 2 more “cancel” rows then “state” rows, the merge continues, but ClickHouse treats this situation as a logical error and records it in the server log. This error can occur if the same data were inserted more than once. Thus, collapsing should not change the results of calculating statistics. Changes gradually collapsed so that in the end only the last state of almost every object left. @@ -196,7 +196,7 @@ What do we see and where is collapsing? With two `INSERT` queries, we created 2 data parts. The `SELECT` query was performed in 2 threads, and we got a random order of rows. Collapsing not occurred because there was no merge of the data parts yet. ClickHouse merges data part in an unknown moment which we can not predict. 
-Thus we need aggregation: +Thus, we need aggregation: ``` sql SELECT diff --git a/docs/en/engines/table-engines/special/distributed.md b/docs/en/engines/table-engines/special/distributed.md index de8ae0357dc..c3b8a2f2048 100644 --- a/docs/en/engines/table-engines/special/distributed.md +++ b/docs/en/engines/table-engines/special/distributed.md @@ -72,7 +72,7 @@ Specifying the `sharding_key` is necessary for the following: #### fsync_directories -`fsync_directories` - do the `fsync` for directories. Guarantees that the OS refreshed directory metadata after operations related to background inserts on Distributed table (after insert, after sending the data to shard, etc). +`fsync_directories` - do the `fsync` for directories. Guarantees that the OS refreshed directory metadata after operations related to background inserts on Distributed table (after insert, after sending the data to shard, etc.). #### bytes_to_throw_insert @@ -220,7 +220,7 @@ Second, you can perform `INSERT` statements on a `Distributed` table. In this ca Each shard can have a `` defined in the config file. By default, the weight is `1`. Data is distributed across shards in the amount proportional to the shard weight. All shard weights are summed up, then each shard's weight is divided by the total to determine each shard's proportion. For example, if there are two shards and the first has a weight of 1 while the second has a weight of 2, the first will be sent one third (1 / 3) of inserted rows and the second will be sent two thirds (2 / 3). -Each shard can have the `internal_replication` parameter defined in the config file. If this parameter is set to `true`, the write operation selects the first healthy replica and writes data to it. Use this if the tables underlying the `Distributed` table are replicated tables (e.g. any of the `Replicated*MergeTree` table engines). One of the table replicas will receive the write and it will be replicated to the other replicas automatically. +Each shard can have the `internal_replication` parameter defined in the config file. If this parameter is set to `true`, the write operation selects the first healthy replica and writes data to it. Use this if the tables underlying the `Distributed` table are replicated tables (e.g. any of the `Replicated*MergeTree` table engines). One of the table replicas will receive the write, and it will be replicated to the other replicas automatically. If `internal_replication` is set to `false` (the default), data is written to all replicas. In this case, the `Distributed` table replicates data itself. This is worse than using replicated tables because the consistency of replicas is not checked and, over time, they will contain slightly different data. diff --git a/docs/en/getting-started/example-datasets/amazon-reviews.md b/docs/en/getting-started/example-datasets/amazon-reviews.md index 00dc553782c..c07ffa86dd9 100644 --- a/docs/en/getting-started/example-datasets/amazon-reviews.md +++ b/docs/en/getting-started/example-datasets/amazon-reviews.md @@ -12,7 +12,7 @@ The queries below were executed on a **Production** instance of [ClickHouse Clou ::: -1. Without inserting the data into ClickHouse, we can query it in place. Let's grab some rows so we can see what they look like: +1. Without inserting the data into ClickHouse, we can query it in place. 
Let's grab some rows, so we can see what they look like: ```sql SELECT * diff --git a/docs/en/getting-started/example-datasets/cell-towers.md b/docs/en/getting-started/example-datasets/cell-towers.md index a84eb5d561f..090de1b32fd 100644 --- a/docs/en/getting-started/example-datasets/cell-towers.md +++ b/docs/en/getting-started/example-datasets/cell-towers.md @@ -29,7 +29,7 @@ Here is a preview of the dashboard created in this guide: This dataset is from [OpenCelliD](https://www.opencellid.org/) - The world's largest Open Database of Cell Towers. -As of 2021, it contains more than 40 million records about cell towers (GSM, LTE, UMTS, etc.) around the world with their geographical coordinates and metadata (country code, network, etc). +As of 2021, it contains more than 40 million records about cell towers (GSM, LTE, UMTS, etc.) around the world with their geographical coordinates and metadata (country code, network, etc.). OpenCelliD Project is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License, and we redistribute a snapshot of this dataset under the terms of the same license. The up-to-date version of the dataset is available to download after sign in. @@ -355,7 +355,7 @@ Click on **UPDATE CHART** to render the visualization. ### Add the charts to a **dashboard** -This screenshot shows cell tower locations with LTE, UMTS, and GSM radios. The charts are all created in the same way and they are added to a dashboard. +This screenshot shows cell tower locations with LTE, UMTS, and GSM radios. The charts are all created in the same way, and they are added to a dashboard. ![Dashboard of cell towers by radio type in mcc 204](@site/docs/en/getting-started/example-datasets/images/superset-cell-tower-dashboard.png) diff --git a/docs/en/getting-started/example-datasets/covid19.md b/docs/en/getting-started/example-datasets/covid19.md index 3a7fae89ae0..da9dc4aa96b 100644 --- a/docs/en/getting-started/example-datasets/covid19.md +++ b/docs/en/getting-started/example-datasets/covid19.md @@ -132,7 +132,7 @@ FROM covid19; └────────────────────────────────────────────┘ ``` -7. You will notice the data has a lot of 0's for dates - either weekends or days where numbers were not reported each day. We can use a window function to smooth out the daily averages of new cases: +7. You will notice the data has a lot of 0's for dates - either weekends or days when numbers were not reported each day. We can use a window function to smooth out the daily averages of new cases: ```sql SELECT @@ -262,4 +262,4 @@ The results look like :::note As mentioned in the [GitHub repo](https://github.com/GoogleCloudPlatform/covid-19-open-data), the dataset is no longer updated as of September 15, 2022. -::: \ No newline at end of file +::: diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index 518037a2c7c..2b90d684c13 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -243,7 +243,7 @@ If no database is specified, the `default` database will be used. If the user name, password or database was specified in the connection string, it cannot be specified using `--user`, `--password` or `--database` (and vice versa). -The host component can either be an a host name and IP address. Put an IPv6 address in square brackets to specify it: +The host component can either be a host name and IP address. 
Put an IPv6 address in square brackets to specify it: ```text clickhouse://[2001:db8::1234] diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 0f597282f9e..285737312bd 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -33,7 +33,7 @@ The supported formats are: | [JSONAsString](#jsonasstring) | ✔ | ✗ | | [JSONStrings](#jsonstrings) | ✔ | ✔ | | [JSONColumns](#jsoncolumns) | ✔ | ✔ | -| [JSONColumnsWithMetadata](#jsoncolumnsmonoblock)) | ✔ | ✔ | +| [JSONColumnsWithMetadata](#jsoncolumnsmonoblock) | ✔ | ✔ | | [JSONCompact](#jsoncompact) | ✔ | ✔ | | [JSONCompactStrings](#jsoncompactstrings) | ✗ | ✔ | | [JSONCompactColumns](#jsoncompactcolumns) | ✔ | ✔ | diff --git a/docs/en/interfaces/schema-inference.md b/docs/en/interfaces/schema-inference.md index 4db1d53987a..39ae69eaef4 100644 --- a/docs/en/interfaces/schema-inference.md +++ b/docs/en/interfaces/schema-inference.md @@ -13,7 +13,7 @@ can control it. Schema inference is used when ClickHouse needs to read the data in a specific data format and the structure is unknown. -## Table functions [file](../sql-reference/table-functions/file.md), [s3](../sql-reference/table-functions/s3.md), [url](../sql-reference/table-functions/url.md), [hdfs](../sql-reference/table-functions/hdfs.md). +## Table functions [file](../sql-reference/table-functions/file.md), [s3](../sql-reference/table-functions/s3.md), [url](../sql-reference/table-functions/url.md), [hdfs](../sql-reference/table-functions/hdfs.md), [azureBlobStorage](../sql-reference/table-functions/azureBlobStorage.md). These table functions have the optional argument `structure` with the structure of input data. If this argument is not specified or set to `auto`, the structure will be inferred from the data. @@ -55,7 +55,7 @@ DESCRIBE file('hobbies.jsonl') └─────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ ``` -## Table engines [File](../engines/table-engines/special/file.md), [S3](../engines/table-engines/integrations/s3.md), [URL](../engines/table-engines/special/url.md), [HDFS](../engines/table-engines/integrations/hdfs.md) +## Table engines [File](../engines/table-engines/special/file.md), [S3](../engines/table-engines/integrations/s3.md), [URL](../engines/table-engines/special/url.md), [HDFS](../engines/table-engines/integrations/hdfs.md), [azureBlobStorage](../engines/table-engines/integrations/azureBlobStorage.md) If the list of columns is not specified in `CREATE TABLE` query, the structure of the table will be inferred automatically from the data. @@ -1061,7 +1061,7 @@ $$) └──────────────┴───────────────┘ ``` -## Values {#values} +### Values {#values} In Values format ClickHouse extracts column value from the row and then parses it using the recursive parser similar to how literals are parsed. @@ -1986,3 +1986,46 @@ Note: - As some of the files may not contain some columns from the resulting schema, union mode is supported only for formats that support reading subset of columns (like JSONEachRow, Parquet, TSVWithNames, etc) and won't work for other formats (like CSV, TSV, JSONCompactEachRow, etc). - If ClickHouse cannot infer the schema from one of the files, the exception will be thrown. - If you have a lot of files, reading schema from all of them can take a lot of time. 
+ + +## Automatic format detection {#automatic-format-detection} + +If data format is not specified and cannot be determined by the file extension, ClickHouse will try to detect the file format by its content. + +**Examples:** + +Let's say we have `data` with the following content: +``` +"a","b" +1,"Data1" +2,"Data2" +3,"Data3" +``` + +We can inspect and query this file without specifying format or structure: +```sql +:) desc file(data); +``` + +```text +┌─name─┬─type─────────────┐ +│ a │ Nullable(Int64) │ +│ b │ Nullable(String) │ +└──────┴──────────────────┘ +``` + +```sql +:) select * from file(data); +``` + +```text +┌─a─┬─b─────┐ +│ 1 │ Data1 │ +│ 2 │ Data2 │ +│ 3 │ Data3 │ +└───┴───────┘ +``` + +:::note +ClickHouse can detect only some subset of formats and this detection takes some time, it's always better to specify the format explicitly. +::: \ No newline at end of file diff --git a/docs/en/operations/configuration-files.md b/docs/en/operations/configuration-files.md index 005c7818eb1..9f17f4af1e8 100644 --- a/docs/en/operations/configuration-files.md +++ b/docs/en/operations/configuration-files.md @@ -6,15 +6,66 @@ sidebar_label: Configuration Files # Configuration Files -The ClickHouse server can be configured with configuration files in XML or YAML syntax. In most installation types, the ClickHouse server runs with `/etc/clickhouse-server/config.xml` as default configuration file but it is also possible to specify the location of the configuration file manually at server startup using command line option `--config-file=` or `-C`. Additional configuration files may be placed into directory `config.d/` relative to the main configuration file, for example into directory `/etc/clickhouse-server/config.d/`. Files in this directory and the main configuration are merged in a preprocessing step before the configuration is applied in ClickHouse server. Configuration files are merged in alphabetical order. To simplify updates and improve modularization, it is best practice to keep the default `config.xml` file unmodified and place additional customization into `config.d/`. +The ClickHouse server can be configured with configuration files in XML or YAML syntax. In most installation types, the ClickHouse server runs with `/etc/clickhouse-server/config.xml` as default configuration file, but it is also possible to specify the location of the configuration file manually at server startup using command line option `--config-file=` or `-C`. Additional configuration files may be placed into directory `config.d/` relative to the main configuration file, for example into directory `/etc/clickhouse-server/config.d/`. Files in this directory and the main configuration are merged in a preprocessing step before the configuration is applied in ClickHouse server. Configuration files are merged in alphabetical order. To simplify updates and improve modularization, it is best practice to keep the default `config.xml` file unmodified and place additional customization into `config.d/`. It is possible to mix XML and YAML configuration files, for example you could have a main configuration file `config.xml` and additional configuration files `config.d/network.xml`, `config.d/timezone.yaml` and `config.d/keeper.yaml`. Mixing XML and YAML within a single configuration file is not supported. XML configuration files should use `...` as top-level tag. In YAML configuration files, `clickhouse:` is optional, the parser inserts it implicitly if absent. 
-## Overriding Configuration {#override} +## Merging Configuration {#merging} -The merge of configuration files behaves as one intuitively expects: The contents of both files are combined recursively, children with the same name are replaced by the element of the more specific configuration file. The merge can be customized using attributes `replace` and `remove`. -- Attribute `replace` means that the element is replaced by the specified one. -- Attribute `remove` means that the element is deleted. +Two configuration files (usually the main configuration file and another configuration files from `config.d/`) are merged as follows: + +- If a node (i.e. a path leading to an element) appears in both files and does not have attributes `replace` or `remove`, it is included in the merged configuration file and children from both nodes are included and merged recursively. +- If one of both nodes contains attribute `replace`, it is included in the merged configuration file but only children from the node with attribute `replace` are included. +- If one of both nodes contains attribute `remove`, the node is not included in the merged configuration file (if it exists already, it is deleted). + +Example: + + +```xml + + + + 1 + + + 2 + + + 3 + + +``` + +and + +```xml + + + + 4 + + + 5 + + + 6 + + +``` + +generates merged configuration file: + +```xml + + + 1 + 4 + + + 5 + + +``` To specify that a value of an element should be replaced by the value of an environment variable, you can use attribute `from_env`. @@ -36,7 +87,7 @@ which is equal to - 150000 + 150000 @@ -63,7 +114,7 @@ XML substitution example: ``` -Substitutions can also be performed from ZooKeeper. To do this, specify the attribute `from_zk = "/path/to/node"`. The element value is replaced with the contents of the node at `/path/to/node` in ZooKeeper. You can also put an entire XML subtree on the ZooKeeper node and it will be fully inserted into the source element. +Substitutions can also be performed from ZooKeeper. To do this, specify the attribute `from_zk = "/path/to/node"`. The element value is replaced with the contents of the node at `/path/to/node` in ZooKeeper. You can also put an entire XML subtree on the ZooKeeper node, and it will be fully inserted into the source element. ## Encrypting and Hiding Configuration {#encryption} @@ -125,7 +176,7 @@ Users configuration can be split into separate files similar to `config.xml` and Directory name is defined as `users_config` setting without `.xml` postfix concatenated with `.d`. Directory `users.d` is used by default, as `users_config` defaults to `users.xml`. -Note that configuration files are first merged taking into account [Override](#override) settings and includes are processed after that. +Note that configuration files are first [merged](#merging) taking into account settings, and includes are processed after that. ## XML example {#example} diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index a275878f32e..b11a04e10ec 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1775,6 +1775,10 @@ Default value: 0 (no restriction). ## insert_quorum {#insert_quorum} +:::note +`insert_quorum` does not apply when using the [`SharedMergeTree` table engine](/en/cloud/reference/shared-merge-tree) in ClickHouse Cloud as all inserts are quorum inserted. +::: + Enables the quorum writes. - If `insert_quorum < 2`, the quorum writes are disabled. 
@@ -1814,6 +1818,10 @@ See also: ## insert_quorum_parallel {#insert_quorum_parallel} +:::note +`insert_quorum_parallel` does not apply when using the [`SharedMergeTree` table engine](/en/cloud/reference/shared-merge-tree) in ClickHouse Cloud as all inserts are quorum inserted. +::: + Enables or disables parallelism for quorum `INSERT` queries. If enabled, additional `INSERT` queries can be sent while previous queries have not yet finished. If disabled, additional writes to the same table will be rejected. Possible values: @@ -4271,41 +4279,6 @@ Result: └─────┴─────┴───────┘ ``` -## enable_order_by_all {#enable-order-by-all} - -Enables or disables sorting by `ALL` columns, i.e. [ORDER BY](../../sql-reference/statements/select/order-by.md) - -Possible values: - -- 0 — Disable ORDER BY ALL. -- 1 — Enable ORDER BY ALL. - -Default value: `1`. - -**Example** - -Query: - -```sql -CREATE TABLE TAB(C1 Int, C2 Int, ALL Int) ENGINE=Memory(); - -INSERT INTO TAB VALUES (10, 20, 30), (20, 20, 10), (30, 10, 20); - -SELECT * FROM TAB ORDER BY ALL; -- returns an error that ALL is ambiguous - -SELECT * FROM TAB ORDER BY ALL SETTINGS enable_order_by_all; -``` - -Result: - -```text -┌─C1─┬─C2─┬─ALL─┐ -│ 20 │ 20 │ 10 │ -│ 30 │ 10 │ 20 │ -│ 10 │ 20 │ 30 │ -└────┴────┴─────┘ -``` - ## splitby_max_substrings_includes_remaining_string {#splitby_max_substrings_includes_remaining_string} Controls whether function [splitBy*()](../../sql-reference/functions/splitting-merging-functions.md) with argument `max_substrings` > 0 will include the remaining string in the last element of the result array. diff --git a/docs/en/operations/system-tables/asynchronous_loader.md b/docs/en/operations/system-tables/asynchronous_loader.md index af9aa4ecd09..75d98e4549d 100644 --- a/docs/en/operations/system-tables/asynchronous_loader.md +++ b/docs/en/operations/system-tables/asynchronous_loader.md @@ -49,6 +49,6 @@ Every job has a pool associated with it and is started in this pool. Each pool h Time instants during job lifetime: - `schedule_time` (`DateTime64`) - Time when job was created and scheduled to be executed (usually with all its dependencies). -- `enqueue_time` (`Nullable(DateTime64)`) - Time when job became ready and was enqueued into a ready queue of it's pool. Null if the job is not ready yet. +- `enqueue_time` (`Nullable(DateTime64)`) - Time when job became ready and was enqueued into a ready queue of its pool. Null if the job is not ready yet. - `start_time` (`Nullable(DateTime64)`) - Time when worker dequeues the job from ready queue and start its execution. Null if the job is not started yet. - `finish_time` (`Nullable(DateTime64)`) - Time when job execution is finished. Null if the job is not finished yet. diff --git a/docs/en/operations/system-tables/asynchronous_metrics.md b/docs/en/operations/system-tables/asynchronous_metrics.md index fe8f963b1ec..81725b97e41 100644 --- a/docs/en/operations/system-tables/asynchronous_metrics.md +++ b/docs/en/operations/system-tables/asynchronous_metrics.md @@ -297,11 +297,11 @@ Total number of databases on the server. ### NumberOfDetachedByUserParts -The total number of parts detached from MergeTree tables by users with the `ALTER TABLE DETACH` query (as opposed to unexpected, broken or ignored parts). The server does not care about detached parts and they can be removed. +The total number of parts detached from MergeTree tables by users with the `ALTER TABLE DETACH` query (as opposed to unexpected, broken or ignored parts). 
The server does not care about detached parts, and they can be removed. ### NumberOfDetachedParts -The total number of parts detached from MergeTree tables. A part can be detached by a user with the `ALTER TABLE DETACH` query or by the server itself it the part is broken, unexpected or unneeded. The server does not care about detached parts and they can be removed. +The total number of parts detached from MergeTree tables. A part can be detached by a user with the `ALTER TABLE DETACH` query or by the server itself if the part is broken, unexpected or unneeded. The server does not care about detached parts, and they can be removed. ### NumberOfTables @@ -393,7 +393,7 @@ The amount of free memory plus OS page cache memory on the host system, in bytes ### OSMemoryFreeWithoutCached -The amount of free memory on the host system, in bytes. This does not include the memory used by the OS page cache memory, in bytes. The page cache memory is also available for usage by programs, so the value of this metric can be confusing. See the `OSMemoryAvailable` metric instead. For convenience we also provide the `OSMemoryFreePlusCached` metric, that should be somewhat similar to OSMemoryAvailable. See also https://www.linuxatemyram.com/. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server. +The amount of free memory on the host system, in bytes. This does not include the memory used by the OS page cache memory, in bytes. The page cache memory is also available for usage by programs, so the value of this metric can be confusing. See the `OSMemoryAvailable` metric instead. For convenience, we also provide the `OSMemoryFreePlusCached` metric, which should be somewhat similar to `OSMemoryAvailable`. See also https://www.linuxatemyram.com/. This is a system-wide metric; it includes all the processes on the host machine, not just clickhouse-server. ### OSMemoryTotal @@ -493,7 +493,7 @@ Number of threads in the server of the PostgreSQL compatibility protocol. ### QueryCacheBytes -Total size of the query cache cache in bytes. +Total size of the query cache in bytes. ### QueryCacheEntries @@ -549,7 +549,7 @@ Total amount of bytes (compressed, including data and indices) stored in all tab ### TotalPartsOfMergeTreeTables -Total amount of data parts in all tables of MergeTree family. Numbers larger than 10 000 will negatively affect the server startup time and it may indicate unreasonable choice of the partition key. +Total amount of data parts in all tables of MergeTree family. Numbers larger than 10 000 will negatively affect the server startup time and may indicate an unreasonable choice of the partition key. ### TotalPrimaryKeyBytesInMemory diff --git a/docs/en/operations/system-tables/clusters.md b/docs/en/operations/system-tables/clusters.md index 63cc083e4bc..7a9f1438b87 100644 --- a/docs/en/operations/system-tables/clusters.md +++ b/docs/en/operations/system-tables/clusters.md @@ -19,7 +19,7 @@ Columns: - `default_database` ([String](../../sql-reference/data-types/string.md)) — The default database name. - `errors_count` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of times this host failed to reach replica. - `slowdowns_count` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of slowdowns that led to changing replica when establishing a connection with hedged requests.
-- `estimated_recovery_time` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Seconds remaining until the replica error count is zeroed and it is considered to be back to normal. +- `estimated_recovery_time` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Seconds remaining until the replica error count is zeroed, and it is considered to be back to normal. - `database_shard_name` ([String](../../sql-reference/data-types/string.md)) — The name of the `Replicated` database shard (for clusters that belong to a `Replicated` database). - `database_replica_name` ([String](../../sql-reference/data-types/string.md)) — The name of the `Replicated` database replica (for clusters that belong to a `Replicated` database). - `is_active` ([Nullable(UInt8)](../../sql-reference/data-types/int-uint.md)) — The status of the `Replicated` database replica (for clusters that belong to a `Replicated` database): 1 means "replica is online", 0 means "replica is offline", `NULL` means "unknown". diff --git a/docs/en/operations/system-tables/dictionaries.md b/docs/en/operations/system-tables/dictionaries.md index 8632581144c..c4cf7ba8bfb 100644 --- a/docs/en/operations/system-tables/dictionaries.md +++ b/docs/en/operations/system-tables/dictionaries.md @@ -18,7 +18,7 @@ Columns: - `LOADED_AND_RELOADING` — Dictionary is loaded successfully, and is being reloaded right now (frequent reasons: [SYSTEM RELOAD DICTIONARY](../../sql-reference/statements/system.md#query_language-system-reload-dictionary) query, timeout, dictionary config has changed). - `FAILED_AND_RELOADING` — Could not load the dictionary as a result of an error and is loading now. - `origin` ([String](../../sql-reference/data-types/string.md)) — Path to the configuration file that describes the dictionary. -- `type` ([String](../../sql-reference/data-types/string.md)) — Type of a dictionary allocation. [Storing Dictionaries in Memory](../../sql-reference/dictionaries/index.md#storig-dictionaries-in-memory). +- `type` ([String](../../sql-reference/data-types/string.md)) — Type of dictionary allocation. [Storing Dictionaries in Memory](../../sql-reference/dictionaries/index.md#storig-dictionaries-in-memory). - `key.names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Array of [key names](../../sql-reference/dictionaries/index.md#dictionary-key-and-fields#ext_dict_structure-key) provided by the dictionary. - `key.types` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Corresponding array of [key types](../../sql-reference/dictionaries/index.md#dictionary-key-and-fields#ext_dict_structure-key) provided by the dictionary. - `attribute.names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Array of [attribute names](../../sql-reference/dictionaries/index.md#dictionary-key-and-fields#ext_dict_structure-attributes) provided by the dictionary. diff --git a/docs/en/operations/system-tables/tables.md b/docs/en/operations/system-tables/tables.md index 8049ab091c0..2132f69319e 100644 --- a/docs/en/operations/system-tables/tables.md +++ b/docs/en/operations/system-tables/tables.md @@ -27,6 +27,8 @@ Columns: - `metadata_modification_time` ([DateTime](../../sql-reference/data-types/datetime.md)) - Time of latest modification of the table metadata. 
+- `metadata_version` ([Int32](../../sql-reference/data-types/int-uint.md)) - Metadata version for ReplicatedMergeTree table, 0 for non ReplicatedMergeTree table. + - `dependencies_database` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) - Database dependencies. - `dependencies_table` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) - Table dependencies ([materialized views](../../sql-reference/statements/create/view.md#materialized-view) the current table). diff --git a/docs/en/operations/utilities/clickhouse-local.md b/docs/en/operations/utilities/clickhouse-local.md index c863282efc1..437a5f0fff0 100644 --- a/docs/en/operations/utilities/clickhouse-local.md +++ b/docs/en/operations/utilities/clickhouse-local.md @@ -34,7 +34,7 @@ The binary you just downloaded can run all sorts of ClickHouse tools and utiliti A common use of `clickhouse-local` is to run ad-hoc queries on files: where you don't have to insert the data into a table. `clickhouse-local` can stream the data from a file into a temporary table and execute your SQL. -If the file is sitting on the same machine as `clickhouse-local`, you can simple specify the file to load. The following `reviews.tsv` file contains a sampling of Amazon product reviews: +If the file is sitting on the same machine as `clickhouse-local`, you can simply specify the file to load. The following `reviews.tsv` file contains a sampling of Amazon product reviews: ```bash ./clickhouse local -q "SELECT * FROM 'reviews.tsv'" @@ -220,7 +220,7 @@ Arguments: - `--help` — arguments references for `clickhouse-local`. - `-V`, `--version` — print version information and exit. -Also there are arguments for each ClickHouse configuration variable which are more commonly used instead of `--config-file`. +Also, there are arguments for each ClickHouse configuration variable which are more commonly used instead of `--config-file`. ## Examples {#examples} diff --git a/docs/en/operations/utilities/clickhouse-obfuscator.md b/docs/en/operations/utilities/clickhouse-obfuscator.md index ad51e9c7776..f9a94713be7 100644 --- a/docs/en/operations/utilities/clickhouse-obfuscator.md +++ b/docs/en/operations/utilities/clickhouse-obfuscator.md @@ -38,7 +38,7 @@ For example, you have a column `IsMobile` in your table with values 0 and 1. In So, the user will be able to count the exact ratio of mobile traffic. -Let's give another example. When you have some private data in your table, like user email and you don't want to publish any single email address. +Let's give another example. When you have some private data in your table, like user email, and you don't want to publish any single email address. If your table is large enough and contains multiple different emails and no email has a very high frequency than all others, it will anonymize all data. But if you have a small number of different values in a column, it can reproduce some of them. You should look at the working algorithm of this tool works, and fine-tune its command line parameters. diff --git a/docs/en/sql-reference/aggregate-functions/reference/any.md b/docs/en/sql-reference/aggregate-functions/reference/any.md index a45eb1b409f..4631060f33f 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/any.md +++ b/docs/en/sql-reference/aggregate-functions/reference/any.md @@ -9,7 +9,7 @@ Selects the first encountered value of a column. 
By default, it ignores NULL values and returns the first NOT NULL value found in the column. As [`first_value`](../../../sql-reference/aggregate-functions/reference/first_value.md) if supports `RESPECT NULLS`, in which case it will select the first value passed, independently on whether it's NULL or not. -The return type of the function is the same as the input, except for LowCardinality which is discarded). This means that given no rows as input it will return the default value of that type (0 for integers, or Null for a Nullable() column). You might use the `-OrNull` [combinator](../../../sql-reference/aggregate-functions/combinators.md) ) to modify this behaviour. +The return type of the function is the same as the input, except for LowCardinality which is discarded. This means that given no rows as input it will return the default value of that type (0 for integers, or Null for a Nullable() column). You might use the `-OrNull` [combinator](../../../sql-reference/aggregate-functions/combinators.md) to modify this behaviour. The query can be executed in any order and even in a different order each time, so the result of this function is indeterminate. To get a determinate result, you can use the ‘min’ or ‘max’ function instead of ‘any’. diff --git a/docs/en/sql-reference/aggregate-functions/reference/contingency.md b/docs/en/sql-reference/aggregate-functions/reference/contingency.md index 1b53ca1528f..902c1f4af80 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/contingency.md +++ b/docs/en/sql-reference/aggregate-functions/reference/contingency.md @@ -20,7 +20,7 @@ contingency(column1, column2) **Returned value** -- a value between 0 to 1. The larger the result, the closer the association of the two columns. +- a value between 0 and 1. The larger the result, the closer the association of the two columns. **Return type** is always [Float64](../../../sql-reference/data-types/float.md). @@ -48,4 +48,4 @@ Result: ┌──────cramersV(a, b)─┬───contingency(a, b)─┐ │ 0.41171788506213564 │ 0.05812725261759165 │ └─────────────────────┴─────────────────────┘ -``` \ No newline at end of file +``` diff --git a/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md b/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md index bcff05ada47..ea3dbff8691 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md +++ b/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md @@ -13,8 +13,8 @@ simpleLinearRegression(x, y) Parameters: -- `x` — Column with dependent variable values. -- `y` — Column with explanatory variable values. +- `x` — Column with explanatory variable values. +- `y` — Column with dependent variable values. Returned values: diff --git a/docs/en/sql-reference/data-types/datetime64.md b/docs/en/sql-reference/data-types/datetime64.md index 8c7fa17ae92..504d0e2b0a6 100644 --- a/docs/en/sql-reference/data-types/datetime64.md +++ b/docs/en/sql-reference/data-types/datetime64.md @@ -9,7 +9,7 @@ sidebar_label: DateTime64 Allows to store an instant in time, that can be expressed as a calendar date and a time of a day, with defined sub-second precision Tick size (precision): 10-precision seconds. Valid range: [ 0 : 9 ]. -Typically are used - 3 (milliseconds), 6 (microseconds), 9 (nanoseconds). +Typical values used are 3 (milliseconds), 6 (microseconds), and 9 (nanoseconds).
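As a quick sketch of these precision values (the table and column names are illustrative only), precision 3 keeps millisecond ticks:

```sql
-- Minimal sketch: DateTime64(3, 'UTC') stores three fractional digits (milliseconds).
CREATE TABLE dt_example (ts DateTime64(3, 'UTC'), event String) ENGINE = MergeTree ORDER BY ts;
INSERT INTO dt_example VALUES ('2024-01-01 12:00:00.123', 'click');
```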
**Syntax:** diff --git a/docs/en/sql-reference/data-types/decimal.md b/docs/en/sql-reference/data-types/decimal.md index e082eb29fbd..2b32e72a28f 100644 --- a/docs/en/sql-reference/data-types/decimal.md +++ b/docs/en/sql-reference/data-types/decimal.md @@ -10,7 +10,7 @@ Signed fixed-point numbers that keep precision during add, subtract and multiply ## Parameters -- P - precision. Valid range: \[ 1 : 76 \]. Determines how many decimal digits number can have (including fraction). By default the precision is 10. +- P - precision. Valid range: \[ 1 : 76 \]. Determines how many decimal digits number can have (including fraction). By default, the precision is 10. - S - scale. Valid range: \[ 0 : P \]. Determines how many decimal digits fraction can have. Decimal(P) is equivalent to Decimal(P, 0). Similarly, the syntax Decimal is equivalent to Decimal(10, 0). diff --git a/docs/en/sql-reference/distributed-ddl.md b/docs/en/sql-reference/distributed-ddl.md index d170f3765c2..7952792cbf4 100644 --- a/docs/en/sql-reference/distributed-ddl.md +++ b/docs/en/sql-reference/distributed-ddl.md @@ -6,7 +6,7 @@ sidebar_label: Distributed DDL # Distributed DDL Queries (ON CLUSTER Clause) -By default the `CREATE`, `DROP`, `ALTER`, and `RENAME` queries affect only the current server where they are executed. In a cluster setup, it is possible to run such queries in a distributed manner with the `ON CLUSTER` clause. +By default, the `CREATE`, `DROP`, `ALTER`, and `RENAME` queries affect only the current server where they are executed. In a cluster setup, it is possible to run such queries in a distributed manner with the `ON CLUSTER` clause. For example, the following query creates the `all_hits` `Distributed` table on each host in `cluster`: diff --git a/docs/en/sql-reference/functions/bitmap-functions.md b/docs/en/sql-reference/functions/bitmap-functions.md index 9b66d00656b..379be302881 100644 --- a/docs/en/sql-reference/functions/bitmap-functions.md +++ b/docs/en/sql-reference/functions/bitmap-functions.md @@ -372,7 +372,7 @@ Result: ## bitmapAnd -Computes the logical conjunction of two two bitmaps. +Computes the logical conjunction of two bitmaps. **Syntax** diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 5622097537e..c5b3b4cc3ae 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -1564,7 +1564,7 @@ Alias: `TO_DAYS` **Arguments** - `date` — The date to calculate the number of days passed since year zero from. [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md). -- `time_zone` — A String type const value or a expression represent the time zone. [String types](../../sql-reference/data-types/string.md) +- `time_zone` — A String type const value or an expression represent the time zone. [String types](../../sql-reference/data-types/string.md) **Returned value** @@ -2218,7 +2218,7 @@ now64([scale], [timezone]) **Arguments** -- `scale` - Tick size (precision): 10-precision seconds. Valid range: [ 0 : 9 ]. Typically are used - 3 (default) (milliseconds), 6 (microseconds), 9 (nanoseconds). +- `scale` - Tick size (precision): 10-precision seconds. Valid range: [ 0 : 9 ]. Typically, are used - 3 (default) (milliseconds), 6 (microseconds), 9 (nanoseconds). 
- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). [String](../../sql-reference/data-types/string.md). **Returned value** @@ -2305,7 +2305,7 @@ Rounds the time to the half hour. Converts a date or date with time to a UInt32 number containing the year and month number (YYYY \* 100 + MM). Accepts a second optional timezone argument. If provided, the timezone must be a string constant. -This functions is the opposite of function `YYYYMMDDToDate()`. +This function is the opposite of function `YYYYMMDDToDate()`. **Example** @@ -2362,7 +2362,7 @@ Result: Converts a number containing the year, month and day number to a [Date](../../sql-reference/data-types/date.md). -This functions is the opposite of function `toYYYYMMDD()`. +This function is the opposite of function `toYYYYMMDD()`. The output is undefined if the input does not encode a valid Date value. @@ -2406,7 +2406,7 @@ Converts a number containing the year, month, day, hours, minute and second numb The output is undefined if the input does not encode a valid DateTime value. -This functions is the opposite of function `toYYYYMMDDhhmmss()`. +This function is the opposite of function `toYYYYMMDDhhmmss()`. **Syntax** @@ -2981,8 +2981,8 @@ toUTCTimestamp(time_val, time_zone) **Arguments** -- `time_val` — A DateTime/DateTime64 type const value or a expression . [DateTime/DateTime64 types](../../sql-reference/data-types/datetime.md) -- `time_zone` — A String type const value or a expression represent the time zone. [String types](../../sql-reference/data-types/string.md) +- `time_val` — A DateTime/DateTime64 type const value or an expression . [DateTime/DateTime64 types](../../sql-reference/data-types/datetime.md) +- `time_zone` — A String type const value or an expression represent the time zone. [String types](../../sql-reference/data-types/string.md) **Returned value** @@ -3014,8 +3014,8 @@ fromUTCTimestamp(time_val, time_zone) **Arguments** -- `time_val` — A DateTime/DateTime64 type const value or a expression . [DateTime/DateTime64 types](../../sql-reference/data-types/datetime.md) -- `time_zone` — A String type const value or a expression represent the time zone. [String types](../../sql-reference/data-types/string.md) +- `time_val` — A DateTime/DateTime64 type const value or an expression . [DateTime/DateTime64 types](../../sql-reference/data-types/datetime.md) +- `time_zone` — A String type const value or an expression represent the time zone. [String types](../../sql-reference/data-types/string.md) **Returned value** diff --git a/docs/en/sql-reference/functions/distance-functions.md b/docs/en/sql-reference/functions/distance-functions.md index 1774c22014d..e20c35c6b6f 100644 --- a/docs/en/sql-reference/functions/distance-functions.md +++ b/docs/en/sql-reference/functions/distance-functions.md @@ -509,7 +509,7 @@ Result: ## cosineDistance -Calculates the cosine distance between two vectors (the values of the tuples are the coordinates). The less the returned value is, the more similar are the vectors. +Calculates the cosine distance between two vectors (the values of the tuples are the coordinates). The smaller the returned value is, the more similar are the vectors. 
**Syntax** diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 60cb3ac4ac4..9ae403be524 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -4,6 +4,8 @@ sidebar_position: 170 sidebar_label: Strings --- +import VersionBadge from '@theme/badges/VersionBadge'; + # Functions for Working with Strings Functions for [searching](string-search-functions.md) in strings and for [replacing](string-replace-functions.md) in strings are described separately. @@ -783,6 +785,8 @@ SELECT startsWith('Spider-Man', 'Spi'); ## startsWithUTF8 + + Returns whether string `str` starts with `prefix`, the difference between `startsWithUTF8` and `startsWith` is that `startsWithUTF8` match `str` and `suffix` by UTF-8 characters. diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index d5dbca3f2b7..22f879c62ae 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -590,6 +590,10 @@ Result: └───────────────────────────────┘ ``` +## countMatchesCaseInsensitive + +Like `countMatches(haystack, pattern)` but matching ignores the case. + ## regexpExtract Extracts the first string in haystack that matches the regexp pattern and corresponds to the regex group index. diff --git a/docs/en/sql-reference/statements/alter/apply-deleted-mask.md b/docs/en/sql-reference/statements/alter/apply-deleted-mask.md index 7a11d66e739..1afc2a0ff5a 100644 --- a/docs/en/sql-reference/statements/alter/apply-deleted-mask.md +++ b/docs/en/sql-reference/statements/alter/apply-deleted-mask.md @@ -10,7 +10,7 @@ sidebar_label: APPLY DELETED MASK ALTER TABLE [db].name [ON CLUSTER cluster] APPLY DELETED MASK [IN PARTITION partition_id] ``` -The command applies mask created by [lightweight delete](/docs/en/sql-reference/statements/delete) and forcefully removes rows marked as deleted from disk. This command is a heavyweight mutation and it semantically equals to query ```ALTER TABLE [db].name DELETE WHERE _row_exists = 0```. +The command applies mask created by [lightweight delete](/docs/en/sql-reference/statements/delete) and forcefully removes rows marked as deleted from disk. This command is a heavyweight mutation, and it semantically equals to query ```ALTER TABLE [db].name DELETE WHERE _row_exists = 0```. :::note It only works for tables in the [`MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) family (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) tables). diff --git a/docs/en/sql-reference/statements/alter/constraint.md b/docs/en/sql-reference/statements/alter/constraint.md index 7a8f5809320..29675f704b5 100644 --- a/docs/en/sql-reference/statements/alter/constraint.md +++ b/docs/en/sql-reference/statements/alter/constraint.md @@ -15,7 +15,7 @@ ALTER TABLE [db].name [ON CLUSTER cluster] DROP CONSTRAINT constraint_name; See more on [constraints](../../../sql-reference/statements/create/table.md#constraints). -Queries will add or remove metadata about constraints from table so they are processed immediately. +Queries will add or remove metadata about constraints from table, so they are processed immediately. :::tip Constraint check **will not be executed** on existing data if it was added. 
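To make the constraint behaviour above concrete, here is a minimal sketch; the table, column, and constraint names are hypothetical:

```sql
-- Existing rows are not re-validated when the constraint is added (see the tip above);
-- only subsequent INSERT queries are checked.
CREATE TABLE orders (id UInt64, amount Float64) ENGINE = MergeTree ORDER BY id;

ALTER TABLE orders ADD CONSTRAINT amount_positive CHECK amount > 0;

INSERT INTO orders VALUES (1, -5);  -- rejected: violates amount_positive

ALTER TABLE orders DROP CONSTRAINT amount_positive;
```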
diff --git a/docs/en/sql-reference/statements/detach.md b/docs/en/sql-reference/statements/detach.md index 938a5f9c3cb..e88e625aed1 100644 --- a/docs/en/sql-reference/statements/detach.md +++ b/docs/en/sql-reference/statements/detach.md @@ -16,13 +16,13 @@ DETACH TABLE|VIEW|DICTIONARY|DATABASE [IF EXISTS] [db.]name [ON CLUSTER cluster] Detaching does not delete the data or metadata of a table, a materialized view, a dictionary or a database. If an entity was not detached `PERMANENTLY`, on the next server launch the server will read the metadata and recall the table/view/dictionary/database again. If an entity was detached `PERMANENTLY`, there will be no automatic recall. Whether a table, a dictionary or a database was detached permanently or not, in both cases you can reattach them using the [ATTACH](../../sql-reference/statements/attach.md) query. -System log tables can be also attached back (e.g. `query_log`, `text_log`, etc). Other system tables can't be reattached. On the next server launch the server will recall those tables again. +System log tables can also be attached back (e.g. `query_log`, `text_log`, etc.). Other system tables can't be reattached. On the next server launch the server will recall those tables again. `ATTACH MATERIALIZED VIEW` does not work with short syntax (without `SELECT`), but you can attach it using the `ATTACH TABLE` query. Note that you can not detach permanently the table which is already detached (temporary). But you can attach it back and then detach permanently again. -Also you can not [DROP](../../sql-reference/statements/drop.md#drop-table) the detached table, or [CREATE TABLE](../../sql-reference/statements/create/table.md) with the same name as detached permanently, or replace it with the other table with [RENAME TABLE](../../sql-reference/statements/rename.md) query. +Also, you can not [DROP](../../sql-reference/statements/drop.md#drop-table) the detached table, or [CREATE TABLE](../../sql-reference/statements/create/table.md) with the same name as detached permanently, or replace it with another table using the [RENAME TABLE](../../sql-reference/statements/rename.md) query. The `SYNC` modifier executes the action without delay. diff --git a/docs/en/sql-reference/statements/insert-into.md b/docs/en/sql-reference/statements/insert-into.md index f9d93305071..f5544f96750 100644 --- a/docs/en/sql-reference/statements/insert-into.md +++ b/docs/en/sql-reference/statements/insert-into.md @@ -204,6 +204,20 @@ Result: └─────┴───────────────────────┘ ``` +## Inserts into ClickHouse Cloud + +By default, services on ClickHouse Cloud provide multiple replicas for high availability. When you connect to a service, a connection is established to one of these replicas. + +After an `INSERT` succeeds, data is written to the underlying storage. However, it may take some time for replicas to receive these updates. Therefore, if you use a different connection that executes a `SELECT` query on one of these other replicas, the updated data may not yet be reflected. + +It is possible to use the `select_sequential_consistency` setting to force the replica to receive the latest updates. Here is an example of a `SELECT` query using this setting: + +```sql +SELECT .... SETTINGS select_sequential_consistency = 1; +``` + +Note that using `select_sequential_consistency` will increase the load on ClickHouse Keeper (used by ClickHouse Cloud internally) and may result in slower performance depending on the load on the service. We recommend against enabling this setting unless necessary.
The recommended approach is to execute read/writes in the same session or to use a client driver that uses the native protocol (and thus supports sticky connections). + ## Performance Considerations `INSERT` sorts the input data by primary key and splits them into partitions by a partition key. If you insert data into several partitions at once, it can significantly reduce the performance of the `INSERT` query. To avoid this: diff --git a/docs/en/sql-reference/statements/select/distinct.md b/docs/en/sql-reference/statements/select/distinct.md index 10326b0ef8f..08359b035ae 100644 --- a/docs/en/sql-reference/statements/select/distinct.md +++ b/docs/en/sql-reference/statements/select/distinct.md @@ -5,7 +5,7 @@ sidebar_label: DISTINCT # DISTINCT Clause -If `SELECT DISTINCT` is specified, only unique rows will remain in a query result. Thus only a single row will remain out of all the sets of fully matching rows in the result. +If `SELECT DISTINCT` is specified, only unique rows will remain in a query result. Thus, only a single row will remain out of all the sets of fully matching rows in the result. You can specify the list of columns that must have unique values: `SELECT DISTINCT ON (column1, column2,...)`. If the columns are not specified, all of them are taken into consideration. diff --git a/docs/en/sql-reference/statements/select/order-by.md b/docs/en/sql-reference/statements/select/order-by.md index d6432a7b4f8..29aca70762e 100644 --- a/docs/en/sql-reference/statements/select/order-by.md +++ b/docs/en/sql-reference/statements/select/order-by.md @@ -9,10 +9,9 @@ The `ORDER BY` clause contains - a list of expressions, e.g. `ORDER BY visits, search_phrase`, - a list of numbers referring to columns in the `SELECT` clause, e.g. `ORDER BY 2, 1`, or -- `ALL` which means all columns of the `SELECT` clause, e.g. `ORDER BY ALL`. +- `*` (without other expressions or numbers) which means all columns of the `SELECT` clause: `ORDER BY *`. To disable sorting by column numbers, set setting [enable_positional_arguments](../../../operations/settings/settings.md#enable-positional-arguments) = 0. -To disable sorting by `ALL`, set setting [enable_order_by_all](../../../operations/settings/settings.md#enable-order-by-all) = 0. The `ORDER BY` clause can be attributed by a `DESC` (descending) or `ASC` (ascending) modifier which determines the sorting direction. Unless an explicit sort order is specified, `ASC` is used by default. diff --git a/docs/ru/development/architecture.md b/docs/ru/development/architecture.md index b2e851a78cd..575799cccc4 100644 --- a/docs/ru/development/architecture.md +++ b/docs/ru/development/architecture.md @@ -63,7 +63,7 @@ ClickHouse — полноценная столбцовая СУБД. Данны Для байт-ориентированного ввода-вывода существуют абстрактные классы `ReadBuffer` и `WriteBuffer`. Они используются вместо `iostream`. Не волнуйтесь: каждый зрелый проект C++ использует что-то другое вместо `iostream` по уважительным причинам. -`ReadBuffer` и `WriteBuffer` — это просто непрерывный буфер и курсор, указывающий на позицию в этом буфере. Реализации могут как владеть так и не владеть памятью буфера. Существует виртуальный метод заполнения буфера следующими данными (для `ReadBuffer`) или сброса буфера куда-нибудь (например `WriteBuffer`). Виртуальные методы редко вызываются. +`ReadBuffer` и `WriteBuffer` — это просто непрерывный буфер и курсор, указывающий на позицию в этом буфере. Реализации могут как владеть, так и не владеть памятью буфера. 
Существует виртуальный метод заполнения буфера следующими данными (для `ReadBuffer`) или сброса буфера куда-нибудь (например `WriteBuffer`). Виртуальные методы редко вызываются. Реализации `ReadBuffer`/`WriteBuffer` используются для работы с файлами и файловыми дескрипторами, а также сетевыми сокетами, для реализации сжатия (`CompressedWriteBuffer` инициализируется вместе с другим `WriteBuffer` и осуществляет сжатие данных перед записью в него), и для других целей – названия `ConcatReadBuffer`, `LimitReadBuffer`, и `HashingWriteBuffer` говорят сами за себя. diff --git a/docs/ru/development/developer-instruction.md b/docs/ru/development/developer-instruction.md index c63622594e4..01ff4dd5f28 100644 --- a/docs/ru/development/developer-instruction.md +++ b/docs/ru/development/developer-instruction.md @@ -71,7 +71,7 @@ ClickHouse не работает и не собирается на 32-битны Please make sure you have the correct access rights and the repository exists. -Как правило это означает, что отсутствуют ssh ключи для соединения с GitHub. Ключи расположены в директории `~/.ssh`. В интерфейсе GitHub, в настройках, необходимо загрузить публичные ключи, чтобы он их понимал. +Как правило, это означает, что отсутствуют ssh ключи для соединения с GitHub. Ключи расположены в директории `~/.ssh`. В интерфейсе GitHub, в настройках, необходимо загрузить публичные ключи, чтобы он их понимал. Вы также можете клонировать репозиторий по протоколу https: @@ -199,7 +199,7 @@ sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" В случае успешного запуска, вы увидите прогресс сборки - количество обработанных задач и общее количество задач. -В процессе сборки могут появится сообщения `libprotobuf WARNING` про protobuf файлы в библиотеке libhdfs2. Это не имеет значения. +В процессе сборки могут появиться сообщения `libprotobuf WARNING` про protobuf файлы в библиотеке libhdfs2. Это не имеет значения. При успешной сборке, вы получите готовый исполняемый файл `ClickHouse/build/programs/clickhouse`: @@ -207,7 +207,7 @@ sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" ## Запуск собранной версии ClickHouse {#zapusk-sobrannoi-versii-clickhouse} -Для запуска сервера из под текущего пользователя, с выводом логов в терминал и с использованием примеров конфигурационных файлов, расположенных в исходниках, перейдите в директорию `ClickHouse/programs/server/` (эта директория находится не в директории build) и выполните: +Для запуска сервера из-под текущего пользователя, с выводом логов в терминал и с использованием примеров конфигурационных файлов, расположенных в исходниках, перейдите в директорию `ClickHouse/programs/server/` (эта директория находится не в директории build) и выполните: ../../build/programs/clickhouse server diff --git a/docs/ru/engines/table-engines/mergetree-family/collapsingmergetree.md b/docs/ru/engines/table-engines/mergetree-family/collapsingmergetree.md index cfafddf0bc2..4a7d81d38fc 100644 --- a/docs/ru/engines/table-engines/mergetree-family/collapsingmergetree.md +++ b/docs/ru/engines/table-engines/mergetree-family/collapsingmergetree.md @@ -37,7 +37,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **Секции запроса** -При создании таблицы с движком `CollapsingMergeTree` используются те же [секции запроса](mergetree.md#table_engine-mergetree-creating-a-table) что и при создании таблицы с движком `MergeTree`. 
+При создании таблицы с движком `CollapsingMergeTree` используются те же [секции запроса](mergetree.md#table_engine-mergetree-creating-a-table), что и при создании таблицы с движком `MergeTree`.
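As a sketch of this clause reuse, a `CollapsingMergeTree` table is declared with the same clauses as a `MergeTree` table plus a sign column parameter; the table and column names below are illustrative:

```sql
-- Sign = 1 marks a "state" row, Sign = -1 marks the row that cancels it.
CREATE TABLE UAct
(
    UserID    UInt64,
    PageViews UInt8,
    Duration  UInt8,
    Sign      Int8
)
ENGINE = CollapsingMergeTree(Sign)
ORDER BY UserID;
```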
diff --git a/docs/ru/engines/table-engines/special/buffer.md b/docs/ru/engines/table-engines/special/buffer.md index 1fd8483e54d..3d2f1ee850d 100644 --- a/docs/ru/engines/table-engines/special/buffer.md +++ b/docs/ru/engines/table-engines/special/buffer.md @@ -42,7 +42,7 @@ CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10 В качестве имени базы данных и имени таблицы можно указать пустые строки в одинарных кавычках. Это обозначает отсутствие таблицы назначения. В таком случае, при достижении условий на сброс данных, буфер будет просто очищаться. Это может быть полезным, чтобы хранить в оперативке некоторое окно данных. При чтении из таблицы типа Buffer, будут обработаны данные, как находящиеся в буфере, так и данные из таблицы назначения (если такая есть). -Но следует иметь ввиду, что таблица Buffer не поддерживает индекс. То есть, данные в буфере будут просканированы полностью, что может быть медленно для буферов большого размера. (Для данных в подчинённой таблице, будет использоваться тот индекс, который она поддерживает.) +Но следует иметь в виду, что таблица Buffer не поддерживает индекс. То есть, данные в буфере будут просканированы полностью, что может быть медленно для буферов большого размера. (Для данных в подчинённой таблице, будет использоваться тот индекс, который она поддерживает.) Если множество столбцов таблицы Buffer не совпадает с множеством столбцов подчинённой таблицы, то будут вставлено подмножество столбцов, которое присутствует в обеих таблицах. @@ -66,4 +66,4 @@ CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10 Таблицы типа Buffer используются в тех случаях, когда от большого количества серверов поступает слишком много INSERT-ов в единицу времени, и нет возможности заранее самостоятельно буферизовать данные перед вставкой, в результате чего, INSERT-ы не успевают выполняться. -Заметим, что даже для таблиц типа Buffer не имеет смысла вставлять данные по одной строке, так как таким образом будет достигнута скорость всего лишь в несколько тысяч строк в секунду, тогда как при вставке более крупными блоками, достижимо более миллиона строк в секунду (смотрите раздел [«Производительность»](../../../introduction/performance.md). +Заметим, что даже для таблиц типа Buffer не имеет смысла вставлять данные по одной строке, так как таким образом будет достигнута скорость всего лишь в несколько тысяч строк в секунду, тогда как при вставке более крупными блоками, достижимо более миллиона строк в секунду (смотрите раздел [«Производительность»](../../../introduction/performance.md)). diff --git a/docs/ru/interfaces/cli.md b/docs/ru/interfaces/cli.md index 8910c258788..4d19cf50ae1 100644 --- a/docs/ru/interfaces/cli.md +++ b/docs/ru/interfaces/cli.md @@ -177,11 +177,11 @@ URI позволяет подключаться к нескольким хост -Строка подключения должна быть указана в первом аргументе clickhouse-client. Строка подключения может комбинироваться с другими [параметрами командной строки] (#command-line-options) кроме `--host/-h` и `--port`. +Строка подключения должна быть указана в первом аргументе clickhouse-client. Строка подключения может комбинироваться с другими [параметрами командной строки](#command-line-options) кроме `--host/-h` и `--port`. Для компонента `query_parameter` разрешены следующие ключи: -- `secure` или сокращенно `s` - без значение. Если параметр указан, то соединение с сервером будет осуществляться по защищенному каналу (TLS). См. `secure` в [command-line-options](#command-line-options). 
+- `secure` или сокращенно `s` - без значения. Если параметр указан, то соединение с сервером будет осуществляться по защищенному каналу (TLS). См. `secure` в [command-line-options](#command-line-options). ### Кодирование URI {#connection_string_uri_percent_encoding} @@ -206,7 +206,7 @@ clickhouse-client clickhouse://john:secret@127.0.0.1:9000 clickhouse-client clickhouse://[::1]:9000 ``` -Подключиться к localhost через порт 9000 многострочном режиме. +Подключиться к localhost через порт 9000 в многострочном режиме. ``` bash clickhouse-client clickhouse://localhost:9000 '-m' diff --git a/docs/ru/operations/clickhouse-keeper.md b/docs/ru/operations/clickhouse-keeper.md index 3a931529b32..9f1301d817d 100644 --- a/docs/ru/operations/clickhouse-keeper.md +++ b/docs/ru/operations/clickhouse-keeper.md @@ -69,7 +69,7 @@ ClickHouse Keeper может использоваться как равноце :::note -В случае изменения топологии кластера ClickHouse Keeper(например, замены сервера), удостоверьтесь, что вы сохраняеете отношение `server_id` - `hostname`, не переиспользуете существующие `server_id` для для новых серверов и не перемешиваете идентификаторы. Подобные ошибки могут случаться, если вы используете автоматизацию при разворачивании кластера без логики сохранения идентификаторов. +В случае изменения топологии кластера ClickHouse Keeper(например, замены сервера), удостоверьтесь, что вы сохраняеете отношение `server_id` - `hostname`, не переиспользуете существующие `server_id` для новых серверов и не перемешиваете идентификаторы. Подобные ошибки могут случаться, если вы используете автоматизацию при разворачивании кластера без логики сохранения идентификаторов. ::: Примеры конфигурации кворума с тремя узлами можно найти в [интеграционных тестах](https://github.com/ClickHouse/ClickHouse/tree/master/tests/integration) с префиксом `test_keeper_`. Пример конфигурации для сервера №1: @@ -337,7 +337,7 @@ clickhouse-keeper-converter --zookeeper-logs-dir /var/lib/zookeeper/version-2 -- После того, как выполнили действия выше выполните следующие шаги. 1. Выберете одну ноду Keeper, которая станет новым лидером. Учтите, что данные с этой ноды будут использованы всем кластером, поэтому рекомендуется выбрать ноду с наиболее актуальным состоянием. -2. Перед дальнейшими действиям сделайте резервную копию данных из директорий `log_storage_path` и `snapshot_storage_path`. +2. Перед дальнейшими действиями сделайте резервную копию данных из директорий `log_storage_path` и `snapshot_storage_path`. 3. Измените настройки на всех нодах кластера, которые вы собираетесь использовать. 4. Отправьте команду `rcvr` на ноду, которую вы выбрали, или остановите ее и запустите заново с аргументом `--force-recovery`. Это переведет ноду в режим восстановления. 5. Запускайте остальные ноды кластера по одной и проверяйте, что команда `mntr` возвращает `follower` в выводе состояния `zk_server_state` перед тем, как запустить следующую ноду. diff --git a/docs/ru/operations/configuration-files.md b/docs/ru/operations/configuration-files.md index 3b037521692..74f7d217fb7 100644 --- a/docs/ru/operations/configuration-files.md +++ b/docs/ru/operations/configuration-files.md @@ -89,7 +89,7 @@ $ cat /etc/clickhouse-server/users.d/alice.xml Вы можете использовать симметричное шифрование для зашифровки элемента конфигурации, например, поля password. 
Чтобы это сделать, сначала настройте [кодек шифрования](../sql-reference/statements/create/table.md#encryption-codecs), затем добавьте аттибут`encrypted_by` с именем кодека шифрования как значение к элементу, который надо зашифровать. -В отличии от аттрибутов `from_zk`, `from_env` и `incl` (или элемента `include`), подстановка, т.е. расшифровка зашифрованного значения, не выподняется в файле предобработки. Расшифровка происходит только во время исполнения в серверном процессе. +В отличие от аттрибутов `from_zk`, `from_env` и `incl` (или элемента `include`), подстановка, т.е. расшифровка зашифрованного значения, не выподняется в файле предобработки. Расшифровка происходит только во время исполнения в серверном процессе. Пример: @@ -110,7 +110,7 @@ $ cat /etc/clickhouse-server/users.d/alice.xml ``` -Чтобы получить зашифрованное значение может быть использовано приложение-пример `encrypt_decrypt` . +Чтобы получить зашифрованное значение, может быть использовано приложение-пример `encrypt_decrypt` . Пример: diff --git a/docs/ru/operations/utilities/clickhouse-benchmark.md b/docs/ru/operations/utilities/clickhouse-benchmark.md index 73de78d1c15..eb342bea9a7 100644 --- a/docs/ru/operations/utilities/clickhouse-benchmark.md +++ b/docs/ru/operations/utilities/clickhouse-benchmark.md @@ -50,7 +50,7 @@ clickhouse-benchmark [keys] < queries_file; - `-r`, `--randomize` — использовать случайный порядок выполнения запросов при наличии более одного входного запроса. - `-s`, `--secure` — используется `TLS` соединение. - `-t N`, `--timelimit=N` — лимит по времени в секундах. `clickhouse-benchmark` перестает отправлять запросы при достижении лимита по времени. Значение по умолчанию: 0 (лимит отключен). -- `--confidence=N` — уровень доверия для T-критерия. Возможные значения: 0 (80%), 1 (90%), 2 (95%), 3 (98%), 4 (99%), 5 (99.5%). Значение по умолчанию: 5. В [режиме сравнения](#clickhouse-benchmark-comparison-mode) `clickhouse-benchmark` проверяет [двухвыборочный t-критерий Стьюдента для независимых выборок](https://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test) чтобы определить, различны ли две выборки при выбранном уровне доверия. +- `--confidence=N` — уровень доверия для T-критерия. Возможные значения: 0 (80%), 1 (90%), 2 (95%), 3 (98%), 4 (99%), 5 (99.5%). Значение по умолчанию: 5. В [режиме сравнения](#clickhouse-benchmark-comparison-mode) `clickhouse-benchmark` проверяет [двухвыборочный t-критерий Стьюдента для независимых выборок](https://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test), чтобы определить, различны ли две выборки при выбранном уровне доверия. - `--cumulative` — выводить статистику за все время работы, а не за последний временной интервал. - `--database=DATABASE_NAME` — имя базы данных ClickHouse. Значение по умолчанию: `default`. - `--json=FILEPATH` — дополнительный вывод в формате `JSON`. Когда этот ключ указан, `clickhouse-benchmark` выводит отчет в указанный JSON-файл. 
diff --git a/docs/ru/sql-reference/data-types/datetime.md b/docs/ru/sql-reference/data-types/datetime.md index 80d844a1713..57f24786bb7 100644 --- a/docs/ru/sql-reference/data-types/datetime.md +++ b/docs/ru/sql-reference/data-types/datetime.md @@ -33,7 +33,7 @@ ClickHouse отображает значения в зависимости от ## Примеры {#primery} -**1.** Создание таблицы с столбцом типа `DateTime` и вставка данных в неё: +**1.** Создание таблицы со столбцом типа `DateTime` и вставка данных в неё: ``` sql CREATE TABLE dt diff --git a/docs/ru/sql-reference/functions/arithmetic-functions.md b/docs/ru/sql-reference/functions/arithmetic-functions.md index 73bac0595e1..ca7a4566c6c 100644 --- a/docs/ru/sql-reference/functions/arithmetic-functions.md +++ b/docs/ru/sql-reference/functions/arithmetic-functions.md @@ -172,7 +172,7 @@ multiplyDecimal(a, b[, result_scale]) ``` :::note -Эта функция работают гораздо медленнее обычной `multiply`. +Эта функция работает гораздо медленнее обычной `multiply`. В случае, если нет необходимости иметь фиксированную точность и/или нужны быстрые вычисления, следует использовать [multiply](#multiply). ::: diff --git a/docs/ru/sql-reference/functions/array-functions.md b/docs/ru/sql-reference/functions/array-functions.md index 659e2d3f75e..1f06bdf264a 100644 --- a/docs/ru/sql-reference/functions/array-functions.md +++ b/docs/ru/sql-reference/functions/array-functions.md @@ -488,7 +488,7 @@ arrayPushBack(array, single_value) **Аргументы** - `array` – массив. -- `single_value` – значение добавляемого элемента. В массив с числам можно добавить только числа, в массив со строками только строки. При добавлении чисел ClickHouse автоматически приводит тип `single_value` к типу данных массива. Подробнее о типах данных в ClickHouse читайте в разделе «[Типы данных](../../sql-reference/functions/array-functions.md#data_types)». Может быть равно `NULL`, в этом случае функция добавит элемент `NULL` в массив, а тип элементов массива преобразует в `Nullable`. +- `single_value` – значение добавляемого элемента. В массив с числами можно добавить только числа, в массив со строками только строки. При добавлении чисел ClickHouse автоматически приводит тип `single_value` к типу данных массива. Подробнее о типах данных в ClickHouse читайте в разделе «[Типы данных](../../sql-reference/functions/array-functions.md#data_types)». Может быть равно `NULL`, в этом случае функция добавит элемент `NULL` в массив, а тип элементов массива преобразует в `Nullable`. **Пример** @@ -513,7 +513,7 @@ arrayPushFront(array, single_value) **Аргументы** - `array` – массив. -- `single_value` – значение добавляемого элемента. В массив с числам можно добавить только числа, в массив со строками только строки. При добавлении чисел ClickHouse автоматически приводит тип `single_value` к типу данных массива. Подробнее о типах данных в ClickHouse читайте в разделе «[Типы данных](../../sql-reference/functions/array-functions.md#data_types)». Может быть равно `NULL`, в этом случае функция добавит элемент `NULL` в массив, а тип элементов массива преобразует в `Nullable`. +- `single_value` – значение добавляемого элемента. В массив с числами можно добавить только числа, в массив со строками только строки. При добавлении чисел ClickHouse автоматически приводит тип `single_value` к типу данных массива. Подробнее о типах данных в ClickHouse читайте в разделе «[Типы данных](../../sql-reference/functions/array-functions.md#data_types)». 
Может быть равно `NULL`, в этом случае функция добавит элемент `NULL` в массив, а тип элементов массива преобразует в `Nullable`. **Пример** diff --git a/docs/ru/sql-reference/statements/select/distinct.md b/docs/ru/sql-reference/statements/select/distinct.md index 58fe16b16d9..ad310434598 100644 --- a/docs/ru/sql-reference/statements/select/distinct.md +++ b/docs/ru/sql-reference/statements/select/distinct.md @@ -92,7 +92,7 @@ ClickHouse поддерживает использование секций `DIS ## Обработка NULL {#null-processing} -`DISTINCT` работает с [NULL](../../syntax.md#null-literal) как-будто `NULL` — обычное значение и `NULL==NULL`. Другими словами, в результате `DISTINCT`, различные комбинации с `NULL` встретятся только один раз. Это отличается от обработки `NULL` в большинстве других контекстов. +`DISTINCT` работает с [NULL](../../syntax.md#null-literal) как будто `NULL` — обычное значение и `NULL==NULL`. Другими словами, в результате `DISTINCT`, различные комбинации с `NULL` встретятся только один раз. Это отличается от обработки `NULL` в большинстве других контекстов. ## Альтернативы {#alternatives} diff --git a/docs/ru/sql-reference/table-functions/cluster.md b/docs/ru/sql-reference/table-functions/cluster.md index f148a21294a..bb22b38f8f9 100644 --- a/docs/ru/sql-reference/table-functions/cluster.md +++ b/docs/ru/sql-reference/table-functions/cluster.md @@ -33,7 +33,7 @@ clusterAllReplicas('cluster_name', db, table[, sharding_key]) **Использование макросов** -`cluster_name` может содержать макрос — подстановку в фигурных скобках. Эта подстановка заменяется на соответствующее значение из секции [macros](../../operations/server-configuration-parameters/settings.md#macros) конфигурационного файла . +`cluster_name` может содержать макрос — подстановку в фигурных скобках. Эта подстановка заменяется на соответствующее значение из секции [macros](../../operations/server-configuration-parameters/settings.md#macros) конфигурационного файла. Пример: diff --git a/docs/zh/operations/system-tables/dictionaries.md b/docs/zh/operations/system-tables/dictionaries.md index 105a591cf69..0cf91e45e86 100644 --- a/docs/zh/operations/system-tables/dictionaries.md +++ b/docs/zh/operations/system-tables/dictionaries.md @@ -20,7 +20,7 @@ machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3 - `LOADED_AND_RELOADING` — Dictionary is loaded successfully, and is being reloaded right now (frequent reasons: [SYSTEM RELOAD DICTIONARY](../../sql-reference/statements/system.md#query_language-system-reload-dictionary) 查询,超时,字典配置已更改)。 - `FAILED_AND_RELOADING` — Could not load the dictionary as a result of an error and is loading now. - `origin` ([字符串](../../sql-reference/data-types/string.md)) — Path to the configuration file that describes the dictionary. -- `type` ([字符串](../../sql-reference/data-types/string.md)) — Type of a dictionary allocation. [在内存中存储字典](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md). +- `type` ([字符串](../../sql-reference/data-types/string.md)) — Type of dictionary allocation. [在内存中存储字典](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md). - `key` — [密钥类型](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-key):数字键 ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) or Сomposite key ([字符串](../../sql-reference/data-types/string.md)) — form “(type 1, type 2, …, type n)”. 
- `attribute.names` ([阵列](../../sql-reference/data-types/array.md)([字符串](../../sql-reference/data-types/string.md))) — Array of [属性名称](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes) 由字典提供。 - `attribute.types` ([阵列](../../sql-reference/data-types/array.md)([字符串](../../sql-reference/data-types/string.md))) — Corresponding array of [属性类型](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes) 这是由字典提供。 diff --git a/docs/zh/sql-reference/statements/select/order-by.md b/docs/zh/sql-reference/statements/select/order-by.md index 3286fc9f9e7..9540c96a10d 100644 --- a/docs/zh/sql-reference/statements/select/order-by.md +++ b/docs/zh/sql-reference/statements/select/order-by.md @@ -61,14 +61,14 @@ sidebar_label: ORDER BY 我们只建议使用 `COLLATE` 对于少量行的最终排序,因为排序与 `COLLATE` 比正常的按字节排序效率低。 -## ORDER BY ALL +## ORDER BY * -`ORDER BY ALL` 对所有选定的列进行升序排序。 +`ORDER BY *` 对所有选定的列进行升序排序。 示例: ``` sql -SELECT a, b, c FROM t ORDER BY ALL +SELECT a, b, c FROM t ORDER BY * ``` 等同于: diff --git a/programs/benchmark/Benchmark.cpp b/programs/benchmark/Benchmark.cpp index 961c678b936..fac88c0621f 100644 --- a/programs/benchmark/Benchmark.cpp +++ b/programs/benchmark/Benchmark.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index fdd262f185d..7a77b7dd0ec 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -845,83 +845,7 @@ bool Client::processWithFuzzing(const String & full_query) have_error = true; } - // Check that after the query is formatted, we can parse it back, - // format again and get the same result. Unfortunately, we can't - // compare the ASTs, which would be more sensitive to errors. This - // double formatting check doesn't catch all errors, e.g. we can - // format query incorrectly, but to a valid SQL that we can then - // parse and format into the same SQL. - // There are some complicated cases where we can generate the SQL - // which we can't parse: - // * first argument of lambda() replaced by fuzzer with - // something else, leading to constructs such as - // arrayMap((min(x) + 3) -> x + 1, ....) - // * internals of Enum replaced, leading to: - // Enum(equals(someFunction(y), 3)). - // And there are even the cases when we can parse the query, but - // it's logically incorrect and its formatting is a mess, such as - // when `lambda()` function gets substituted into a wrong place. - // To avoid dealing with these cases, run the check only for the - // queries we were able to successfully execute. - // Another caveat is that sometimes WITH queries are not executed, - // if they are not referenced by the main SELECT, so they can still - // have the aforementioned problems. Disable this check for such - // queries, for lack of a better solution. - // There is also a problem that fuzzer substitutes positive Int64 - // literals or Decimal literals, which are then parsed back as - // UInt64, and suddenly duplicate alias substitution starts or stops - // working (ASTWithAlias::formatImpl) or something like that. - // So we compare not even the first and second formatting of the - // query, but second and third. - // If you have to add any more workarounds to this check, just remove - // it altogether, it's not so useful. 
- if (ast_to_process && !have_error && !queryHasWithClause(*ast_to_process)) - { - ASTPtr ast_2; - try - { - const auto * tmp_pos = query_to_execute.c_str(); - ast_2 = parseQuery(tmp_pos, tmp_pos + query_to_execute.size(), false /* allow_multi_statements */); - } - catch (Exception & e) - { - if (e.code() != ErrorCodes::SYNTAX_ERROR && - e.code() != ErrorCodes::TOO_DEEP_RECURSION) - throw; - } - - if (ast_2) - { - const auto text_2 = ast_2->formatForErrorMessage(); - const auto * tmp_pos = text_2.c_str(); - const auto ast_3 = parseQuery(tmp_pos, tmp_pos + text_2.size(), - false /* allow_multi_statements */); - const auto text_3 = ast_3 ? ast_3->formatForErrorMessage() : ""; - - if (text_3 != text_2) - { - fmt::print(stderr, "Found error: The query formatting is broken.\n"); - - printChangedSettings(); - - fmt::print(stderr, - "Got the following (different) text after formatting the fuzzed query and parsing it back:\n'{}'\n, expected:\n'{}'\n", - text_3, text_2); - fmt::print(stderr, "In more detail:\n"); - fmt::print(stderr, "AST-1 (generated by fuzzer):\n'{}'\n", ast_to_process->dumpTree()); - fmt::print(stderr, "Text-1 (AST-1 formatted):\n'{}'\n", query_to_execute); - fmt::print(stderr, "AST-2 (Text-1 parsed):\n'{}'\n", ast_2->dumpTree()); - fmt::print(stderr, "Text-2 (AST-2 formatted):\n'{}'\n", text_2); - fmt::print(stderr, "AST-3 (Text-2 parsed):\n'{}'\n", ast_3 ? ast_3->dumpTree() : ""); - fmt::print(stderr, "Text-3 (AST-3 formatted):\n'{}'\n", text_3); - fmt::print(stderr, "Text-3 must be equal to Text-2, but it is not.\n"); - - _exit(1); - } - } - } - - // The server is still alive so we're going to continue fuzzing. + // The server is still alive, so we're going to continue fuzzing. // Determine what we're going to use as the starting AST. 
if (have_error) { diff --git a/programs/keeper-converter/KeeperConverter.cpp b/programs/keeper-converter/KeeperConverter.cpp index 92bdea28738..8cd50d0892f 100644 --- a/programs/keeper-converter/KeeperConverter.cpp +++ b/programs/keeper-converter/KeeperConverter.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -39,7 +40,7 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv) try { - auto keeper_context = std::make_shared(true); + auto keeper_context = std::make_shared(true, std::make_shared()); keeper_context->setDigestEnabled(true); keeper_context->setSnapshotDisk(std::make_shared("Keeper-snapshots", options["output-dir"].as())); diff --git a/programs/keeper/CMakeLists.txt b/programs/keeper/CMakeLists.txt index 143ded0ee85..70e0f229fd4 100644 --- a/programs/keeper/CMakeLists.txt +++ b/programs/keeper/CMakeLists.txt @@ -39,8 +39,9 @@ if (BUILD_STANDALONE_KEEPER) ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperContext.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperStateManager.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperStorage.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperConstants.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperAsynchronousMetrics.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/pathUtils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperCommon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/SessionExpiryQueue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/SummingStateMachine.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/WriteBufferFromNuraftBuffer.cpp @@ -69,6 +70,7 @@ if (BUILD_STANDALONE_KEEPER) ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/ServerType.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/HTTPRequestHandlerFactoryMain.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/KeeperReadinessHandler.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/CloudPlacementInfo.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/HTTP/HTTPServer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/HTTP/ReadHeaders.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/HTTP/HTTPServerConnection.cpp diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index c751702dc6f..8972c82eab8 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -31,9 +32,10 @@ #include #include -#include #include #include +#include +#include #include "Core/Defines.h" #include "config.h" @@ -352,6 +354,11 @@ try std::string include_from_path = config().getString("include_from", "/etc/metrika.xml"); + if (config().has(DB::PlacementInfo::PLACEMENT_CONFIG_PREFIX)) + { + PlacementInfo::PlacementInfo::instance().initialize(config()); + } + GlobalThreadPool::initialize( config().getUInt("max_thread_pool_size", 100), config().getUInt("max_thread_pool_free_size", 1000), @@ -482,19 +489,28 @@ try /// Prometheus (if defined and not setup yet with http_port) port_name = "prometheus.port"; - createServer(listen_host, port_name, listen_try, [&, my_http_context = std::move(http_context)](UInt16 port) mutable - { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(my_http_context->getReceiveTimeout()); - socket.setSendTimeout(my_http_context->getSendTimeout()); - servers->emplace_back( - listen_host, - port_name, - "Prometheus: http://" + address.toString(), - std::make_unique( - 
std::move(my_http_context), createPrometheusMainHandlerFactory(*this, config_getter(), async_metrics, "PrometheusHandler-factory"), server_pool, socket, http_params)); - }); + createServer( + listen_host, + port_name, + listen_try, + [&, my_http_context = std::move(http_context)](UInt16 port) mutable + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(my_http_context->getReceiveTimeout()); + socket.setSendTimeout(my_http_context->getSendTimeout()); + auto metrics_writer = std::make_shared(config, "prometheus", async_metrics); + servers->emplace_back( + listen_host, + port_name, + "Prometheus: http://" + address.toString(), + std::make_unique( + std::move(my_http_context), + createPrometheusMainHandlerFactory(*this, config_getter(), metrics_writer, "PrometheusHandler-factory"), + server_pool, + socket, + http_params)); + }); /// HTTP control endpoints port_name = "keeper_server.http_control.port"; @@ -544,7 +560,7 @@ try auto main_config_reloader = std::make_unique( config_path, extra_paths, - config().getString("path", ""), + config().getString("path", KEEPER_DEFAULT_PATH), std::move(unused_cache), unused_event, [&](ConfigurationPtr config, bool /* initial_loading */) diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index d7acf4112a5..e19cf17c9f4 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -249,7 +249,7 @@ void LocalServer::tryInitPath() default_path = parent_folder / fmt::format("clickhouse-local-{}-{}-{}", getpid(), time(nullptr), randomSeed()); if (exists(default_path)) - throw Exception(ErrorCodes::FILE_ALREADY_EXISTS, "Unsuccessful attempt to create working directory: {} exist!", default_path.string()); + throw Exception(ErrorCodes::FILE_ALREADY_EXISTS, "Unsuccessful attempt to create working directory: {} already exists.", default_path.string()); create_directory(default_path); temporary_directory_to_delete = default_path; @@ -336,23 +336,23 @@ std::string LocalServer::getInitialCreateTableQuery() auto table_structure = config().getString("table-structure", "auto"); String table_file; - String format_from_file_name; + std::optional format_from_file_name; if (!config().has("table-file") || config().getString("table-file") == "-") { /// Use Unix tools stdin naming convention table_file = "stdin"; - format_from_file_name = FormatFactory::instance().getFormatFromFileDescriptor(STDIN_FILENO); + format_from_file_name = FormatFactory::instance().tryGetFormatFromFileDescriptor(STDIN_FILENO); } else { /// Use regular file auto file_name = config().getString("table-file"); table_file = quoteString(file_name); - format_from_file_name = FormatFactory::instance().getFormatFromFileName(file_name, false); + format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(file_name); } auto data_format = backQuoteIfNeed( - config().getString("table-data-format", config().getString("format", format_from_file_name.empty() ? "TSV" : format_from_file_name))); + config().getString("table-data-format", config().getString("format", format_from_file_name ? 
*format_from_file_name : "TSV"))); if (table_structure == "auto") diff --git a/programs/obfuscator/Obfuscator.cpp b/programs/obfuscator/Obfuscator.cpp index 7e09d5e8046..242e995e466 100644 --- a/programs/obfuscator/Obfuscator.cpp +++ b/programs/obfuscator/Obfuscator.cpp @@ -1310,7 +1310,7 @@ try throw ErrnoException(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Input must be seekable file (it will be read twice)"); SingleReadBufferIterator read_buffer_iterator(std::move(file)); - schema_columns = readSchemaFromFormat(input_format, {}, read_buffer_iterator, false, context_const); + schema_columns = readSchemaFromFormat(input_format, {}, read_buffer_iterator, context_const); } else { diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index f00da445c16..74fcc7326fc 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -97,6 +98,7 @@ #include #include #include +#include #include #include #include @@ -711,6 +713,22 @@ try getNumberOfPhysicalCPUCores(), // on ARM processors it can show only enabled at current moment cores std::thread::hardware_concurrency()); +#if defined(__x86_64__) + String cpu_info; +#define COLLECT_FLAG(X) \ + if (CPU::have##X()) \ + { \ + if (!cpu_info.empty()) \ + cpu_info += ", "; \ + cpu_info += #X; \ + } + + CPU_ID_ENUMERATE(COLLECT_FLAG) +#undef COLLECT_FLAG + + LOG_INFO(log, "Available CPU instruction sets: {}", cpu_info); +#endif + sanityChecks(*this); // Initialize global thread pool. Do it before we fetch configs from zookeeper @@ -1274,7 +1292,7 @@ try auto main_config_reloader = std::make_unique( config_path, extra_paths, - config().getString("path", ""), + config().getString("path", DBMS_DEFAULT_PATH), std::move(main_config_zk_node_cache), main_config_zk_changed_event, [&](ConfigurationPtr config, bool initial_loading) @@ -1373,7 +1391,7 @@ try global_context->setMaxDatabaseNumToWarn(new_server_settings.max_database_num_to_warn); global_context->setMaxPartNumToWarn(new_server_settings.max_part_num_to_warn); - ConcurrencyControl::SlotCount concurrent_threads_soft_limit = ConcurrencyControl::Unlimited; + SlotCount concurrent_threads_soft_limit = UnlimitedSlots; if (new_server_settings.concurrent_threads_soft_limit_num > 0 && new_server_settings.concurrent_threads_soft_limit_num < concurrent_threads_soft_limit) concurrent_threads_soft_limit = new_server_settings.concurrent_threads_soft_limit_num; if (new_server_settings.concurrent_threads_soft_limit_ratio_to_cores > 0) @@ -1960,6 +1978,11 @@ try load_metadata_tasks); } + if (config().has(DB::PlacementInfo::PLACEMENT_CONFIG_PREFIX)) + { + PlacementInfo::PlacementInfo::instance().initialize(config()); + } + /// Do not keep tasks in server, they should be kept inside databases. Used here to make dependent tasks only. 
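// Aside: the COLLECT_FLAG / CPU_ID_ENUMERATE pair added near the top of this Server.cpp
// hunk is an X-macro: the feature list is written once and expanded with different
// per-item macros. Below is a minimal sketch of the idiom; the two-entry FEATURE_LIST
// and the have*() probes are made up for illustration, while the real list is
// CPU_ID_ENUMERATE in src/Common/CPUID.h.
#include <iostream>
#include <string>

bool haveSSE42() { return true;  }   // hypothetical probe; the real ones query CPUID
bool haveAVX2()  { return false; }   // hypothetical probe

#define FEATURE_LIST(OP) \
    OP(SSE42) \
    OP(AVX2)

int main()
{
    std::string cpu_info;
#define COLLECT_FLAG(X) \
    if (have##X()) \
    { \
        if (!cpu_info.empty()) \
            cpu_info += ", "; \
        cpu_info += #X; \
    }
    FEATURE_LIST(COLLECT_FLAG)
#undef COLLECT_FLAG
    std::cout << "Available CPU instruction sets: " << cpu_info << "\n";  // prints "SSE42"
}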
load_metadata_tasks.clear(); load_metadata_tasks.shrink_to_fit(); diff --git a/programs/server/dashboard.html b/programs/server/dashboard.html index 3c2916b6a16..5d51b8056e7 100644 --- a/programs/server/dashboard.html +++ b/programs/server/dashboard.html @@ -521,7 +521,7 @@ const errorMessages = [ /// Query to fill `queries` list for the dashboard -let search_query = `SELECT title, query FROM system.dashboards WHERE dashboard = 'overview'`; +let search_query = `SELECT title, query FROM system.dashboards WHERE dashboard = 'Overview'`; let customized = false; let queries = []; diff --git a/src/AggregateFunctions/AggregateFunctionSum.h b/src/AggregateFunctions/AggregateFunctionSum.h index 5781ab69c6b..58aaddf357a 100644 --- a/src/AggregateFunctions/AggregateFunctionSum.h +++ b/src/AggregateFunctions/AggregateFunctionSum.h @@ -146,9 +146,7 @@ struct AggregateFunctionSumData size_t count = end - start; const auto * end_ptr = ptr + count; - if constexpr ( - (is_integer && !is_big_int_v) - || (is_decimal && !std::is_same_v && !std::is_same_v)) + if constexpr ((is_integer || is_decimal) && !is_over_big_int) { /// For integers we can vectorize the operation if we replace the null check using a multiplication (by 0 for null, 1 for not null) /// https://quick-bench.com/q/MLTnfTvwC2qZFVeWHfOBR3U7a8I @@ -163,8 +161,39 @@ struct AggregateFunctionSumData Impl::add(sum, local_sum); return; } + else if constexpr (is_over_big_int) + { + /// Use a mask to discard or keep the value to reduce branch miss. + /// Notice that for (U)Int128 or Decimal128, MaskType is Int8 instead of Int64, otherwise extra branches will be introduced by compiler (for unknown reason) and performance will be worse. + using MaskType = std::conditional_t; + alignas(64) const MaskType masks[2] = {0, -1}; + T local_sum{}; + while (ptr < end_ptr) + { + Value v = *ptr; + if constexpr (!add_if_zero) + { + if constexpr (is_integer) + v &= masks[!!*condition_map]; + else + v.value &= masks[!!*condition_map]; + } + else + { + if constexpr (is_integer) + v &= masks[!*condition_map]; + else + v.value &= masks[!*condition_map]; + } - if constexpr (std::is_floating_point_v) + Impl::add(local_sum, v); + ++ptr; + ++condition_map; + } + Impl::add(sum, local_sum); + return; + } + else if constexpr (std::is_floating_point_v) { /// For floating point we use a similar trick as above, except that now we reinterpret the floating point number as an unsigned /// integer of the same size and use a mask instead (0 to discard, 0xFF..FF to keep) diff --git a/src/Analyzer/Passes/ArrayExistsToHasPass.cpp b/src/Analyzer/Passes/ArrayExistsToHasPass.cpp index 36c3df4d93a..62db502e1dc 100644 --- a/src/Analyzer/Passes/ArrayExistsToHasPass.cpp +++ b/src/Analyzer/Passes/ArrayExistsToHasPass.cpp @@ -1,6 +1,7 @@ #include #include +#include #include @@ -83,7 +84,8 @@ public: return; } - auto has_function = FunctionFactory::instance().get("has", getContext()); + auto has_function = createInternalFunctionHasOverloadResolver(); + array_exists_function_arguments_nodes[0] = std::move(array_exists_function_arguments_nodes[1]); array_exists_function_arguments_nodes[1] = std::move(has_constant_element_argument); array_exists_function_node->resolveAsFunction(has_function->build(array_exists_function_node->getArgumentColumns())); diff --git a/src/Analyzer/Passes/CNF.cpp b/src/Analyzer/Passes/CNF.cpp index aa6ee539934..71549f9e71d 100644 --- a/src/Analyzer/Passes/CNF.cpp +++ b/src/Analyzer/Passes/CNF.cpp @@ -10,6 +10,7 @@ #include #include +#include #include @@ -79,7 +80,7 @@ public: 
if (name == "and" || name == "or") { - auto function_resolver = FunctionFactory::instance().get(name, current_context); + auto function_resolver = name == "and" ? createInternalFunctionAndOverloadResolver() : createInternalFunctionOrOverloadResolver(); const auto & arguments = function_node->getArguments().getNodes(); if (arguments.size() > 2) @@ -110,10 +111,10 @@ private: class PushNotVisitor { public: - explicit PushNotVisitor(const ContextPtr & context) - : not_function_resolver(FunctionFactory::instance().get("not", context)) - , or_function_resolver(FunctionFactory::instance().get("or", context)) - , and_function_resolver(FunctionFactory::instance().get("and", context)) + explicit PushNotVisitor() + : not_function_resolver(createInternalFunctionNotOverloadResolver()) + , or_function_resolver(createInternalFunctionOrOverloadResolver()) + , and_function_resolver(createInternalFunctionAndOverloadResolver()) {} void visit(QueryTreeNodePtr & node, bool add_negation) @@ -162,10 +163,10 @@ private: class PushOrVisitor { public: - PushOrVisitor(ContextPtr context, size_t max_atoms_) + explicit PushOrVisitor(size_t max_atoms_) : max_atoms(max_atoms_) - , and_resolver(FunctionFactory::instance().get("and", context)) - , or_resolver(FunctionFactory::instance().get("or", context)) + , and_resolver(createInternalFunctionAndOverloadResolver()) + , or_resolver(createInternalFunctionOrOverloadResolver()) {} bool visit(QueryTreeNodePtr & node, size_t num_atoms) @@ -513,11 +514,11 @@ std::optional CNF::tryBuildCNF(const QueryTreeNodePtr & node, ContextPtr co } { - PushNotVisitor visitor(context); + PushNotVisitor visitor; visitor.visit(node_cloned, false); } - if (PushOrVisitor visitor(context, max_atoms); + if (PushOrVisitor visitor(max_atoms); !visitor.visit(node_cloned, atom_count)) return std::nullopt; @@ -542,7 +543,7 @@ CNF CNF::toCNF(const QueryTreeNodePtr & node, ContextPtr context, size_t max_gro return *cnf; } -QueryTreeNodePtr CNF::toQueryTree(ContextPtr context) const +QueryTreeNodePtr CNF::toQueryTree() const { if (statements.empty()) return nullptr; @@ -550,9 +551,9 @@ QueryTreeNodePtr CNF::toQueryTree(ContextPtr context) const QueryTreeNodes and_arguments; and_arguments.reserve(statements.size()); - auto not_resolver = FunctionFactory::instance().get("not", context); - auto or_resolver = FunctionFactory::instance().get("or", context); - auto and_resolver = FunctionFactory::instance().get("and", context); + auto not_resolver = createInternalFunctionNotOverloadResolver(); + auto or_resolver = createInternalFunctionOrOverloadResolver(); + auto and_resolver = createInternalFunctionAndOverloadResolver(); const auto function_node_from_atom = [&](const auto & atom) -> QueryTreeNodePtr { diff --git a/src/Analyzer/Passes/CNF.h b/src/Analyzer/Passes/CNF.h index ec639cd6679..9325d97d2f2 100644 --- a/src/Analyzer/Passes/CNF.h +++ b/src/Analyzer/Passes/CNF.h @@ -54,7 +54,7 @@ public: static std::optional tryBuildCNF(const QueryTreeNodePtr & node, ContextPtr context, size_t max_growth_multiplier = DEFAULT_MAX_GROWTH_MULTIPLIER); static CNF toCNF(const QueryTreeNodePtr & node, ContextPtr context, size_t max_growth_multiplier = DEFAULT_MAX_GROWTH_MULTIPLIER); - QueryTreeNodePtr toQueryTree(ContextPtr context) const; + QueryTreeNodePtr toQueryTree() const; const auto & getStatements() const { diff --git a/src/Analyzer/Passes/ConvertOrLikeChainPass.cpp b/src/Analyzer/Passes/ConvertOrLikeChainPass.cpp index 905819bf49f..eb897ef8746 100644 --- a/src/Analyzer/Passes/ConvertOrLikeChainPass.cpp +++ 
b/src/Analyzer/Passes/ConvertOrLikeChainPass.cpp @@ -11,6 +11,8 @@ #include #include +#include +#include #include @@ -134,8 +136,10 @@ private: void ConvertOrLikeChainPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { - auto or_function_resolver = FunctionFactory::instance().get("or", context); - auto match_function_resolver = FunctionFactory::instance().get("multiMatchAny", context); + const auto & settings = context->getSettingsRef(); + auto match_function_resolver = createInternalMultiMatchAnyOverloadResolver(settings.allow_hyperscan, settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length, settings.reject_expensive_hyperscan_regexps); + auto or_function_resolver = createInternalFunctionOrOverloadResolver(); + ConvertOrLikeChainVisitor visitor(std::move(or_function_resolver), std::move(match_function_resolver), std::move(context)); visitor.visit(query_tree_node); } diff --git a/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp b/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp index 5ce1ea43f2f..96bc62212fd 100644 --- a/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp +++ b/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp @@ -339,7 +339,7 @@ void addIndexConstraint(Analyzer::CNF & cnf, const QueryTreeNodes & table_expres { Analyzer::CNF::OrGroup new_group; auto index_hint_node = std::make_shared("indexHint"); - index_hint_node->getArguments().getNodes().push_back(Analyzer::CNF{std::move(and_group)}.toQueryTree(context)); + index_hint_node->getArguments().getNodes().push_back(Analyzer::CNF{std::move(and_group)}.toQueryTree()); index_hint_node->resolveAsFunction(FunctionFactory::instance().get("indexHint", context)); new_group.insert({false, QueryTreeNodePtrWithHash{std::move(index_hint_node)}}); @@ -676,7 +676,7 @@ void optimizeNode(QueryTreeNodePtr & node, const QueryTreeNodes & table_expressi if (settings.optimize_using_constraints) optimizeWithConstraints(*cnf, table_expressions, context); - auto new_node = cnf->toQueryTree(context); + auto new_node = cnf->toQueryTree(); node = std::move(new_node); } diff --git a/src/Analyzer/Passes/CrossToInnerJoinPass.cpp b/src/Analyzer/Passes/CrossToInnerJoinPass.cpp index 154babf3d9a..d0a5656d334 100644 --- a/src/Analyzer/Passes/CrossToInnerJoinPass.cpp +++ b/src/Analyzer/Passes/CrossToInnerJoinPass.cpp @@ -12,6 +12,7 @@ #include #include +#include #include @@ -256,7 +257,7 @@ private: for (const auto & node : nodes) function_node->getArguments().getNodes().push_back(node); - const auto & function = FunctionFactory::instance().get("and", getContext()); + const auto & function = createInternalFunctionAndOverloadResolver(); function_node->resolveAsFunction(function->build(function_node->getArgumentColumns())); return function_node; } diff --git a/src/Analyzer/Passes/IfChainToMultiIfPass.cpp b/src/Analyzer/Passes/IfChainToMultiIfPass.cpp index 88e350ffa2e..70b717f3108 100644 --- a/src/Analyzer/Passes/IfChainToMultiIfPass.cpp +++ b/src/Analyzer/Passes/IfChainToMultiIfPass.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB { @@ -75,7 +76,8 @@ private: void IfChainToMultiIfPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { - auto multi_if_function_ptr = FunctionFactory::instance().get("multiIf", context); + const auto & settings = context->getSettingsRef(); + auto multi_if_function_ptr = createInternalMultiIfOverloadResolver(settings.allow_execute_multiif_columnar, settings.allow_experimental_variant_type, settings.use_variant_as_common_type); IfChainToMultiIfPassVisitor 
visitor(std::move(multi_if_function_ptr), std::move(context)); visitor.visit(query_tree_node); } diff --git a/src/Analyzer/Passes/MultiIfToIfPass.cpp b/src/Analyzer/Passes/MultiIfToIfPass.cpp index 8e09d5cab38..c42ea61b34a 100644 --- a/src/Analyzer/Passes/MultiIfToIfPass.cpp +++ b/src/Analyzer/Passes/MultiIfToIfPass.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB { @@ -54,7 +55,8 @@ private: void MultiIfToIfPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { - auto if_function_ptr = FunctionFactory::instance().get("if", context); + const auto & settings = context->getSettingsRef(); + auto if_function_ptr = createInternalFunctionIfOverloadResolver(settings.allow_experimental_variant_type, settings.use_variant_as_common_type); MultiIfToIfVisitor visitor(std::move(if_function_ptr), std::move(context)); visitor.visit(query_tree_node); } diff --git a/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp b/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp index ce368a69ba9..0d6f3fc2d87 100644 --- a/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp +++ b/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB { @@ -32,6 +33,11 @@ public: if (function_node->getArguments().getNodes().size() != 1) return; + /// forbid the optimization if return value of sum() and count() differs: + /// count() returns only UInt64 type, while sum() could return Nullable(). + if (!function_node->getResultType()->equals(DataTypeUInt64())) + return; + auto & first_argument = function_node->getArguments().getNodes()[0]; auto * first_argument_constant_node = first_argument->as(); if (!first_argument_constant_node) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 2510d35f720..376701f777e 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -79,8 +79,6 @@ #include #include #include -#include -#include namespace ProfileEvents { @@ -122,7 +120,6 @@ namespace ErrorCodes extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH; extern const int FUNCTION_CANNOT_HAVE_PARAMETERS; extern const int SYNTAX_ERROR; - extern const int UNEXPECTED_EXPRESSION; extern const int INVALID_IDENTIFIER; } @@ -1217,7 +1214,7 @@ private: static void expandGroupByAll(QueryNode & query_tree_node_typed); - void expandOrderByAll(QueryNode & query_tree_node_typed, const Settings & settings); + void expandOrderByAll(QueryNode & query_tree_node_typed); static std::string rewriteAggregateFunctionNameIfNeeded(const std::string & aggregate_function_name, NullsAction action, const ContextPtr & context); @@ -1380,6 +1377,8 @@ private: ProjectionNames resolveSortNodeList(QueryTreeNodePtr & sort_node_list, IdentifierResolveScope & scope); + void resolveGroupByNode(QueryNode & query_node_typed, IdentifierResolveScope & scope); + void resolveInterpolateColumnsNodeList(QueryTreeNodePtr & interpolate_node_list, IdentifierResolveScope & scope); void resolveWindowNodeList(QueryTreeNodePtr & window_node_list, IdentifierResolveScope & scope); @@ -2367,9 +2366,9 @@ void QueryAnalyzer::expandGroupByAll(QueryNode & query_tree_node_typed) query_tree_node_typed.setIsGroupByAll(false); } -void QueryAnalyzer::expandOrderByAll(QueryNode & query_tree_node_typed, const Settings & settings) +void QueryAnalyzer::expandOrderByAll(QueryNode & query_tree_node_typed) { - if (!settings.enable_order_by_all || !query_tree_node_typed.isOrderByAll()) + if (!query_tree_node_typed.isOrderByAll()) 
return; auto * all_node = query_tree_node_typed.getOrderBy().getNodes()[0]->as(); @@ -2390,9 +2389,6 @@ void QueryAnalyzer::expandOrderByAll(QueryNode & query_tree_node_typed, const Se throw Exception(ErrorCodes::LOGICAL_ERROR, "Expression nodes list expected 1 projection names. Actual {}", projection_names.size()); - if (Poco::toUpper(projection_names[0]) == "ALL") - throw Exception(ErrorCodes::UNEXPECTED_EXPRESSION, - "Cannot use ORDER BY ALL to sort a column with name 'all', please disable setting `enable_order_by_all` and try again"); } auto sort_node = std::make_shared(node, all_node->getSortDirection(), all_node->getNullsSortDirection()); @@ -5667,7 +5663,7 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi /// Do not constant fold get scalar functions bool disable_constant_folding = function_name == "__getScalar" || function_name == "shardNum" || - function_name == "shardCount" || function_name == "hostName"; + function_name == "shardCount" || function_name == "hostName" || function_name == "tcpPort"; /** If function is suitable for constant folding try to convert it to constant. * Example: SELECT plus(1, 1); @@ -6263,6 +6259,77 @@ ProjectionNames QueryAnalyzer::resolveSortNodeList(QueryTreeNodePtr & sort_node_ return result_projection_names; } +namespace +{ + +void expandTuplesInList(QueryTreeNodes & key_list) +{ + QueryTreeNodes expanded_keys; + expanded_keys.reserve(key_list.size()); + for (auto const & key : key_list) + { + if (auto * function = key->as(); function != nullptr && function->getFunctionName() == "tuple") + { + std::copy(function->getArguments().begin(), function->getArguments().end(), std::back_inserter(expanded_keys)); + } + else + expanded_keys.push_back(key); + } + key_list = std::move(expanded_keys); +} + +} + +/** Resolve GROUP BY clause. + */ +void QueryAnalyzer::resolveGroupByNode(QueryNode & query_node_typed, IdentifierResolveScope & scope) +{ + const auto & settings = scope.context->getSettingsRef(); + + if (query_node_typed.isGroupByWithGroupingSets()) + { + for (auto & grouping_sets_keys_list_node : query_node_typed.getGroupBy().getNodes()) + { + if (settings.enable_positional_arguments) + replaceNodesWithPositionalArguments(grouping_sets_keys_list_node, query_node_typed.getProjection().getNodes(), scope); + + resolveExpressionNodeList(grouping_sets_keys_list_node, scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/); + + // Remove redundant calls to `tuple` function. It simplifies checking if expression is an aggregation key. + // It's required to support queries like: SELECT number FROM numbers(3) GROUP BY (number, number % 2) + auto & group_by_list = grouping_sets_keys_list_node->as().getNodes(); + expandTuplesInList(group_by_list); + } + + if (scope.group_by_use_nulls) + { + for (const auto & grouping_set : query_node_typed.getGroupBy().getNodes()) + { + for (const auto & group_by_elem : grouping_set->as()->getNodes()) + scope.nullable_group_by_keys.insert(group_by_elem); + } + } + } + else + { + if (settings.enable_positional_arguments) + replaceNodesWithPositionalArguments(query_node_typed.getGroupByNode(), query_node_typed.getProjection().getNodes(), scope); + + resolveExpressionNodeList(query_node_typed.getGroupByNode(), scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/); + + // Remove redundant calls to `tuple` function. It simplifies checking if expression is an aggregation key. 
+ // It's required to support queries like: SELECT number FROM numbers(3) GROUP BY (number, number % 2) + auto & group_by_list = query_node_typed.getGroupBy().getNodes(); + expandTuplesInList(group_by_list); + + if (scope.group_by_use_nulls) + { + for (const auto & group_by_elem : query_node_typed.getGroupBy().getNodes()) + scope.nullable_group_by_keys.insert(group_by_elem); + } + } +} + /** Resolve interpolate columns nodes list. */ void QueryAnalyzer::resolveInterpolateColumnsNodeList(QueryTreeNodePtr & interpolate_node_list, IdentifierResolveScope & scope) @@ -6664,6 +6731,28 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, TableFunctionPtr table_function_ptr = TableFunctionFactory::instance().tryGet(table_function_name, scope_context); if (!table_function_ptr) { + String database_name = scope_context->getCurrentDatabase(); + String table_name; + + auto function_ast = table_function_node->toAST(); + Identifier table_identifier{table_function_name}; + if (table_identifier.getPartsSize() == 1) + { + table_name = table_identifier[0]; + } + else if (table_identifier.getPartsSize() == 2) + { + database_name = table_identifier[0]; + table_name = table_identifier[1]; + } + + auto parametrized_view_storage = scope_context->getQueryContext()->buildParametrizedViewStorage(function_ast, database_name, table_name); + if (parametrized_view_storage) + { + table_function_node = std::make_shared(parametrized_view_storage, scope_context); + return; + } + auto hints = TableFunctionFactory::instance().getHints(table_function_name); if (!hints.empty()) throw Exception(ErrorCodes::UNKNOWN_FUNCTION, @@ -7453,40 +7542,7 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier resolveExpressionNode(query_node_typed.getWhere(), scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/); if (query_node_typed.hasGroupBy()) - { - if (query_node_typed.isGroupByWithGroupingSets()) - { - for (auto & grouping_sets_keys_list_node : query_node_typed.getGroupBy().getNodes()) - { - if (settings.enable_positional_arguments) - replaceNodesWithPositionalArguments(grouping_sets_keys_list_node, query_node_typed.getProjection().getNodes(), scope); - - resolveExpressionNodeList(grouping_sets_keys_list_node, scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/); - } - - if (scope.group_by_use_nulls) - { - for (const auto & grouping_set : query_node_typed.getGroupBy().getNodes()) - { - for (const auto & group_by_elem : grouping_set->as()->getNodes()) - scope.nullable_group_by_keys.insert(group_by_elem); - } - } - } - else - { - if (settings.enable_positional_arguments) - replaceNodesWithPositionalArguments(query_node_typed.getGroupByNode(), query_node_typed.getProjection().getNodes(), scope); - - resolveExpressionNodeList(query_node_typed.getGroupByNode(), scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/); - - if (scope.group_by_use_nulls) - { - for (const auto & group_by_elem : query_node_typed.getGroupBy().getNodes()) - scope.nullable_group_by_keys.insert(group_by_elem); - } - } - } + resolveGroupByNode(query_node_typed, scope); if (query_node_typed.hasHaving()) resolveExpressionNode(query_node_typed.getHaving(), scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/); @@ -7499,7 +7555,7 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier if (settings.enable_positional_arguments) replaceNodesWithPositionalArguments(query_node_typed.getOrderByNode(), 
query_node_typed.getProjection().getNodes(), scope); - expandOrderByAll(query_node_typed, settings); + expandOrderByAll(query_node_typed); resolveSortNodeList(query_node_typed.getOrderByNode(), scope); } diff --git a/src/Analyzer/QueryNode.cpp b/src/Analyzer/QueryNode.cpp index d0bff759dea..bc7a29247e4 100644 --- a/src/Analyzer/QueryNode.cpp +++ b/src/Analyzer/QueryNode.cpp @@ -421,11 +421,8 @@ ASTPtr QueryNode::toASTImpl(const ConvertToASTOptions & options) const if (is_subquery) { - auto subquery = std::make_shared(); - + auto subquery = std::make_shared(std::move(result_select_query)); subquery->cte_name = cte_name; - subquery->children.push_back(std::move(result_select_query)); - return subquery; } diff --git a/src/Analyzer/QueryNode.h b/src/Analyzer/QueryNode.h index d8b8741afb2..1b389572e42 100644 --- a/src/Analyzer/QueryNode.h +++ b/src/Analyzer/QueryNode.h @@ -219,13 +219,13 @@ public: is_group_by_all = is_group_by_all_value; } - /// Returns true, if query node has ORDER BY ALL modifier, false otherwise + /// Returns true, if query node has ORDER BY * modifier, false otherwise bool isOrderByAll() const { return is_order_by_all; } - /// Set query node ORDER BY ALL modifier value + /// Set query node ORDER BY * modifier value void setIsOrderByAll(bool is_order_by_all_value) { is_order_by_all = is_order_by_all_value; diff --git a/src/Analyzer/QueryTreePassManager.cpp b/src/Analyzer/QueryTreePassManager.cpp index 33411488d66..91e48fe4e86 100644 --- a/src/Analyzer/QueryTreePassManager.cpp +++ b/src/Analyzer/QueryTreePassManager.cpp @@ -61,7 +61,7 @@ namespace ErrorCodes namespace { -#ifndef NDEBUG +#if defined(ABORT_ON_LOGICAL_ERROR) /** This visitor checks if Query Tree structure is valid after each pass * in debug build. @@ -184,7 +184,7 @@ void QueryTreePassManager::run(QueryTreeNodePtr query_tree_node) for (size_t i = 0; i < passes_size; ++i) { passes[i]->run(query_tree_node, current_context); -#ifndef NDEBUG +#if defined(ABORT_ON_LOGICAL_ERROR) ValidationChecker(passes[i]->getName()).visit(query_tree_node); #endif } @@ -209,7 +209,7 @@ void QueryTreePassManager::run(QueryTreeNodePtr query_tree_node, size_t up_to_pa for (size_t i = 0; i < up_to_pass_index; ++i) { passes[i]->run(query_tree_node, current_context); -#ifndef NDEBUG +#if defined(ABORT_ON_LOGICAL_ERROR) ValidationChecker(passes[i]->getName()).visit(query_tree_node); #endif } diff --git a/src/Analyzer/UnionNode.cpp b/src/Analyzer/UnionNode.cpp index 5d2ac128abe..c6003126554 100644 --- a/src/Analyzer/UnionNode.cpp +++ b/src/Analyzer/UnionNode.cpp @@ -185,11 +185,8 @@ ASTPtr UnionNode::toASTImpl(const ConvertToASTOptions & options) const if (is_subquery) { - auto subquery = std::make_shared(); - + auto subquery = std::make_shared(std::move(select_with_union_query)); subquery->cte_name = cte_name; - subquery->children.push_back(std::move(select_with_union_query)); - return subquery; } diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index 52ce20d5108..b3b92323109 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -2,7 +2,7 @@ #if USE_AZURE_BLOB_STORAGE #include -#include +#include #include #include #include diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index fa4c1af3698..9359602a651 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -2,7 +2,7 @@ #if USE_AWS_S3 #include -#include +#include #include #include #include @@ -127,7 +127,7 @@ BackupReaderS3::BackupReaderS3( : 
BackupReaderDefault(read_settings_, write_settings_, getLogger("BackupReaderS3")) , s3_uri(s3_uri_) , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false} - , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString())) + , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString(), context_->getUserName())) { auto & request_settings = s3_settings.request_settings; request_settings.updateFromSettings(context_->getSettingsRef()); @@ -217,7 +217,7 @@ BackupWriterS3::BackupWriterS3( : BackupWriterDefault(read_settings_, write_settings_, getLogger("BackupWriterS3")) , s3_uri(s3_uri_) , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false} - , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString())) + , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString(), context_->getUserName())) { auto & request_settings = s3_settings.request_settings; request_settings.updateFromSettings(context_->getSettingsRef()); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 08913ed1b5a..dff70e06ce4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -506,6 +506,10 @@ if (TARGET ch_contrib::s2) dbms_target_link_libraries (PUBLIC ch_contrib::s2) endif() +if (TARGET ch_contrib::vectorscan) + dbms_target_link_libraries (PRIVATE ch_contrib::vectorscan) +endif() + if (TARGET ch_contrib::brotli) target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::brotli) endif() diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index e099aac0de9..6c39c3ebc95 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -632,9 +632,9 @@ try } else if (query_with_output->out_file) { - const auto & format_name = FormatFactory::instance().getFormatFromFileName(out_file); - if (!format_name.empty()) - current_format = format_name; + auto format_name = FormatFactory::instance().tryGetFormatFromFileName(out_file); + if (format_name) + current_format = *format_name; } } @@ -1508,7 +1508,7 @@ void ClientBase::sendData(Block & sample, const ColumnsDescription & columns_des String current_format = parsed_insert_query->format; if (current_format.empty()) - current_format = FormatFactory::instance().getFormatFromFileName(in_file, true); + current_format = FormatFactory::instance().getFormatFromFileName(in_file); /// Create temporary storage file, to support globs and parallel reading /// StorageFile doesn't support ephemeral/materialized/alias columns. 
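The ClientBase and LocalServer hunks above replace the empty-string sentinel of getFormatFromFileName with an std::optional returned by tryGetFormatFromFileName, so "no format detected" can no longer be confused with a real value. A minimal sketch of the pattern, using a hypothetical detector rather than the real FormatFactory API:

#include <optional>
#include <string>

// Hypothetical detector (not the FormatFactory API): std::nullopt means
// "could not detect", instead of the old "" sentinel.
std::optional<std::string> tryDetectFormatFromFileName(const std::string & file_name)
{
    if (file_name.ends_with(".csv"))      // std::string::ends_with requires C++20
        return "CSV";
    if (file_name.ends_with(".parquet"))
        return "Parquet";
    return std::nullopt;
}

// Same selection logic as the updated call sites: fall back to TSV
// only when nothing was detected.
std::string chooseFormat(const std::string & file_name)
{
    const auto detected = tryDetectFormatFromFileName(file_name);
    return detected ? *detected : "TSV";
}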
diff --git a/src/Client/ConnectionParameters.cpp b/src/Client/ConnectionParameters.cpp index a0ae47f0620..5c22b6c6d3f 100644 --- a/src/Client/ConnectionParameters.cpp +++ b/src/Client/ConnectionParameters.cpp @@ -125,7 +125,7 @@ ConnectionParameters::ConnectionParameters(const Poco::Util::AbstractConfigurati Poco::Timespan(config.getInt("send_timeout", DBMS_DEFAULT_SEND_TIMEOUT_SEC), 0)) .withReceiveTimeout( Poco::Timespan(config.getInt("receive_timeout", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC), 0)) - .withTcpKeepAliveTimeout( + .withTCPKeepAliveTimeout( Poco::Timespan(config.getInt("tcp_keep_alive_timeout", DEFAULT_TCP_KEEP_ALIVE_TIMEOUT), 0)) .withHandshakeTimeout( Poco::Timespan(config.getInt("handshake_timeout_ms", DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC * 1000) * 1000)) diff --git a/src/Client/ConnectionPool.h b/src/Client/ConnectionPool.h index 8e707e8190f..574c4992d75 100644 --- a/src/Client/ConnectionPool.h +++ b/src/Client/ConnectionPool.h @@ -28,7 +28,10 @@ public: using Entry = PoolBase::Entry; IConnectionPool() = default; - IConnectionPool(String host_, UInt16 port_) : host(host_), port(port_), address(host + ":" + toString(port_)) {} + IConnectionPool(String host_, UInt16 port_, Priority config_priority_) + : host(host_), port(port_), address(host + ":" + toString(port_)), config_priority(config_priority_) + { + } virtual ~IConnectionPool() = default; @@ -42,12 +45,13 @@ public: const std::string & getHost() const { return host; } UInt16 getPort() const { return port; } const String & getAddress() const { return address; } - virtual Priority getPriority() const { return Priority{1}; } + Priority getConfigPriority() const { return config_priority; } protected: const String host; const UInt16 port = 0; const String address; + const Priority config_priority; }; using ConnectionPoolPtr = std::shared_ptr; @@ -61,32 +65,31 @@ public: using Entry = IConnectionPool::Entry; using Base = PoolBase; - ConnectionPool(unsigned max_connections_, - const String & host_, - UInt16 port_, - const String & default_database_, - const String & user_, - const String & password_, - const String & quota_key_, - const String & cluster_, - const String & cluster_secret_, - const String & client_name_, - Protocol::Compression compression_, - Protocol::Secure secure_, - Priority priority_ = Priority{1}) - : IConnectionPool(host_, port_), - Base(max_connections_, - getLogger("ConnectionPool (" + host_ + ":" + toString(port_) + ")")), - default_database(default_database_), - user(user_), - password(password_), - quota_key(quota_key_), - cluster(cluster_), - cluster_secret(cluster_secret_), - client_name(client_name_), - compression(compression_), - secure(secure_), - priority(priority_) + ConnectionPool( + unsigned max_connections_, + const String & host_, + UInt16 port_, + const String & default_database_, + const String & user_, + const String & password_, + const String & quota_key_, + const String & cluster_, + const String & cluster_secret_, + const String & client_name_, + Protocol::Compression compression_, + Protocol::Secure secure_, + Priority config_priority_ = Priority{1}) + : IConnectionPool(host_, port_, config_priority_) + , Base(max_connections_, getLogger("ConnectionPool (" + host_ + ":" + toString(port_) + ")")) + , default_database(default_database_) + , user(user_) + , password(password_) + , quota_key(quota_key_) + , cluster(cluster_) + , cluster_secret(cluster_secret_) + , client_name(client_name_) + , compression(compression_) + , secure(secure_) { } @@ -114,11 +117,6 @@ public: return host + ":" + 
toString(port); } - Priority getPriority() const override - { - return priority; - } - protected: /** Creates a new object to put in the pool. */ ConnectionPtr allocObject() override @@ -143,7 +141,6 @@ private: String client_name; Protocol::Compression compression; /// Whether to compress data when interacting with the server. Protocol::Secure secure; /// Whether to encrypt data when interacting with the server. - Priority priority; /// priority from }; /** diff --git a/src/Client/ConnectionPoolWithFailover.cpp b/src/Client/ConnectionPoolWithFailover.cpp index fdc0a11e533..492fd4ae9e2 100644 --- a/src/Client/ConnectionPoolWithFailover.cpp +++ b/src/Client/ConnectionPoolWithFailover.cpp @@ -79,14 +79,6 @@ IConnectionPool::Entry ConnectionPoolWithFailover::get(const ConnectionTimeouts return Base::get(max_ignored_errors, fallback_to_stale_replicas, try_get_entry, get_priority); } -Priority ConnectionPoolWithFailover::getPriority() const -{ - return (*std::max_element(nested_pools.begin(), nested_pools.end(), [](const auto & a, const auto & b) - { - return a->getPriority() < b->getPriority(); - }))->getPriority(); -} - ConnectionPoolWithFailover::Status ConnectionPoolWithFailover::getStatus() const { const auto [states, pools, error_decrease_time] = getPoolExtendedStates(); @@ -253,13 +245,13 @@ ConnectionPoolWithFailover::tryGetEntry( } std::vector -ConnectionPoolWithFailover::getShuffledPools(const Settings & settings, GetPriorityForLoadBalancing::Func priority_func) +ConnectionPoolWithFailover::getShuffledPools(const Settings & settings, GetPriorityForLoadBalancing::Func priority_func, bool use_slowdown_count) { if (!priority_func) priority_func = makeGetPriorityFunc(settings); UInt64 max_ignored_errors = settings.distributed_replica_max_ignored_errors.value; - return Base::getShuffledPools(max_ignored_errors, priority_func); + return Base::getShuffledPools(max_ignored_errors, priority_func, use_slowdown_count); } } diff --git a/src/Client/ConnectionPoolWithFailover.h b/src/Client/ConnectionPoolWithFailover.h index 7ccdd4787a4..edfcbe6e4df 100644 --- a/src/Client/ConnectionPoolWithFailover.h +++ b/src/Client/ConnectionPoolWithFailover.h @@ -49,8 +49,6 @@ public: const Settings & settings, bool force_connected) override; /// From IConnectionPool - Priority getPriority() const override; /// From IConnectionPool - /** Allocates up to the specified number of connections to work. * Connections provide access to different replicas of one shard. 
*/ @@ -83,15 +81,15 @@ public: struct NestedPoolStatus { const Base::NestedPoolPtr pool; - size_t error_count; - size_t slowdown_count; + size_t error_count = 0; + size_t slowdown_count = 0; std::chrono::seconds estimated_recovery_time; }; using Status = std::vector; Status getStatus() const; - std::vector getShuffledPools(const Settings & settings, GetPriorityFunc priority_func = {}); + std::vector getShuffledPools(const Settings & settings, GetPriorityFunc priority_func = {}, bool use_slowdown_count = false); size_t getMaxErrorCup() const { return Base::max_error_cap; } diff --git a/src/Client/HedgedConnectionsFactory.cpp b/src/Client/HedgedConnectionsFactory.cpp index 82bacece415..f5b074a0257 100644 --- a/src/Client/HedgedConnectionsFactory.cpp +++ b/src/Client/HedgedConnectionsFactory.cpp @@ -40,7 +40,8 @@ HedgedConnectionsFactory::HedgedConnectionsFactory( , max_parallel_replicas(max_parallel_replicas_) , skip_unavailable_shards(skip_unavailable_shards_) { - shuffled_pools = pool->getShuffledPools(settings_, priority_func); + shuffled_pools = pool->getShuffledPools(settings_, priority_func, /* use_slowdown_count */ true); + for (const auto & shuffled_pool : shuffled_pools) replicas.emplace_back( std::make_unique(shuffled_pool.pool, &timeouts, settings_, log, table_to_check.get())); diff --git a/src/Client/QueryFuzzer.cpp b/src/Client/QueryFuzzer.cpp index 24be7491ec7..0597a7c1eed 100644 --- a/src/Client/QueryFuzzer.cpp +++ b/src/Client/QueryFuzzer.cpp @@ -1168,23 +1168,13 @@ void QueryFuzzer::fuzz(ASTPtr & ast) fuzz(select->children); } - /* - * The time to fuzz the settings has not yet come. - * Apparently we don't have any infrastructure to validate the values of - * the settings, and the first query with max_block_size = -1 breaks - * because of overflows here and there. - *//* - * else if (auto * set = typeid_cast(ast.get())) - * { - * for (auto & c : set->changes) - * { - * if (fuzz_rand() % 50 == 0) - * { - * c.value = fuzzField(c.value); - * } - * } - * } - */ + else if (auto * set = typeid_cast(ast.get())) + { + /// Fuzz settings + for (auto & c : set->changes) + if (fuzz_rand() % 50 == 0) + c.value = fuzzField(c.value); + } else if (auto * literal = typeid_cast(ast.get())) { // There is a caveat with fuzzing the children: many ASTs also keep the diff --git a/src/Common/CpuId.h b/src/Common/CPUID.h similarity index 50% rename from src/Common/CpuId.h rename to src/Common/CPUID.h index 1d15867289d..d7a714ec5af 100644 --- a/src/Common/CpuId.h +++ b/src/Common/CPUID.h @@ -2,7 +2,7 @@ #include -#if defined(__x86_64__) || defined(__i386__) +#if defined(__x86_64__) #include #endif @@ -11,10 +11,10 @@ namespace DB { -namespace Cpu +namespace CPU { -#if (defined(__x86_64__) || defined(__i386__)) +#if (defined(__x86_64__)) /// Our version is independent of -mxsave option, because we do dynamic dispatch. 
inline UInt64 our_xgetbv(UInt32 xcr) noexcept { @@ -30,7 +30,7 @@ inline UInt64 our_xgetbv(UInt32 xcr) noexcept inline bool cpuid(UInt32 op, UInt32 sub_op, UInt32 * res) noexcept /// NOLINT { -#if defined(__x86_64__) || defined(__i386__) +#if defined(__x86_64__) __cpuid_count(op, sub_op, res[0], res[1], res[2], res[3]); return true; #else @@ -45,7 +45,7 @@ inline bool cpuid(UInt32 op, UInt32 sub_op, UInt32 * res) noexcept /// NOLINT inline bool cpuid(UInt32 op, UInt32 * res) noexcept /// NOLINT { -#if defined(__x86_64__) || defined(__i386__) +#if defined(__x86_64__) __cpuid(op, res[0], res[1], res[2], res[3]); return true; #else @@ -57,6 +57,249 @@ inline bool cpuid(UInt32 op, UInt32 * res) noexcept /// NOLINT #endif } +union CPUInfo +{ + UInt32 info[4]; + + struct Registers + { + UInt32 eax; + UInt32 ebx; + UInt32 ecx; + UInt32 edx; + } registers; + + inline explicit CPUInfo(UInt32 op) noexcept { cpuid(op, info); } + + inline CPUInfo(UInt32 op, UInt32 sub_op) noexcept { cpuid(op, sub_op, info); } +}; + +inline bool haveRDTSCP() noexcept +{ + return (CPUInfo(0x80000001).registers.edx >> 27) & 1u; +} + +inline bool haveSSE() noexcept +{ + return (CPUInfo(0x1).registers.edx >> 25) & 1u; +} + +inline bool haveSSE2() noexcept +{ + return (CPUInfo(0x1).registers.edx >> 26) & 1u; +} + +inline bool haveSSE3() noexcept +{ + return CPUInfo(0x1).registers.ecx & 1u; +} + +inline bool havePCLMUL() noexcept +{ + return (CPUInfo(0x1).registers.ecx >> 1) & 1u; +} + +inline bool haveSSSE3() noexcept +{ + return (CPUInfo(0x1).registers.ecx >> 9) & 1u; +} + +inline bool haveSSE41() noexcept +{ + return (CPUInfo(0x1).registers.ecx >> 19) & 1u; +} + +inline bool haveSSE42() noexcept +{ + return (CPUInfo(0x1).registers.ecx >> 20) & 1u; +} + +inline bool haveF16C() noexcept +{ + return (CPUInfo(0x1).registers.ecx >> 29) & 1u; +} + +inline bool havePOPCNT() noexcept +{ + return (CPUInfo(0x1).registers.ecx >> 23) & 1u; +} + +inline bool haveAES() noexcept +{ + return (CPUInfo(0x1).registers.ecx >> 25) & 1u; +} + +inline bool haveXSAVE() noexcept +{ + return (CPUInfo(0x1).registers.ecx >> 26) & 1u; +} + +inline bool haveOSXSAVE() noexcept +{ + return (CPUInfo(0x1).registers.ecx >> 27) & 1u; +} + +inline bool haveAVX() noexcept +{ +#if defined(__x86_64__) + // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf + // https://bugs.chromium.org/p/chromium/issues/detail?id=375968 + return haveOSXSAVE() // implies haveXSAVE() + && (our_xgetbv(0) & 6u) == 6u // XMM state and YMM state are enabled by OS + && ((CPUInfo(0x1).registers.ecx >> 28) & 1u); // AVX bit +#else + return false; +#endif +} + +inline bool haveFMA() noexcept +{ + return haveAVX() && ((CPUInfo(0x1).registers.ecx >> 12) & 1u); +} + +inline bool haveAVX2() noexcept +{ + return haveAVX() && ((CPUInfo(0x7, 0).registers.ebx >> 5) & 1u); +} + +inline bool haveBMI1() noexcept +{ + return (CPUInfo(0x7, 0).registers.ebx >> 3) & 1u; +} + +inline bool haveBMI2() noexcept +{ + return (CPUInfo(0x7, 0).registers.ebx >> 8) & 1u; +} + +inline bool haveAVX512F() noexcept +{ +#if defined(__x86_64__) + // https://software.intel.com/en-us/articles/how-to-detect-knl-instruction-support + return haveOSXSAVE() // implies haveXSAVE() + && (our_xgetbv(0) & 6u) == 6u // XMM state and YMM state are enabled by OS + && ((our_xgetbv(0) >> 5) & 7u) == 7u // ZMM state is enabled by OS + && CPUInfo(0x0).registers.eax >= 0x7 // leaf 7 is present + && ((CPUInfo(0x7, 0).registers.ebx >> 16) & 1u); // AVX512F bit +#else + 
return false; +#endif +} + +inline bool haveAVX512DQ() noexcept +{ + return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 17) & 1u); +} + +inline bool haveRDSEED() noexcept +{ + return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 18) & 1u); +} + +inline bool haveADX() noexcept +{ + return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 19) & 1u); +} + +inline bool haveAVX512IFMA() noexcept +{ + return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 21) & 1u); +} + +inline bool havePCOMMIT() noexcept +{ + return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 22) & 1u); +} + +inline bool haveCLFLUSHOPT() noexcept +{ + return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 23) & 1u); +} + +inline bool haveCLWB() noexcept +{ + return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 24) & 1u); +} + +inline bool haveAVX512PF() noexcept +{ + return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 26) & 1u); +} + +inline bool haveAVX512ER() noexcept +{ + return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 27) & 1u); +} + +inline bool haveAVX512CD() noexcept +{ + return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 28) & 1u); +} + +inline bool haveSHA() noexcept +{ + return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 29) & 1u); +} + +inline bool haveAVX512BW() noexcept +{ + return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 30) & 1u); +} + +inline bool haveAVX512VL() noexcept +{ + return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 31) & 1u); +} + +inline bool havePREFETCHWT1() noexcept +{ + return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ecx >> 0) & 1u); +} + +inline bool haveAVX512VBMI() noexcept +{ + return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ecx >> 1) & 1u); +} + +inline bool haveAVX512VBMI2() noexcept +{ + return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ecx >> 6) & 1u); +} + +inline bool haveRDRAND() noexcept +{ + return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x1).registers.ecx >> 30) & 1u); +} + +inline bool haveAMX() noexcept +{ +#if defined(__x86_64__) + // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf + return haveOSXSAVE() // implies haveXSAVE() + && ((our_xgetbv(0) >> 17) & 0x3) == 0x3; // AMX state are enabled by OS +#else + return false; +#endif +} + +inline bool haveAMXBF16() noexcept +{ + return haveAMX() + && ((CPUInfo(0x7, 0).registers.edx >> 22) & 1u); // AMX-BF16 bit +} + +inline bool haveAMXTILE() noexcept +{ + return haveAMX() + && ((CPUInfo(0x7, 0).registers.edx >> 24) & 1u); // AMX-TILE bit +} + +inline bool haveAMXINT8() noexcept +{ + return haveAMX() + && ((CPUInfo(0x7, 0).registers.edx >> 25) & 1u); // AMX-INT8 bit +} + #define CPU_ID_ENUMERATE(OP) \ OP(SSE) \ OP(SSE2) \ @@ -98,254 +341,7 @@ inline bool cpuid(UInt32 op, UInt32 * res) noexcept /// NOLINT OP(AMXTILE) \ OP(AMXINT8) -union CpuInfo -{ - UInt32 info[4]; - - struct Registers - { - UInt32 eax; - UInt32 ebx; - UInt32 ecx; - UInt32 edx; - } registers; - - inline explicit CpuInfo(UInt32 op) noexcept { cpuid(op, info); } - - inline CpuInfo(UInt32 op, UInt32 sub_op) noexcept { cpuid(op, sub_op, info); } -}; - -#define DEF_NAME(X) inline bool have##X() noexcept; - CPU_ID_ENUMERATE(DEF_NAME) -#undef DEF_NAME - -bool haveRDTSCP() noexcept -{ - return (CpuInfo(0x80000001).registers.edx >> 27) & 1u; -} - -bool haveSSE() 
noexcept -{ - return (CpuInfo(0x1).registers.edx >> 25) & 1u; -} - -bool haveSSE2() noexcept -{ - return (CpuInfo(0x1).registers.edx >> 26) & 1u; -} - -bool haveSSE3() noexcept -{ - return CpuInfo(0x1).registers.ecx & 1u; -} - -bool havePCLMUL() noexcept -{ - return (CpuInfo(0x1).registers.ecx >> 1) & 1u; -} - -bool haveSSSE3() noexcept -{ - return (CpuInfo(0x1).registers.ecx >> 9) & 1u; -} - -bool haveSSE41() noexcept -{ - return (CpuInfo(0x1).registers.ecx >> 19) & 1u; -} - -bool haveSSE42() noexcept -{ - return (CpuInfo(0x1).registers.ecx >> 20) & 1u; -} - -bool haveF16C() noexcept -{ - return (CpuInfo(0x1).registers.ecx >> 29) & 1u; -} - -bool havePOPCNT() noexcept -{ - return (CpuInfo(0x1).registers.ecx >> 23) & 1u; -} - -bool haveAES() noexcept -{ - return (CpuInfo(0x1).registers.ecx >> 25) & 1u; -} - -bool haveXSAVE() noexcept -{ - return (CpuInfo(0x1).registers.ecx >> 26) & 1u; -} - -bool haveOSXSAVE() noexcept -{ - return (CpuInfo(0x1).registers.ecx >> 27) & 1u; -} - -bool haveAVX() noexcept -{ -#if defined(__x86_64__) || defined(__i386__) - // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf - // https://bugs.chromium.org/p/chromium/issues/detail?id=375968 - return haveOSXSAVE() // implies haveXSAVE() - && (our_xgetbv(0) & 6u) == 6u // XMM state and YMM state are enabled by OS - && ((CpuInfo(0x1).registers.ecx >> 28) & 1u); // AVX bit -#else - return false; -#endif -} - -bool haveFMA() noexcept -{ - return haveAVX() && ((CpuInfo(0x1).registers.ecx >> 12) & 1u); -} - -bool haveAVX2() noexcept -{ - return haveAVX() && ((CpuInfo(0x7, 0).registers.ebx >> 5) & 1u); -} - -bool haveBMI1() noexcept -{ - return (CpuInfo(0x7, 0).registers.ebx >> 3) & 1u; -} - -bool haveBMI2() noexcept -{ - return (CpuInfo(0x7, 0).registers.ebx >> 8) & 1u; -} - -bool haveAVX512F() noexcept -{ -#if defined(__x86_64__) || defined(__i386__) - // https://software.intel.com/en-us/articles/how-to-detect-knl-instruction-support - return haveOSXSAVE() // implies haveXSAVE() - && (our_xgetbv(0) & 6u) == 6u // XMM state and YMM state are enabled by OS - && ((our_xgetbv(0) >> 5) & 7u) == 7u // ZMM state is enabled by OS - && CpuInfo(0x0).registers.eax >= 0x7 // leaf 7 is present - && ((CpuInfo(0x7, 0).registers.ebx >> 16) & 1u); // AVX512F bit -#else - return false; -#endif -} - -bool haveAVX512DQ() noexcept -{ - return haveAVX512F() && ((CpuInfo(0x7, 0).registers.ebx >> 17) & 1u); -} - -bool haveRDSEED() noexcept -{ - return CpuInfo(0x0).registers.eax >= 0x7 && ((CpuInfo(0x7, 0).registers.ebx >> 18) & 1u); -} - -bool haveADX() noexcept -{ - return CpuInfo(0x0).registers.eax >= 0x7 && ((CpuInfo(0x7, 0).registers.ebx >> 19) & 1u); -} - -bool haveAVX512IFMA() noexcept -{ - return haveAVX512F() && ((CpuInfo(0x7, 0).registers.ebx >> 21) & 1u); -} - -bool havePCOMMIT() noexcept -{ - return CpuInfo(0x0).registers.eax >= 0x7 && ((CpuInfo(0x7, 0).registers.ebx >> 22) & 1u); -} - -bool haveCLFLUSHOPT() noexcept -{ - return CpuInfo(0x0).registers.eax >= 0x7 && ((CpuInfo(0x7, 0).registers.ebx >> 23) & 1u); -} - -bool haveCLWB() noexcept -{ - return CpuInfo(0x0).registers.eax >= 0x7 && ((CpuInfo(0x7, 0).registers.ebx >> 24) & 1u); -} - -bool haveAVX512PF() noexcept -{ - return haveAVX512F() && ((CpuInfo(0x7, 0).registers.ebx >> 26) & 1u); -} - -bool haveAVX512ER() noexcept -{ - return haveAVX512F() && ((CpuInfo(0x7, 0).registers.ebx >> 27) & 1u); -} - -bool haveAVX512CD() noexcept -{ - return haveAVX512F() && ((CpuInfo(0x7, 0).registers.ebx >> 28) & 1u); -} - -bool 
haveSHA() noexcept -{ - return CpuInfo(0x0).registers.eax >= 0x7 && ((CpuInfo(0x7, 0).registers.ebx >> 29) & 1u); -} - -bool haveAVX512BW() noexcept -{ - return haveAVX512F() && ((CpuInfo(0x7, 0).registers.ebx >> 30) & 1u); -} - -bool haveAVX512VL() noexcept -{ - return haveAVX512F() && ((CpuInfo(0x7, 0).registers.ebx >> 31) & 1u); -} - -bool havePREFETCHWT1() noexcept -{ - return CpuInfo(0x0).registers.eax >= 0x7 && ((CpuInfo(0x7, 0).registers.ecx >> 0) & 1u); -} - -bool haveAVX512VBMI() noexcept -{ - return haveAVX512F() && ((CpuInfo(0x7, 0).registers.ecx >> 1) & 1u); -} - -bool haveAVX512VBMI2() noexcept -{ - return haveAVX512F() && ((CpuInfo(0x7, 0).registers.ecx >> 6) & 1u); -} - -bool haveRDRAND() noexcept -{ - return CpuInfo(0x0).registers.eax >= 0x7 && ((CpuInfo(0x1).registers.ecx >> 30) & 1u); -} - -inline bool haveAMX() noexcept -{ -#if defined(__x86_64__) || defined(__i386__) - // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf - return haveOSXSAVE() // implies haveXSAVE() - && ((our_xgetbv(0) >> 17) & 0x3) == 0x3; // AMX state are enabled by OS -#else - return false; -#endif -} - -bool haveAMXBF16() noexcept -{ - return haveAMX() - && ((CpuInfo(0x7, 0).registers.edx >> 22) & 1u); // AMX-BF16 bit -} - -bool haveAMXTILE() noexcept -{ - return haveAMX() - && ((CpuInfo(0x7, 0).registers.edx >> 24) & 1u); // AMX-TILE bit -} - -bool haveAMXINT8() noexcept -{ - return haveAMX() - && ((CpuInfo(0x7, 0).registers.edx >> 25) & 1u); // AMX-INT8 bit -} - -struct CpuFlagsCache +struct CPUFlagsCache { #define DEF_NAME(X) static inline bool have_##X = have##X(); CPU_ID_ENUMERATE(DEF_NAME) @@ -354,4 +350,3 @@ struct CpuFlagsCache } } - diff --git a/src/Common/ConcurrencyControl.cpp b/src/Common/ConcurrencyControl.cpp index c9fe51550dc..0893cfce955 100644 --- a/src/Common/ConcurrencyControl.cpp +++ b/src/Common/ConcurrencyControl.cpp @@ -12,10 +12,10 @@ namespace ErrorCodes ConcurrencyControl::Slot::~Slot() { - allocation->release(); + static_cast(*allocation).release(); } -ConcurrencyControl::Slot::Slot(AllocationPtr && allocation_) +ConcurrencyControl::Slot::Slot(SlotAllocationPtr && allocation_) : allocation(std::move(allocation_)) { } @@ -27,7 +27,7 @@ ConcurrencyControl::Allocation::~Allocation() parent.free(this); } -[[nodiscard]] ConcurrencyControl::SlotPtr ConcurrencyControl::Allocation::tryAcquire() +[[nodiscard]] AcquiredSlotPtr ConcurrencyControl::Allocation::tryAcquire() { SlotCount value = granted.load(); while (value) @@ -35,15 +35,21 @@ ConcurrencyControl::Allocation::~Allocation() if (granted.compare_exchange_strong(value, value - 1)) { std::unique_lock lock{mutex}; - return SlotPtr(new Slot(shared_from_this())); // can't use std::make_shared due to private ctor + return AcquiredSlotPtr(new Slot(shared_from_this())); // can't use std::make_shared due to private ctor } } return {}; // avoid unnecessary locking } -ConcurrencyControl::SlotCount ConcurrencyControl::Allocation::grantedCount() const +SlotCount ConcurrencyControl::Allocation::grantedCount() const { - return granted; + return granted.load(); +} + +SlotCount ConcurrencyControl::Allocation::allocatedCount() const +{ + std::unique_lock lock{mutex}; + return allocated; } ConcurrencyControl::Allocation::Allocation(ConcurrencyControl & parent_, SlotCount limit_, SlotCount granted_, Waiters::iterator waiter_) @@ -87,7 +93,7 @@ ConcurrencyControl::~ConcurrencyControl() abort(); } -[[nodiscard]] ConcurrencyControl::AllocationPtr 
ConcurrencyControl::allocate(SlotCount min, SlotCount max) +[[nodiscard]] SlotAllocationPtr ConcurrencyControl::allocate(SlotCount min, SlotCount max) { if (min > max) throw Exception(ErrorCodes::LOGICAL_ERROR, "ConcurrencyControl: invalid allocation requirements"); @@ -100,13 +106,13 @@ ConcurrencyControl::~ConcurrencyControl() // Create allocation and start waiting if more slots are required if (granted < max) - return AllocationPtr(new Allocation(*this, max, granted, + return SlotAllocationPtr(new Allocation(*this, max, granted, waiters.insert(cur_waiter, nullptr /* pointer is set by Allocation ctor */))); else - return AllocationPtr(new Allocation(*this, max, granted)); + return SlotAllocationPtr(new Allocation(*this, max, granted)); } -void ConcurrencyControl::setMaxConcurrency(ConcurrencyControl::SlotCount value) +void ConcurrencyControl::setMaxConcurrency(SlotCount value) { std::unique_lock lock{mutex}; max_concurrency = std::max(1, value); // never allow max_concurrency to be zero @@ -162,7 +168,7 @@ void ConcurrencyControl::schedule(std::unique_lock &) } } -ConcurrencyControl::SlotCount ConcurrencyControl::available(std::unique_lock &) const +SlotCount ConcurrencyControl::available(std::unique_lock &) const { if (cur_concurrency < max_concurrency) return max_concurrency - cur_concurrency; diff --git a/src/Common/ConcurrencyControl.h b/src/Common/ConcurrencyControl.h index 7e20384aa2a..ba94502962c 100644 --- a/src/Common/ConcurrencyControl.h +++ b/src/Common/ConcurrencyControl.h @@ -7,6 +7,7 @@ #include #include +#include namespace DB { @@ -34,41 +35,35 @@ namespace DB * Oversubscription is possible: total amount of allocated slots can exceed `setMaxConcurrency(limit)` * because `min` amount of slots is allocated for each query unconditionally. */ -class ConcurrencyControl : boost::noncopyable +class ConcurrencyControl : public ISlotControl { public: struct Allocation; - using AllocationPtr = std::shared_ptr; - using SlotCount = UInt64; using Waiters = std::list; - static constexpr SlotCount Unlimited = std::numeric_limits::max(); - // Scoped guard for acquired slot, see Allocation::tryAcquire() - struct Slot : boost::noncopyable + struct Slot : public IAcquiredSlot { - ~Slot(); + ~Slot() override; private: friend struct Allocation; // for ctor - explicit Slot(AllocationPtr && allocation_); + explicit Slot(SlotAllocationPtr && allocation_); - AllocationPtr allocation; + SlotAllocationPtr allocation; }; - // FIXME: have to be unique_ptr, but ThreadFromGlobalPool does not support move semantics yet - using SlotPtr = std::shared_ptr; - // Manages group of slots for a single query, see ConcurrencyControl::allocate(min, max) - struct Allocation : std::enable_shared_from_this, boost::noncopyable + struct Allocation : public ISlotAllocation { - ~Allocation(); + ~Allocation() override; // Take one already granted slot if available. Lock-free iff there is no granted slot. 
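// Aside: ConcurrencyControl::Allocation::tryAcquire() (shown in the .cpp hunk above)
// takes a granted slot with a lock-free "decrement if non-zero" CAS loop. A standalone
// sketch of that loop, independent of the ClickHouse types, is given below.
#include <atomic>
#include <cstdint>
#include <optional>

// Try to take one granted slot without a mutex; returns the remaining granted
// count on success, std::nullopt if nothing is granted right now.
std::optional<uint64_t> tryTakeOne(std::atomic<uint64_t> & granted)
{
    uint64_t value = granted.load();
    while (value)
    {
        if (granted.compare_exchange_strong(value, value - 1))
            return value - 1;
        // On failure compare_exchange_strong reloads `value`; retry until
        // we either succeed or observe zero.
    }
    return std::nullopt;
}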
- [[nodiscard]] SlotPtr tryAcquire(); + [[nodiscard]] AcquiredSlotPtr tryAcquire() override; - SlotCount grantedCount() const; + SlotCount grantedCount() const override; + SlotCount allocatedCount() const override; private: friend struct Slot; // for release() @@ -94,7 +89,7 @@ public: ConcurrencyControl & parent; const SlotCount limit; - std::mutex mutex; // the following values must be accessed under this mutex + mutable std::mutex mutex; // the following values must be accessed under this mutex SlotCount allocated; // allocated total (including already `released`) SlotCount released = 0; @@ -103,17 +98,16 @@ public: const Waiters::iterator waiter; // iterator to itself in Waiters list; valid iff allocated < limit }; -public: ConcurrencyControl(); // WARNING: all Allocation objects MUST be destructed before ConcurrencyControl // NOTE: Recommended way to achieve this is to use `instance()` and do graceful shutdown of queries - ~ConcurrencyControl(); + ~ConcurrencyControl() override; // Allocate at least `min` and at most `max` slots. // If not all `max` slots were successfully allocated, a subscription for later allocation is created // Use `Allocation::tryAcquire()` to acquire allocated slot, before running a thread. - [[nodiscard]] AllocationPtr allocate(SlotCount min, SlotCount max); + [[nodiscard]] SlotAllocationPtr allocate(SlotCount min, SlotCount max) override; void setMaxConcurrency(SlotCount value); @@ -134,7 +128,7 @@ private: std::mutex mutex; Waiters waiters; Waiters::iterator cur_waiter; // round-robin pointer - SlotCount max_concurrency = Unlimited; + SlotCount max_concurrency = UnlimitedSlots; SlotCount cur_concurrency = 0; }; diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index f56149ed464..6931001202d 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -2,6 +2,8 @@ /// Available metrics. Add something here as you wish. +/// If the metric is generic (i.e. not server specific) +/// it should be also added to src/Coordination/KeeperConstant.cpp #define APPLY_FOR_BUILTIN_METRICS(M) \ M(Query, "Number of executing queries") \ M(Merge, "Number of executing background merges") \ @@ -260,6 +262,9 @@ M(ActiveTimersInQueryProfiler, "Number of Active thread local timers in QueryProfiler") \ M(RefreshableViews, "Number materialized views with periodic refreshing (REFRESH)") \ M(RefreshingViews, "Number of materialized views currently executing a refresh") \ + M(StorageBufferFlushThreads, "Number of threads for background flushes in StorageBuffer") \ + M(StorageBufferFlushThreadsActive, "Number of threads for background flushes in StorageBuffer running a task") \ + M(StorageBufferFlushThreadsScheduled, "Number of queued or active threads for background flushes in StorageBuffer") #ifdef APPLY_FOR_EXTERNAL_METRICS #define APPLY_FOR_METRICS(M) APPLY_FOR_BUILTIN_METRICS(M) APPLY_FOR_EXTERNAL_METRICS(M) diff --git a/src/Common/Dwarf.cpp b/src/Common/Dwarf.cpp index a405f73e35e..99da3b75429 100644 --- a/src/Common/Dwarf.cpp +++ b/src/Common/Dwarf.cpp @@ -2067,8 +2067,8 @@ Dwarf::LineNumberVM::StepResult Dwarf::LineNumberVM::step(std::string_view & pro if (opcode != 0) { // standard opcode // Only interpret opcodes that are recognized by the version we're parsing; - // the others are vendor extensions and we should ignore them. - switch (opcode) // NOLINT(bugprone-switch-missing-default-case) + // the others are vendor extensions, and we should ignore them. 
+ switch (opcode) { case DW_LNS_copy: basicBlock_ = false; @@ -2121,6 +2121,7 @@ Dwarf::LineNumberVM::StepResult Dwarf::LineNumberVM::step(std::string_view & pro } isa_ = readULEB(program); return CONTINUE; + default: } // Unrecognized standard opcode, slurp the appropriate number of LEB diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 8e81a626b41..aeace0abdda 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -594,6 +594,7 @@ M(712, TOO_MANY_MATERIALIZED_VIEWS) \ M(713, BROKEN_PROJECTION) \ M(714, UNEXPECTED_CLUSTER) \ + M(715, CANNOT_DETECT_FORMAT) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Common/FailPoint.cpp b/src/Common/FailPoint.cpp index f29aee0cdcc..a23133b7522 100644 --- a/src/Common/FailPoint.cpp +++ b/src/Common/FailPoint.cpp @@ -6,7 +6,6 @@ #include #include #include -#include namespace DB { @@ -44,7 +43,8 @@ static struct InitFiu REGULAR(dummy_failpoint) \ REGULAR(prefetched_reader_pool_failpoint) \ PAUSEABLE_ONCE(dummy_pausable_failpoint_once) \ - PAUSEABLE(dummy_pausable_failpoint) + PAUSEABLE(dummy_pausable_failpoint) \ + ONCE(execute_query_calling_empty_set_result_func_on_exception) namespace FailPoints { diff --git a/src/Common/FailPoint.h b/src/Common/FailPoint.h index b9eb13903a6..613cfb15322 100644 --- a/src/Common/FailPoint.h +++ b/src/Common/FailPoint.h @@ -18,7 +18,6 @@ #pragma clang diagnostic pop #endif -#include #include namespace DB diff --git a/src/Common/FieldVisitorConvertToNumber.h b/src/Common/FieldVisitorConvertToNumber.h index bf8c8c8638e..47a1e669969 100644 --- a/src/Common/FieldVisitorConvertToNumber.h +++ b/src/Common/FieldVisitorConvertToNumber.h @@ -91,7 +91,7 @@ public: if constexpr (std::is_floating_point_v) return x.getValue().template convertTo() / x.getScaleMultiplier().template convertTo(); else - return (x.getValue() / x.getScaleMultiplier()). template convertTo(); + return (x.getValue() / x.getScaleMultiplier()).template convertTo(); } T operator() (const AggregateFunctionStateData &) const diff --git a/src/Common/ISlotControl.h b/src/Common/ISlotControl.h new file mode 100644 index 00000000000..daeb956f5a8 --- /dev/null +++ b/src/Common/ISlotControl.h @@ -0,0 +1,76 @@ +#pragma once + +#include +#include +#include +#include + + +namespace DB +{ + +// Interfaces for abstract "slot" allocation and control. +// Slot is a virtual entity existing in a limited amount (CPUs or memory chunks, etc). +// +// Every slot can be in one of the following states: +// * free: slot is available to be allocated. +// * allocated: slot is allocated to a specific ISlotAllocation. +// +// Allocated slots can be in one of the following states: +// * granted: allocated, but not yet acquired. +// * acquired: a granted slot becomes acquired by using IAcquiredSlot. +// +// Example for CPU (see ConcurrencyControl.h). Every slot represents one CPU in the system. +// Slot allocation is a request to allocate specific number of CPUs for a specific query. +// Acquired slot is an entity that is held by a thread as long as it is running. This allows +// total number of threads in the system to be limited and the distribution process to be controlled. +// +// TODO: +// - for preemption - ability to return granted slot back and reacquire it later. +// - for memory allocations - variable size of slots (in bytes). + +/// Number of slots +using SlotCount = UInt64; + +/// Unlimited number of slots +constexpr SlotCount UnlimitedSlots = std::numeric_limits::max(); + +/// Acquired slot holder. 
Slot is considered to be acquired as long as the object exists. +class IAcquiredSlot : public std::enable_shared_from_this, boost::noncopyable +{ +public: + virtual ~IAcquiredSlot() = default; +}; + +using AcquiredSlotPtr = std::shared_ptr; + +/// Request for allocation of slots from ISlotControl. +/// Allows for more slots to be acquired and the whole request to be canceled. +class ISlotAllocation : public std::enable_shared_from_this, boost::noncopyable +{ +public: + virtual ~ISlotAllocation() = default; + + /// Take one already granted slot if available. + [[nodiscard]] virtual AcquiredSlotPtr tryAcquire() = 0; + + /// Returns the number of granted slots for given allocation (i.e. available to be acquired) + virtual SlotCount grantedCount() const = 0; + + /// Returns the total number of slots allocated at the moment (acquired and granted) + virtual SlotCount allocatedCount() const = 0; +}; + +using SlotAllocationPtr = std::shared_ptr; + +class ISlotControl : boost::noncopyable +{ +public: + virtual ~ISlotControl() = default; + + // Allocate at least `min` and at most `max` slots. + // If not all `max` slots were successfully allocated, a "subscription" for later allocation is created + [[nodiscard]] virtual SlotAllocationPtr allocate(SlotCount min, SlotCount max) = 0; +}; + +} diff --git a/src/Common/LoggingFormatStringHelpers.cpp b/src/Common/LoggingFormatStringHelpers.cpp index b3353a59010..7cbef779f28 100644 --- a/src/Common/LoggingFormatStringHelpers.cpp +++ b/src/Common/LoggingFormatStringHelpers.cpp @@ -130,13 +130,12 @@ LogSeriesLimiter::LogSeriesLimiter(LoggerPtr logger_, size_t allowed_count_, tim if (last_time + interval_s_ <= now) { debug_message = fmt::format( - " (LogSeriesLimiter: on interval from {} to {} accepted series {} / {} for the logger {} : {})", + " (LogSeriesLimiter: on interval from {} to {} accepted series {} / {} for the logger {})", DateLUT::instance().timeToString(last_time), DateLUT::instance().timeToString(now), accepted_count, total_count, - logger->name(), - double(name_hash)); + logger->name()); register_as_first(); return; diff --git a/src/Common/PoolWithFailoverBase.h b/src/Common/PoolWithFailoverBase.h index 8fd83300eff..cf270c9dad0 100644 --- a/src/Common/PoolWithFailoverBase.h +++ b/src/Common/PoolWithFailoverBase.h @@ -66,7 +66,7 @@ public: , log(log_) { for (size_t i = 0;i < nested_pools.size(); ++i) - shared_pool_states[i].config_priority = nested_pools[i]->getPriority(); + shared_pool_states[i].config_priority = nested_pools[i]->getConfigPriority(); } struct TryResult @@ -133,7 +133,7 @@ protected: void updateErrorCounts(PoolStates & states, time_t & last_decrease_time) const; - std::vector getShuffledPools(size_t max_ignored_errors, const GetPriorityFunc & get_priority); + std::vector getShuffledPools(size_t max_ignored_errors, const GetPriorityFunc & get_priority, bool use_slowdown_count = false); inline void updateSharedErrorCounts(std::vector & shuffled_pools); @@ -160,7 +160,7 @@ protected: template std::vector::ShuffledPool> PoolWithFailoverBase::getShuffledPools( - size_t max_ignored_errors, const PoolWithFailoverBase::GetPriorityFunc & get_priority) + size_t max_ignored_errors, const PoolWithFailoverBase::GetPriorityFunc & get_priority, bool use_slowdown_count) { /// Update random numbers and error counts. 
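A minimal usage sketch (not part of this diff) of the ISlotControl / ConcurrencyControl interfaces introduced in the hunks above, for a query that wants up to `max_threads` worker threads but is guaranteed at least one. Only `instance()`, `allocate(min, max)` and `tryAcquire()` come from the code above; the include paths and the `work` callback are illustrative placeholders.

#include <functional>
#include <thread>
#include <vector>
#include <Common/ConcurrencyControl.h>

void runWithConcurrencyControl(size_t max_threads, const std::function<void()> & work)
{
    auto & control = DB::ConcurrencyControl::instance();

    // Allocate at least 1 and at most max_threads slots. Slots that cannot be granted
    // immediately are subscribed for and may be granted later, when other queries release theirs.
    DB::SlotAllocationPtr allocation = control.allocate(/* min = */ 1, /* max = */ max_threads);

    std::vector<std::thread> workers;

    // Upgrade granted slots to acquired ones; each acquired slot pins one worker thread.
    while (DB::AcquiredSlotPtr slot = allocation->tryAcquire())
        workers.emplace_back([slot, &work] { work(); }); // slot is released when the worker's copy goes away

    for (auto & worker : workers)
        worker.join();

    // Destroying `allocation` releases unused granted slots and cancels the
    // subscription for slots that were never granted.
}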
PoolStates pool_states = updatePoolStates(max_ignored_errors); @@ -175,13 +175,13 @@ PoolWithFailoverBase::getShuffledPools( std::vector shuffled_pools; shuffled_pools.reserve(nested_pools.size()); for (size_t i = 0; i < nested_pools.size(); ++i) - shuffled_pools.push_back(ShuffledPool{nested_pools[i], &pool_states[i], i, /* error_count = */ 0, /* slowdown_count = */ 0}); + shuffled_pools.emplace_back(ShuffledPool{.pool = nested_pools[i], .state = &pool_states[i], .index = i}); ::sort( shuffled_pools.begin(), shuffled_pools.end(), - [](const ShuffledPool & lhs, const ShuffledPool & rhs) + [use_slowdown_count](const ShuffledPool & lhs, const ShuffledPool & rhs) { - return PoolState::compare(*lhs.state, *rhs.state); + return PoolState::compare(*lhs.state, *rhs.state, use_slowdown_count); }); return shuffled_pools; @@ -344,10 +344,14 @@ struct PoolWithFailoverBase::PoolState random = rng(); } - static bool compare(const PoolState & lhs, const PoolState & rhs) + static bool compare(const PoolState & lhs, const PoolState & rhs, bool use_slowdown_count) { - return std::forward_as_tuple(lhs.error_count, lhs.slowdown_count, lhs.config_priority, lhs.priority, lhs.random) - < std::forward_as_tuple(rhs.error_count, rhs.slowdown_count, rhs.config_priority, rhs.priority, rhs.random); + if (use_slowdown_count) + return std::forward_as_tuple(lhs.error_count, lhs.slowdown_count, lhs.config_priority, lhs.priority, lhs.random) + < std::forward_as_tuple(rhs.error_count, rhs.slowdown_count, rhs.config_priority, rhs.priority, rhs.random); + else + return std::forward_as_tuple(lhs.error_count, lhs.config_priority, lhs.priority, lhs.random) + < std::forward_as_tuple(rhs.error_count, rhs.config_priority, rhs.priority, rhs.random); } private: diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index bdc5d2d88a8..f14223ec644 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -4,6 +4,8 @@ /// Available events. Add something here as you wish. +/// If the event is generic (i.e. not server specific) +/// it should be also added to src/Coordination/KeeperConstant.cpp #define APPLY_FOR_BUILTIN_EVENTS(M) \ M(Query, "Number of queries to be interpreted and potentially executed. Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. Does not count subqueries.") \ M(SelectQuery, "Same as Query, but only for SELECT queries.") \ @@ -310,7 +312,7 @@ The server successfully detected this situation and will download merged part fr M(ParallelReplicasStealingLeftoversMicroseconds, "Time spent collecting orphaned segments") \ M(ParallelReplicasCollectingOwnedSegmentsMicroseconds, "Time spent collecting segments meant by hash") \ \ - M(PerfCpuCycles, "Total cycles. Be wary of what happens during CPU frequency scaling.") \ + M(PerfCPUCycles, "Total cycles. Be wary of what happens during CPU frequency scaling.") \ M(PerfInstructions, "Retired instructions. Be careful, these can be affected by various issues, most notably hardware interrupt counts.") \ M(PerfCacheReferences, "Cache accesses. Usually, this indicates Last Level Cache accesses, but this may vary depending on your CPU. This may include prefetches and coherency messages; again this depends on the design of your CPU.") \ M(PerfCacheMisses, "Cache misses. 
Usually this indicates Last Level Cache misses; this is intended to be used in conjunction with the PERFCOUNTHWCACHEREFERENCES event to calculate cache miss rates.") \ @@ -319,12 +321,12 @@ The server successfully detected this situation and will download merged part fr M(PerfBusCycles, "Bus cycles, which can be different from total cycles.") \ M(PerfStalledCyclesFrontend, "Stalled cycles during issue.") \ M(PerfStalledCyclesBackend, "Stalled cycles during retirement.") \ - M(PerfRefCpuCycles, "Total cycles; not affected by CPU frequency scaling.") \ + M(PerfRefCPUCycles, "Total cycles; not affected by CPU frequency scaling.") \ \ - M(PerfCpuClock, "The CPU clock, a high-resolution per-CPU timer") \ + M(PerfCPUClock, "The CPU clock, a high-resolution per-CPU timer") \ M(PerfTaskClock, "A clock count specific to the task that is running") \ M(PerfContextSwitches, "Number of context switches") \ - M(PerfCpuMigrations, "Number of times the process has migrated to a new CPU") \ + M(PerfCPUMigrations, "Number of times the process has migrated to a new CPU") \ M(PerfAlignmentFaults, "Number of alignment faults. These happen when unaligned memory accesses happen; the kernel can handle these but it reduces performance. This happens only on some architectures (never on x86).") \ M(PerfEmulationFaults, "Number of emulation faults. The kernel sometimes traps on unimplemented instructions and emulates them for user space. This can negatively impact performance.") \ M(PerfMinEnabledTime, "For all events, minimum time that an event was enabled. Used to track event multiplexing influence") \ diff --git a/src/Common/TargetSpecific.cpp b/src/Common/TargetSpecific.cpp index b115d3a8734..49f396c0926 100644 --- a/src/Common/TargetSpecific.cpp +++ b/src/Common/TargetSpecific.cpp @@ -1,7 +1,7 @@ #include #include -#include +#include namespace DB { @@ -9,25 +9,25 @@ namespace DB UInt32 getSupportedArchs() { UInt32 result = 0; - if (Cpu::CpuFlagsCache::have_SSE42) + if (CPU::CPUFlagsCache::have_SSE42) result |= static_cast(TargetArch::SSE42); - if (Cpu::CpuFlagsCache::have_AVX) + if (CPU::CPUFlagsCache::have_AVX) result |= static_cast(TargetArch::AVX); - if (Cpu::CpuFlagsCache::have_AVX2) + if (CPU::CPUFlagsCache::have_AVX2) result |= static_cast(TargetArch::AVX2); - if (Cpu::CpuFlagsCache::have_AVX512F) + if (CPU::CPUFlagsCache::have_AVX512F) result |= static_cast(TargetArch::AVX512F); - if (Cpu::CpuFlagsCache::have_AVX512BW) + if (CPU::CPUFlagsCache::have_AVX512BW) result |= static_cast(TargetArch::AVX512BW); - if (Cpu::CpuFlagsCache::have_AVX512VBMI) + if (CPU::CPUFlagsCache::have_AVX512VBMI) result |= static_cast(TargetArch::AVX512VBMI); - if (Cpu::CpuFlagsCache::have_AVX512VBMI2) + if (CPU::CPUFlagsCache::have_AVX512VBMI2) result |= static_cast(TargetArch::AVX512VBMI2); - if (Cpu::CpuFlagsCache::have_AMXBF16) + if (CPU::CPUFlagsCache::have_AMXBF16) result |= static_cast(TargetArch::AMXBF16); - if (Cpu::CpuFlagsCache::have_AMXTILE) + if (CPU::CPUFlagsCache::have_AMXTILE) result |= static_cast(TargetArch::AMXTILE); - if (Cpu::CpuFlagsCache::have_AMXINT8) + if (CPU::CPUFlagsCache::have_AMXINT8) result |= static_cast(TargetArch::AMXINT8); return result; } diff --git a/src/Common/ThreadProfileEvents.cpp b/src/Common/ThreadProfileEvents.cpp index 990151d73ff..6a63d484cd9 100644 --- a/src/Common/ThreadProfileEvents.cpp +++ b/src/Common/ThreadProfileEvents.cpp @@ -6,10 +6,8 @@ #include "ProcfsMetricsProvider.h" #include "hasLinuxCapability.h" -#include #include #include -#include #include #include @@ -36,7 +34,7 @@ 
namespace ProfileEvents extern const Event OSReadBytes; extern const Event OSWriteBytes; - extern const Event PerfCpuCycles; + extern const Event PerfCPUCycles; extern const Event PerfInstructions; extern const Event PerfCacheReferences; extern const Event PerfCacheMisses; @@ -45,12 +43,12 @@ namespace ProfileEvents extern const Event PerfBusCycles; extern const Event PerfStalledCyclesFrontend; extern const Event PerfStalledCyclesBackend; - extern const Event PerfRefCpuCycles; + extern const Event PerfRefCPUCycles; - extern const Event PerfCpuClock; + extern const Event PerfCPUClock; extern const Event PerfTaskClock; extern const Event PerfContextSwitches; - extern const Event PerfCpuMigrations; + extern const Event PerfCPUMigrations; extern const Event PerfAlignmentFaults; extern const Event PerfEmulationFaults; extern const Event PerfMinEnabledTime; @@ -218,7 +216,7 @@ thread_local PerfEventsCounters current_thread_counters; // descriptions' source: http://man7.org/linux/man-pages/man2/perf_event_open.2.html static const PerfEventInfo raw_events_info[] = { - HARDWARE_EVENT(PERF_COUNT_HW_CPU_CYCLES, PerfCpuCycles), + HARDWARE_EVENT(PERF_COUNT_HW_CPU_CYCLES, PerfCPUCycles), HARDWARE_EVENT(PERF_COUNT_HW_INSTRUCTIONS, PerfInstructions), HARDWARE_EVENT(PERF_COUNT_HW_CACHE_REFERENCES, PerfCacheReferences), HARDWARE_EVENT(PERF_COUNT_HW_CACHE_MISSES, PerfCacheMisses), @@ -227,13 +225,13 @@ static const PerfEventInfo raw_events_info[] = { HARDWARE_EVENT(PERF_COUNT_HW_BUS_CYCLES, PerfBusCycles), HARDWARE_EVENT(PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, PerfStalledCyclesFrontend), HARDWARE_EVENT(PERF_COUNT_HW_STALLED_CYCLES_BACKEND, PerfStalledCyclesBackend), - HARDWARE_EVENT(PERF_COUNT_HW_REF_CPU_CYCLES, PerfRefCpuCycles), + HARDWARE_EVENT(PERF_COUNT_HW_REF_CPU_CYCLES, PerfRefCPUCycles), // `cpu-clock` is a bit broken according to this: https://stackoverflow.com/a/56967896 - SOFTWARE_EVENT(PERF_COUNT_SW_CPU_CLOCK, PerfCpuClock), + SOFTWARE_EVENT(PERF_COUNT_SW_CPU_CLOCK, PerfCPUClock), SOFTWARE_EVENT(PERF_COUNT_SW_TASK_CLOCK, PerfTaskClock), SOFTWARE_EVENT(PERF_COUNT_SW_CONTEXT_SWITCHES, PerfContextSwitches), - SOFTWARE_EVENT(PERF_COUNT_SW_CPU_MIGRATIONS, PerfCpuMigrations), + SOFTWARE_EVENT(PERF_COUNT_SW_CPU_MIGRATIONS, PerfCPUMigrations), SOFTWARE_EVENT(PERF_COUNT_SW_ALIGNMENT_FAULTS, PerfAlignmentFaults), SOFTWARE_EVENT(PERF_COUNT_SW_EMULATION_FAULTS, PerfEmulationFaults), diff --git a/src/Common/tests/gtest_async_loader.cpp b/src/Common/tests/gtest_async_loader.cpp index 950c7bbab76..fc2537abcfc 100644 --- a/src/Common/tests/gtest_async_loader.cpp +++ b/src/Common/tests/gtest_async_loader.cpp @@ -427,9 +427,7 @@ TEST(AsyncLoader, CancelExecutingTask) } } -// This test is disabled due to `MemorySanitizer: use-of-uninitialized-value` issue in `collectSymbolsFromProgramHeaders` function -// More details: https://github.com/ClickHouse/ClickHouse/pull/48923#issuecomment-1545415482 -TEST(AsyncLoader, DISABLED_JobFailure) +TEST(AsyncLoader, JobFailure) { AsyncLoaderTest t; t.loader.start(); diff --git a/src/Common/tests/gtest_concurrency_control.cpp b/src/Common/tests/gtest_concurrency_control.cpp index 8e5b89a72a0..5e579317ade 100644 --- a/src/Common/tests/gtest_concurrency_control.cpp +++ b/src/Common/tests/gtest_concurrency_control.cpp @@ -15,7 +15,7 @@ struct ConcurrencyControlTest { ConcurrencyControl cc; - explicit ConcurrencyControlTest(ConcurrencyControl::SlotCount limit = ConcurrencyControl::Unlimited) + explicit ConcurrencyControlTest(SlotCount limit = UnlimitedSlots) { 
cc.setMaxConcurrency(limit); } @@ -25,7 +25,7 @@ TEST(ConcurrencyControl, Unlimited) { ConcurrencyControlTest t; // unlimited number of slots auto slots = t.cc.allocate(0, 100500); - std::vector acquired; + std::vector acquired; while (auto slot = slots->tryAcquire()) acquired.emplace_back(std::move(slot)); ASSERT_TRUE(acquired.size() == 100500); @@ -34,14 +34,14 @@ TEST(ConcurrencyControl, Unlimited) TEST(ConcurrencyControl, Fifo) { ConcurrencyControlTest t(1); // use single slot - std::vector allocations; + std::vector allocations; constexpr int count = 42; allocations.reserve(count); for (int i = 0; i < count; i++) allocations.emplace_back(t.cc.allocate(0, 1)); for (int i = 0; i < count; i++) { - ConcurrencyControl::SlotPtr holder; + AcquiredSlotPtr holder; for (int j = 0; j < count; j++) { auto slot = allocations[j]->tryAcquire(); @@ -60,11 +60,11 @@ TEST(ConcurrencyControl, Fifo) TEST(ConcurrencyControl, Oversubscription) { ConcurrencyControlTest t(10); - std::vector allocations; + std::vector allocations; allocations.reserve(10); for (int i = 0; i < 10; i++) allocations.emplace_back(t.cc.allocate(1, 2)); - std::vector slots; + std::vector slots; // Normal allocation using maximum amount of slots for (int i = 0; i < 5; i++) { @@ -90,7 +90,7 @@ TEST(ConcurrencyControl, ReleaseUnacquiredSlots) { ConcurrencyControlTest t(10); { - std::vector allocations; + std::vector allocations; allocations.reserve(10); for (int i = 0; i < 10; i++) allocations.emplace_back(t.cc.allocate(1, 2)); @@ -98,7 +98,7 @@ TEST(ConcurrencyControl, ReleaseUnacquiredSlots) } // Check that slots were actually released auto allocation = t.cc.allocate(0, 20); - std::vector acquired; + std::vector acquired; while (auto slot = allocation->tryAcquire()) acquired.emplace_back(std::move(slot)); ASSERT_TRUE(acquired.size() == 10); @@ -110,7 +110,7 @@ TEST(ConcurrencyControl, DestroyNotFullyAllocatedAllocation) for (int i = 0; i < 3; i++) { auto allocation = t.cc.allocate(5, 20); - std::vector acquired; + std::vector acquired; while (auto slot = allocation->tryAcquire()) acquired.emplace_back(std::move(slot)); ASSERT_TRUE(acquired.size() == 10); @@ -122,7 +122,7 @@ TEST(ConcurrencyControl, DestroyAllocationBeforeSlots) ConcurrencyControlTest t(10); for (int i = 0; i < 3; i++) { - std::vector acquired; + std::vector acquired; auto allocation = t.cc.allocate(5, 20); while (auto slot = allocation->tryAcquire()) acquired.emplace_back(std::move(slot)); @@ -135,7 +135,7 @@ TEST(ConcurrencyControl, GrantReleasedToTheSameAllocation) { ConcurrencyControlTest t(3); auto allocation = t.cc.allocate(0, 10); - std::list acquired; + std::list acquired; while (auto slot = allocation->tryAcquire()) acquired.emplace_back(std::move(slot)); ASSERT_TRUE(acquired.size() == 3); // 0 1 2 @@ -183,7 +183,7 @@ TEST(ConcurrencyControl, SetSlotCount) { ConcurrencyControlTest t(10); auto allocation = t.cc.allocate(5, 30); - std::vector acquired; + std::vector acquired; while (auto slot = allocation->tryAcquire()) acquired.emplace_back(std::move(slot)); ASSERT_TRUE(acquired.size() == 10); @@ -200,7 +200,7 @@ TEST(ConcurrencyControl, SetSlotCount) ASSERT_TRUE(acquired.size() == 5); // Check that newly added slots are equally distributed over waiting allocations - std::vector acquired2; + std::vector acquired2; auto allocation2 = t.cc.allocate(0, 30); ASSERT_TRUE(!allocation->tryAcquire()); t.cc.setMaxConcurrency(15); // 10 slots added: 5 to the first allocation and 5 to the second one @@ -224,7 +224,7 @@ TEST(ConcurrencyControl, MultipleThreads) auto 
run_query = [&] (size_t max_threads) { - ConcurrencyControl::AllocationPtr slots = t.cc.allocate(1, max_threads); + SlotAllocationPtr slots = t.cc.allocate(1, max_threads); std::mutex threads_mutex; std::vector threads; threads.reserve(max_threads); diff --git a/src/Interpreters/threadPoolCallbackRunner.h b/src/Common/threadPoolCallbackRunner.h similarity index 100% rename from src/Interpreters/threadPoolCallbackRunner.h rename to src/Common/threadPoolCallbackRunner.h diff --git a/src/Compression/CompressionCodecT64.cpp b/src/Compression/CompressionCodecT64.cpp index bf9a9414bc1..3ddc56fe4f6 100644 --- a/src/Compression/CompressionCodecT64.cpp +++ b/src/Compression/CompressionCodecT64.cpp @@ -91,6 +91,7 @@ enum class MagicNumber : uint8_t Decimal32 = 19, Decimal64 = 20, IPv4 = 21, + Date32 = 22, }; MagicNumber serializeTypeId(std::optional type_id) @@ -109,6 +110,7 @@ MagicNumber serializeTypeId(std::optional type_id) case TypeIndex::Int32: return MagicNumber::Int32; case TypeIndex::Int64: return MagicNumber::Int64; case TypeIndex::Date: return MagicNumber::Date; + case TypeIndex::Date32: return MagicNumber::Date32; case TypeIndex::DateTime: return MagicNumber::DateTime; case TypeIndex::DateTime64: return MagicNumber::DateTime64; case TypeIndex::Enum8: return MagicNumber::Enum8; @@ -137,6 +139,7 @@ TypeIndex deserializeTypeId(uint8_t serialized_type_id) case MagicNumber::Int32: return TypeIndex::Int32; case MagicNumber::Int64: return TypeIndex::Int64; case MagicNumber::Date: return TypeIndex::Date; + case MagicNumber::Date32: return TypeIndex::Date32; case MagicNumber::DateTime: return TypeIndex::DateTime; case MagicNumber::DateTime64: return TypeIndex::DateTime64; case MagicNumber::Enum8: return TypeIndex::Enum8; @@ -165,6 +168,7 @@ TypeIndex baseType(TypeIndex type_idx) return TypeIndex::Int16; case TypeIndex::Int32: case TypeIndex::Decimal32: + case TypeIndex::Date32: return TypeIndex::Int32; case TypeIndex::Int64: case TypeIndex::Decimal64: @@ -205,6 +209,7 @@ TypeIndex typeIdx(const IDataType * data_type) case TypeIndex::UInt16: case TypeIndex::Enum16: case TypeIndex::Date: + case TypeIndex::Date32: case TypeIndex::Int32: case TypeIndex::UInt32: case TypeIndex::IPv4: diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 5a58932606e..cf0ea2193c8 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -1,5 +1,7 @@ #include #include +#include +#include #include #include #include @@ -33,23 +35,15 @@ namespace ErrorCodes namespace { -constexpr std::string_view tmp_prefix = "tmp_"; - -void moveFileBetweenDisks(DiskPtr disk_from, ChangelogFileDescriptionPtr description, DiskPtr disk_to, const std::string & path_to) +void moveChangelogBetweenDisks( + DiskPtr disk_from, + ChangelogFileDescriptionPtr description, + DiskPtr disk_to, + const std::string & path_to, + const KeeperContextPtr & keeper_context) { - /// we use empty file with prefix tmp_ to detect incomplete copies - /// if a copy is complete we don't care from which disk we use the same file - /// so it's okay if a failure happens after removing of tmp file but before we remove - /// the changelog from the source disk - auto from_path = fs::path(description->path); - auto tmp_changelog_name = from_path.parent_path() / (std::string{tmp_prefix} + from_path.filename().string()); - { - auto buf = disk_to->writeFile(tmp_changelog_name); - buf->finalize(); - } - disk_from->copyFile(from_path, *disk_to, path_to, {}); - disk_to->removeFile(tmp_changelog_name); - 
disk_from->removeFile(description->path); + moveFileBetweenDisks( + disk_from, description->path, disk_to, path_to, getLogger("Changelog"), keeper_context); description->path = path_to; description->disk = disk_to; } @@ -173,7 +167,7 @@ public: } else { - moveFileBetweenDisks(log_disk, current_file_description, disk, new_path); + moveChangelogBetweenDisks(log_disk, current_file_description, disk, new_path, keeper_context); } } } @@ -196,7 +190,7 @@ public: } catch (...) { - tryLogCurrentException(log); + tryLogCurrentException(log, "While setting new changelog file"); throw; } } @@ -648,9 +642,9 @@ Changelog::Changelog( if (file_name == changelogs_detached_dir) continue; - if (file_name.starts_with(tmp_prefix)) + if (file_name.starts_with(tmp_keeper_file_prefix)) { - incomplete_files.emplace(file_name.substr(tmp_prefix.size()), it->path()); + incomplete_files.emplace(file_name.substr(tmp_keeper_file_prefix.size()), it->path()); continue; } @@ -813,7 +807,7 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin auto disk = getDisk(); if (latest_log_disk != disk && latest_log_disk == description->disk) - moveFileBetweenDisks(latest_log_disk, description, disk, description->path); + moveChangelogBetweenDisks(latest_log_disk, description, disk, description->path, keeper_context); }; /// we can have empty log (with zero entries) and last_log_read_result will be initialized @@ -899,7 +893,7 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin } if (description->disk != disk) - moveFileBetweenDisks(description->disk, description, disk, description->path); + moveChangelogBetweenDisks(description->disk, description, disk, description->path, keeper_context); } @@ -921,7 +915,7 @@ void Changelog::initWriter(ChangelogFileDescriptionPtr description) auto log_disk = description->disk; auto latest_log_disk = getLatestLogDisk(); if (log_disk != latest_log_disk) - moveFileBetweenDisks(log_disk, description, latest_log_disk, description->path); + moveChangelogBetweenDisks(log_disk, description, latest_log_disk, description->path, keeper_context); current_writer->setFile(std::move(description), WriteMode::Append); } @@ -984,11 +978,11 @@ void Changelog::removeExistingLogs(ChangelogIter begin, ChangelogIter end) catch (const DB::Exception & e) { if (e.code() == DB::ErrorCodes::NOT_IMPLEMENTED) - moveFileBetweenDisks(changelog_disk, changelog_description, disk, new_path); + moveChangelogBetweenDisks(changelog_disk, changelog_description, disk, new_path, keeper_context); } } else - moveFileBetweenDisks(changelog_disk, changelog_description, disk, new_path); + moveChangelogBetweenDisks(changelog_disk, changelog_description, disk, new_path, keeper_context); itr = existing_changelogs.erase(itr); } @@ -1085,70 +1079,78 @@ void Changelog::writeThread() LOG_WARNING(log, "Changelog is shut down"); }; - /// NuRaft writes a batch of request by first calling multiple store requests, i.e. AppendLog - /// finished by a flush request - /// We assume that after some number of appends, we always get flush request - while (true) + try { - if (try_batch_flush) + /// NuRaft writes a batch of request by first calling multiple store requests, i.e. 
AppendLog + /// finished by a flush request + /// We assume that after some number of appends, we always get flush request + while (true) { - try_batch_flush = false; - /// we have Flush request stored in write operation - /// but we try to get new append operations - /// if there are none, we apply the currently set Flush - chassert(std::holds_alternative(write_operation)); - if (!write_operations.tryPop(write_operation)) + if (try_batch_flush) { - chassert(batch_append_ok); - const auto & flush = std::get(write_operation); - flush_logs(flush); - notify_append_completion(); - if (!write_operations.pop(write_operation)) - break; - } - } - else if (!write_operations.pop(write_operation)) - { - break; - } - - assert(initialized); - - if (auto * append_log = std::get_if(&write_operation)) - { - if (!batch_append_ok) - continue; - - std::lock_guard writer_lock(writer_mutex); - assert(current_writer); - - batch_append_ok = current_writer->appendRecord(buildRecord(append_log->index, append_log->log_entry)); - ++pending_appends; - } - else - { - const auto & flush = std::get(write_operation); - - if (batch_append_ok) - { - /// we can try batching more logs for flush - if (pending_appends < flush_settings.max_flush_batch_size) + try_batch_flush = false; + /// we have Flush request stored in write operation + /// but we try to get new append operations + /// if there are none, we apply the currently set Flush + chassert(std::holds_alternative(write_operation)); + if (!write_operations.tryPop(write_operation)) { - try_batch_flush = true; - continue; + chassert(batch_append_ok); + const auto & flush = std::get(write_operation); + flush_logs(flush); + notify_append_completion(); + if (!write_operations.pop(write_operation)) + break; } - /// we need to flush because we have maximum allowed pending records - flush_logs(flush); + } + else if (!write_operations.pop(write_operation)) + { + break; + } + + assert(initialized); + + if (auto * append_log = std::get_if(&write_operation)) + { + if (!batch_append_ok) + continue; + + std::lock_guard writer_lock(writer_mutex); + assert(current_writer); + + batch_append_ok = current_writer->appendRecord(buildRecord(append_log->index, append_log->log_entry)); + ++pending_appends; } else { - std::lock_guard lock{durable_idx_mutex}; - *flush.failed = true; + const auto & flush = std::get(write_operation); + + if (batch_append_ok) + { + /// we can try batching more logs for flush + if (pending_appends < flush_settings.max_flush_batch_size) + { + try_batch_flush = true; + continue; + } + /// we need to flush because we have maximum allowed pending records + flush_logs(flush); + } + else + { + std::lock_guard lock{durable_idx_mutex}; + *flush.failed = true; + } + notify_append_completion(); + batch_append_ok = true; } - notify_append_completion(); - batch_append_ok = true; } } + catch (...) 
+ { + tryLogCurrentException(log, "Write thread failed, aborting"); + std::abort(); + } } @@ -1191,7 +1193,7 @@ void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry) auto log_disk = description->disk; auto latest_log_disk = getLatestLogDisk(); if (log_disk != latest_log_disk) - moveFileBetweenDisks(log_disk, description, latest_log_disk, description->path); + moveChangelogBetweenDisks(log_disk, description, latest_log_disk, description->path, keeper_context); current_writer->setFile(std::move(description), WriteMode::Append); diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h index a58f2b04797..358c6c4097e 100644 --- a/src/Coordination/CoordinationSettings.h +++ b/src/Coordination/CoordinationSettings.h @@ -52,7 +52,10 @@ struct Settings; M(UInt64, log_file_overallocate_size, 50 * 1024 * 1024, "If max_log_file_size is not set to 0, this value will be added to it for preallocating bytes on disk. If a log record is larger than this value, it could lead to uncaught out-of-space issues so a larger value is preferred", 0) \ M(UInt64, min_request_size_for_cache, 50 * 1024, "Minimal size of the request to cache the deserialization result. Caching can have negative effect on latency for smaller requests, set to 0 to disable", 0) \ M(UInt64, raft_limits_reconnect_limit, 50, "If connection to a peer is silent longer than this limit * (multiplied by heartbeat interval), we re-establish the connection.", 0) \ - M(Bool, async_replication, false, "Enable async replication. All write and read guarantees are preserved while better performance is achieved. Settings is disabled by default to not break backwards compatibility.", 0) + M(Bool, async_replication, false, "Enable async replication. All write and read guarantees are preserved while better performance is achieved. 
Settings is disabled by default to not break backwards compatibility.", 0) \ + M(UInt64, disk_move_retries_wait_ms, 1000, "How long to wait between retries after a failure which happened while a file was being moved between disks.", 0) \ + M(UInt64, disk_move_retries_during_init, 100, "The amount of retries after a failure which happened while a file was being moved between disks during initialization.", 0) + DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS) diff --git a/src/Coordination/KeeperAsynchronousMetrics.cpp b/src/Coordination/KeeperAsynchronousMetrics.cpp index 8f6e1dec6c1..96d4df39721 100644 --- a/src/Coordination/KeeperAsynchronousMetrics.cpp +++ b/src/Coordination/KeeperAsynchronousMetrics.cpp @@ -20,7 +20,6 @@ void updateKeeperInformation(KeeperDispatcher & keeper_dispatcher, AsynchronousM size_t ephemerals_count = 0; size_t approximate_data_size = 0; size_t key_arena_size = 0; - size_t latest_snapshot_size = 0; size_t open_file_descriptor_count = 0; std::optional max_file_descriptor_count = 0; size_t followers = 0; @@ -46,11 +45,8 @@ void updateKeeperInformation(KeeperDispatcher & keeper_dispatcher, AsynchronousM ephemerals_count = state_machine.getTotalEphemeralNodesCount(); approximate_data_size = state_machine.getApproximateDataSize(); key_arena_size = state_machine.getKeyArenaSize(); - latest_snapshot_size = state_machine.getLatestSnapshotBufSize(); session_with_watches = state_machine.getSessionsWithWatchesCount(); paths_watched = state_machine.getWatchedPathsCount(); - //snapshot_dir_size = keeper_dispatcher.getSnapDirSize(); - //log_dir_size = keeper_dispatcher.getLogDirSize(); # if defined(__linux__) || defined(__APPLE__) open_file_descriptor_count = getCurrentProcessFDCount(); @@ -76,7 +72,9 @@ void updateKeeperInformation(KeeperDispatcher & keeper_dispatcher, AsynchronousM new_values["KeeperApproximateDataSize"] = { approximate_data_size, "The approximate data size of ClickHouse Keeper, in bytes." }; new_values["KeeperKeyArenaSize"] = { key_arena_size, "The size in bytes of the memory arena for keys in ClickHouse Keeper." }; - new_values["KeeperLatestSnapshotSize"] = { latest_snapshot_size, "The uncompressed size in bytes of the latest snapshot created by ClickHouse Keeper." }; + /// TODO: value was incorrectly set to 0 previously for local snapshots + /// it needs to be fixed and it needs to be atomic to avoid deadlock + ///new_values["KeeperLatestSnapshotSize"] = { latest_snapshot_size, "The uncompressed size in bytes of the latest snapshot created by ClickHouse Keeper." }; new_values["KeeperOpenFileDescriptorCount"] = { open_file_descriptor_count, "The number of open file descriptors in ClickHouse Keeper." 
}; if (max_file_descriptor_count.has_value()) diff --git a/src/Coordination/KeeperCommon.cpp b/src/Coordination/KeeperCommon.cpp new file mode 100644 index 00000000000..820abf1bbbe --- /dev/null +++ b/src/Coordination/KeeperCommon.cpp @@ -0,0 +1,118 @@ +#include + +#include +#include + +#include +#include +#include +#include + +namespace DB +{ + +static size_t findLastSlash(StringRef path) +{ + if (path.size == 0) + return std::string::npos; + + for (size_t i = path.size - 1; i > 0; --i) + { + if (path.data[i] == '/') + return i; + } + + if (path.data[0] == '/') + return 0; + + return std::string::npos; +} + +StringRef parentNodePath(StringRef path) +{ + auto rslash_pos = findLastSlash(path); + if (rslash_pos > 0) + return StringRef{path.data, rslash_pos}; + return "/"; +} + +StringRef getBaseNodeName(StringRef path) +{ + size_t basename_start = findLastSlash(path); + return StringRef{path.data + basename_start + 1, path.size - basename_start - 1}; +} + +void moveFileBetweenDisks( + DiskPtr disk_from, + const std::string & path_from, + DiskPtr disk_to, + const std::string & path_to, + LoggerPtr logger, + const KeeperContextPtr & keeper_context) +{ + LOG_TRACE(logger, "Moving {} to {} from disk {} to disk {}", path_from, path_to, disk_from->getName(), disk_to->getName()); + /// we use empty file with prefix tmp_ to detect incomplete copies + /// if a copy is complete we don't care from which disk we use the same file + /// so it's okay if a failure happens after removing of tmp file but before we remove + /// the file from the source disk + auto from_path = fs::path(path_from); + auto tmp_file_name = from_path.parent_path() / (std::string{tmp_keeper_file_prefix} + from_path.filename().string()); + + const auto & coordination_settings = keeper_context->getCoordinationSettings(); + auto max_retries_on_init = coordination_settings->disk_move_retries_during_init.value; + auto retries_sleep = std::chrono::milliseconds(coordination_settings->disk_move_retries_wait_ms); + auto run_with_retries = [&](const auto & op, std::string_view operation_description) + { + size_t retry_num = 0; + do + { + try + { + op(); + return true; + } + catch (...) 
+ { + tryLogCurrentException( + logger, + fmt::format( + "While moving file {} to disk {} and running '{}'", path_from, disk_to->getName(), operation_description)); + std::this_thread::sleep_for(retries_sleep); + } + + ++retry_num; + if (keeper_context->getServerState() == KeeperContext::Phase::INIT && retry_num == max_retries_on_init) + { + LOG_ERROR(logger, "Operation '{}' failed too many times", operation_description); + break; + } + } while (!keeper_context->isShutdownCalled()); + + LOG_ERROR( + logger, + "Failed to run '{}' while moving file {} to disk {}", + operation_description, + path_from, + disk_to->getName()); + return false; + }; + + if (!run_with_retries( + [&] + { + auto buf = disk_to->writeFile(tmp_file_name); + buf->finalize(); + }, + "creating temporary file")) + return; + + if (!run_with_retries([&] { disk_from->copyFile(from_path, *disk_to, path_to, {}); }, "copying file")) + return; + + if (!run_with_retries([&] { disk_to->removeFileIfExists(tmp_file_name); }, "removing temporary file")) + return; + + if (!run_with_retries([&] { disk_from->removeFileIfExists(path_from); }, "removing file from source disk")) + return; +} +} diff --git a/src/Coordination/KeeperCommon.h b/src/Coordination/KeeperCommon.h new file mode 100644 index 00000000000..179d80b295f --- /dev/null +++ b/src/Coordination/KeeperCommon.h @@ -0,0 +1,28 @@ +#pragma once + +#include +#include "Common/Logger.h" + +namespace DB +{ + +class IDisk; +using DiskPtr = std::shared_ptr; +class KeeperContext; +using KeeperContextPtr = std::shared_ptr; + +StringRef parentNodePath(StringRef path); + +StringRef getBaseNodeName(StringRef path); + +inline static constexpr std::string_view tmp_keeper_file_prefix = "tmp_"; + +void moveFileBetweenDisks( + DiskPtr disk_from, + const std::string & path_from, + DiskPtr disk_to, + const std::string & path_to, + LoggerPtr logger, + const KeeperContextPtr & keeper_context); + +} diff --git a/src/Coordination/KeeperConstants.cpp b/src/Coordination/KeeperConstants.cpp new file mode 100644 index 00000000000..f788095334e --- /dev/null +++ b/src/Coordination/KeeperConstants.cpp @@ -0,0 +1,376 @@ +#include +#include + +/// Events which are useful for Keeper. +/// New events should be added manually. 
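An illustrative distillation (not part of this diff) of the move-with-marker pattern used by moveFileBetweenDisks in KeeperCommon.cpp above, written against plain std::filesystem instead of ClickHouse's IDisk abstraction. The empty `tmp_`-prefixed marker flags an in-flight copy so an interrupted move can be detected later; every step is retried. The real helper additionally honours disk_move_retries_wait_ms, disk_move_retries_during_init and the Keeper shutdown flag, whereas the retry count and sleep are hard-coded here for brevity.

#include <chrono>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <thread>

namespace fs = std::filesystem;

bool moveFileWithMarker(const fs::path & from, const fs::path & to, size_t max_retries = 100)
{
    auto with_retries = [&](const auto & op, const char * what)
    {
        for (size_t attempt = 0; attempt < max_retries; ++attempt)
        {
            try
            {
                op();
                return true;
            }
            catch (const std::exception & e)
            {
                std::cerr << "Retrying '" << what << "': " << e.what() << '\n';
                std::this_thread::sleep_for(std::chrono::milliseconds(1000));
            }
        }
        return false;
    };

    // An empty "tmp_<name>" file next to the destination marks the copy as incomplete.
    const fs::path marker = to.parent_path() / ("tmp_" + to.filename().string());

    return with_retries([&]
               {
                   std::ofstream f(marker);
                   if (!f)
                       throw std::runtime_error("cannot create marker file");
               }, "creating marker")
        && with_retries([&] { fs::copy_file(from, to, fs::copy_options::overwrite_existing); }, "copying file")
        && with_retries([&] { fs::remove(marker); }, "removing marker")
        && with_retries([&] { fs::remove(from); }, "removing source file");
}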
+#define APPLY_FOR_KEEPER_PROFILE_EVENTS(M) \ + M(FileOpen) \ + M(Seek) \ + M(ReadBufferFromFileDescriptorRead) \ + M(ReadBufferFromFileDescriptorReadFailed) \ + M(ReadBufferFromFileDescriptorReadBytes) \ + M(WriteBufferFromFileDescriptorWrite) \ + M(WriteBufferFromFileDescriptorWriteFailed) \ + M(WriteBufferFromFileDescriptorWriteBytes) \ + M(FileSync) \ + M(DirectorySync) \ + M(FileSyncElapsedMicroseconds) \ + M(DirectorySyncElapsedMicroseconds) \ + M(ReadCompressedBytes) \ + M(CompressedReadBufferBlocks) \ + M(CompressedReadBufferBytes) \ + M(AIOWrite) \ + M(AIOWriteBytes) \ + M(AIORead) \ + M(AIOReadBytes) \ + M(IOBufferAllocs) \ + M(IOBufferAllocBytes) \ + M(ArenaAllocChunks) \ + M(ArenaAllocBytes) \ + M(CreatedReadBufferOrdinary) \ + M(CreatedReadBufferDirectIO) \ + M(CreatedReadBufferDirectIOFailed) \ + M(CreatedReadBufferMMap) \ + M(CreatedReadBufferMMapFailed) \ + M(DiskReadElapsedMicroseconds) \ + M(DiskWriteElapsedMicroseconds) \ + M(NetworkReceiveElapsedMicroseconds) \ + M(NetworkSendElapsedMicroseconds) \ + M(NetworkReceiveBytes) \ + M(NetworkSendBytes) \ +\ + M(DiskS3GetRequestThrottlerCount) \ + M(DiskS3GetRequestThrottlerSleepMicroseconds) \ + M(DiskS3PutRequestThrottlerCount) \ + M(DiskS3PutRequestThrottlerSleepMicroseconds) \ + M(S3GetRequestThrottlerCount) \ + M(S3GetRequestThrottlerSleepMicroseconds) \ + M(S3PutRequestThrottlerCount) \ + M(S3PutRequestThrottlerSleepMicroseconds) \ + M(RemoteReadThrottlerBytes) \ + M(RemoteReadThrottlerSleepMicroseconds) \ + M(RemoteWriteThrottlerBytes) \ + M(RemoteWriteThrottlerSleepMicroseconds) \ + M(LocalReadThrottlerBytes) \ + M(LocalReadThrottlerSleepMicroseconds) \ + M(LocalWriteThrottlerBytes) \ + M(LocalWriteThrottlerSleepMicroseconds) \ + M(ThrottlerSleepMicroseconds) \ +\ + M(SlowRead) \ + M(ReadBackoff) \ +\ + M(ContextLock) \ + M(ContextLockWaitMicroseconds) \ +\ + M(RWLockAcquiredReadLocks) \ + M(RWLockAcquiredWriteLocks) \ + M(RWLockReadersWaitMilliseconds) \ + M(RWLockWritersWaitMilliseconds) \ + M(DNSError) \ + M(RealTimeMicroseconds) \ + M(UserTimeMicroseconds) \ + M(SystemTimeMicroseconds) \ + M(MemoryOvercommitWaitTimeMicroseconds) \ + M(MemoryAllocatorPurge) \ + M(MemoryAllocatorPurgeTimeMicroseconds) \ + M(SoftPageFaults) \ + M(HardPageFaults) \ +\ + M(OSIOWaitMicroseconds) \ + M(OSCPUWaitMicroseconds) \ + M(OSCPUVirtualTimeMicroseconds) \ + M(OSReadBytes) \ + M(OSWriteBytes) \ + M(OSReadChars) \ + M(OSWriteChars) \ +\ + M(PerfCPUCycles) \ + M(PerfInstructions) \ + M(PerfCacheReferences) \ + M(PerfCacheMisses) \ + M(PerfBranchInstructions) \ + M(PerfBranchMisses) \ + M(PerfBusCycles) \ + M(PerfStalledCyclesFrontend) \ + M(PerfStalledCyclesBackend) \ + M(PerfRefCPUCycles) \ +\ + M(PerfCPUClock) \ + M(PerfTaskClock) \ + M(PerfContextSwitches) \ + M(PerfCPUMigrations) \ + M(PerfAlignmentFaults) \ + M(PerfEmulationFaults) \ + M(PerfMinEnabledTime) \ + M(PerfMinEnabledRunningTime) \ + M(PerfDataTLBReferences) \ + M(PerfDataTLBMisses) \ + M(PerfInstructionTLBReferences) \ + M(PerfInstructionTLBMisses) \ + M(PerfLocalMemoryReferences) \ + M(PerfLocalMemoryMisses) \ +\ + M(CreatedHTTPConnections) \ + M(CannotWriteToWriteBufferDiscard) \ +\ + M(S3ReadMicroseconds) \ + M(S3ReadRequestsCount) \ + M(S3ReadRequestsErrors) \ + M(S3ReadRequestsThrottling) \ + M(S3ReadRequestsRedirects) \ +\ + M(S3WriteMicroseconds) \ + M(S3WriteRequestsCount) \ + M(S3WriteRequestsErrors) \ + M(S3WriteRequestsThrottling) \ + M(S3WriteRequestsRedirects) \ +\ + M(DiskS3ReadMicroseconds) \ + M(DiskS3ReadRequestsCount) \ + M(DiskS3ReadRequestsErrors) \ 
+ M(DiskS3ReadRequestsThrottling) \ + M(DiskS3ReadRequestsRedirects) \ +\ + M(DiskS3WriteMicroseconds) \ + M(DiskS3WriteRequestsCount) \ + M(DiskS3WriteRequestsErrors) \ + M(DiskS3WriteRequestsThrottling) \ + M(DiskS3WriteRequestsRedirects) \ +\ + M(S3DeleteObjects) \ + M(S3CopyObject) \ + M(S3ListObjects) \ + M(S3HeadObject) \ + M(S3GetObjectAttributes) \ + M(S3CreateMultipartUpload) \ + M(S3UploadPartCopy) \ + M(S3UploadPart) \ + M(S3AbortMultipartUpload) \ + M(S3CompleteMultipartUpload) \ + M(S3PutObject) \ + M(S3GetObject) \ +\ + M(AzureUploadPart) \ + M(DiskAzureUploadPart) \ + M(AzureCopyObject) \ + M(DiskAzureCopyObject) \ + M(AzureDeleteObjects) \ + M(AzureListObjects) \ +\ + M(DiskS3DeleteObjects) \ + M(DiskS3CopyObject) \ + M(DiskS3ListObjects) \ + M(DiskS3HeadObject) \ + M(DiskS3GetObjectAttributes) \ + M(DiskS3CreateMultipartUpload) \ + M(DiskS3UploadPartCopy) \ + M(DiskS3UploadPart) \ + M(DiskS3AbortMultipartUpload) \ + M(DiskS3CompleteMultipartUpload) \ + M(DiskS3PutObject) \ + M(DiskS3GetObject) \ +\ + M(S3Clients) \ + M(TinyS3Clients) \ +\ + M(ReadBufferFromS3Microseconds) \ + M(ReadBufferFromS3InitMicroseconds) \ + M(ReadBufferFromS3Bytes) \ + M(ReadBufferFromS3RequestsErrors) \ + M(ReadBufferFromS3ResetSessions) \ + M(ReadBufferFromS3PreservedSessions) \ +\ + M(ReadWriteBufferFromHTTPPreservedSessions) \ +\ + M(WriteBufferFromS3Microseconds) \ + M(WriteBufferFromS3Bytes) \ + M(WriteBufferFromS3RequestsErrors) \ + M(WriteBufferFromS3WaitInflightLimitMicroseconds) \ + M(RemoteFSSeeks) \ + M(RemoteFSPrefetches) \ + M(RemoteFSCancelledPrefetches) \ + M(RemoteFSUnusedPrefetches) \ + M(RemoteFSPrefetchedReads) \ + M(RemoteFSPrefetchedBytes) \ + M(RemoteFSUnprefetchedReads) \ + M(RemoteFSUnprefetchedBytes) \ + M(RemoteFSLazySeeks) \ + M(RemoteFSSeeksWithReset) \ + M(RemoteFSBuffers) \ +\ + M(ThreadpoolReaderTaskMicroseconds) \ + M(ThreadpoolReaderPrepareMicroseconds) \ + M(ThreadpoolReaderReadBytes) \ + M(ThreadpoolReaderSubmit) \ + M(ThreadpoolReaderSubmitReadSynchronously) \ + M(ThreadpoolReaderSubmitReadSynchronouslyBytes) \ + M(ThreadpoolReaderSubmitReadSynchronouslyMicroseconds) \ + M(ThreadpoolReaderSubmitLookupInCacheMicroseconds) \ + M(AsynchronousReaderIgnoredBytes) \ +\ + M(FileSegmentWaitReadBufferMicroseconds) \ + M(FileSegmentReadMicroseconds) \ + M(FileSegmentCacheWriteMicroseconds) \ + M(FileSegmentPredownloadMicroseconds) \ + M(FileSegmentUsedBytes) \ +\ + M(ReadBufferSeekCancelConnection) \ +\ + M(SleepFunctionCalls) \ + M(SleepFunctionMicroseconds) \ + M(SleepFunctionElapsedMicroseconds) \ +\ + M(ThreadPoolReaderPageCacheHit) \ + M(ThreadPoolReaderPageCacheHitBytes) \ + M(ThreadPoolReaderPageCacheHitElapsedMicroseconds) \ + M(ThreadPoolReaderPageCacheMiss) \ + M(ThreadPoolReaderPageCacheMissBytes) \ + M(ThreadPoolReaderPageCacheMissElapsedMicroseconds) \ +\ + M(AsynchronousReadWaitMicroseconds) \ + M(SynchronousReadWaitMicroseconds) \ + M(AsynchronousRemoteReadWaitMicroseconds) \ + M(SynchronousRemoteReadWaitMicroseconds) \ +\ + M(ExternalDataSourceLocalCacheReadBytes) \ +\ + M(MainConfigLoads) \ +\ + M(KeeperPacketsSent) \ + M(KeeperPacketsReceived) \ + M(KeeperRequestTotal) \ + M(KeeperLatency) \ + M(KeeperCommits) \ + M(KeeperCommitsFailed) \ + M(KeeperSnapshotCreations) \ + M(KeeperSnapshotCreationsFailed) \ + M(KeeperSnapshotApplys) \ + M(KeeperSnapshotApplysFailed) \ + M(KeeperReadSnapshot) \ + M(KeeperSaveSnapshot) \ + M(KeeperCreateRequest) \ + M(KeeperRemoveRequest) \ + M(KeeperSetRequest) \ + M(KeeperReconfigRequest) \ + M(KeeperCheckRequest) \ + 
M(KeeperMultiRequest) \ + M(KeeperMultiReadRequest) \ + M(KeeperGetRequest) \ + M(KeeperListRequest) \ + M(KeeperExistsRequest) \ +\ + M(IOUringSQEsSubmitted) \ + M(IOUringSQEsResubmits) \ + M(IOUringCQEsCompleted) \ + M(IOUringCQEsFailed) \ +\ + M(LogTest) \ + M(LogTrace) \ + M(LogDebug) \ + M(LogInfo) \ + M(LogWarning) \ + M(LogError) \ + M(LogFatal) \ +\ + M(InterfaceHTTPSendBytes) \ + M(InterfaceHTTPReceiveBytes) \ + M(InterfaceNativeSendBytes) \ + M(InterfaceNativeReceiveBytes) \ + M(InterfacePrometheusSendBytes) \ + M(InterfacePrometheusReceiveBytes) \ + M(InterfaceInterserverSendBytes) \ + M(InterfaceInterserverReceiveBytes) \ + M(InterfaceMySQLSendBytes) \ + M(InterfaceMySQLReceiveBytes) \ + M(InterfacePostgreSQLSendBytes) \ + M(InterfacePostgreSQLReceiveBytes) + +namespace ProfileEvents +{ +#define M(NAME) extern const Event NAME; + APPLY_FOR_KEEPER_PROFILE_EVENTS(M) +#undef M + +#define M(NAME) NAME, +extern const std::vector keeper_profile_events +{ + APPLY_FOR_KEEPER_PROFILE_EVENTS(M) +}; +#undef M +} + +/// Metrics which are useful for Keeper. +/// New metrics should be added manually. +#define APPLY_FOR_KEEPER_METRICS(M) \ + M(BackgroundCommonPoolTask) \ + M(BackgroundCommonPoolSize) \ + M(TCPConnection) \ + M(HTTPConnection) \ + M(OpenFileForRead) \ + M(OpenFileForWrite) \ + M(Read) \ + M(RemoteRead) \ + M(Write) \ + M(NetworkReceive) \ + M(NetworkSend) \ + M(MemoryTracking) \ + M(ContextLockWait) \ + M(Revision) \ + M(VersionInteger) \ + M(RWLockWaitingReaders) \ + M(RWLockWaitingWriters) \ + M(RWLockActiveReaders) \ + M(RWLockActiveWriters) \ + M(GlobalThread) \ + M(GlobalThreadActive) \ + M(GlobalThreadScheduled) \ + M(LocalThread) \ + M(LocalThreadActive) \ + M(LocalThreadScheduled) \ + M(IOPrefetchThreads) \ + M(IOPrefetchThreadsActive) \ + M(IOPrefetchThreadsScheduled) \ + M(IOWriterThreads) \ + M(IOWriterThreadsActive) \ + M(IOWriterThreadsScheduled) \ + M(IOThreads) \ + M(IOThreadsActive) \ + M(IOThreadsScheduled) \ + M(ThreadPoolRemoteFSReaderThreads) \ + M(ThreadPoolRemoteFSReaderThreadsActive) \ + M(ThreadPoolRemoteFSReaderThreadsScheduled) \ + M(ThreadPoolFSReaderThreads) \ + M(ThreadPoolFSReaderThreadsActive) \ + M(ThreadPoolFSReaderThreadsScheduled) \ + M(DiskObjectStorageAsyncThreads) \ + M(DiskObjectStorageAsyncThreadsActive) \ + M(ObjectStorageS3Threads) \ + M(ObjectStorageS3ThreadsActive) \ + M(ObjectStorageS3ThreadsScheduled) \ + M(ObjectStorageAzureThreads) \ + M(ObjectStorageAzureThreadsActive) \ + M(ObjectStorageAzureThreadsScheduled) \ + M(MMappedFiles) \ + M(MMappedFileBytes) \ + M(AsynchronousReadWait) \ + M(S3Requests) \ + M(KeeperAliveConnections) \ + M(KeeperOutstandingRequets) \ + M(ThreadsInOvercommitTracker) \ + M(IOUringPendingEvents) \ + M(IOUringInFlightEvents) \ + +namespace CurrentMetrics +{ +#define M(NAME) extern const Metric NAME; + APPLY_FOR_KEEPER_METRICS(M) +#undef M + +#define M(NAME) NAME, +extern const std::vector keeper_metrics +{ + APPLY_FOR_KEEPER_METRICS(M) +}; +#undef M +} diff --git a/src/Coordination/KeeperContext.cpp b/src/Coordination/KeeperContext.cpp index baad8d98e6a..b06e321aeec 100644 --- a/src/Coordination/KeeperContext.cpp +++ b/src/Coordination/KeeperContext.cpp @@ -1,13 +1,16 @@ #include #include -#include -#include -#include -#include #include -#include +#include #include +#include +#include +#include +#include +#include +#include + #include namespace DB @@ -20,9 +23,10 @@ extern const int BAD_ARGUMENTS; } -KeeperContext::KeeperContext(bool standalone_keeper_) +KeeperContext::KeeperContext(bool 
standalone_keeper_, CoordinationSettingsPtr coordination_settings_) : disk_selector(std::make_shared()) , standalone_keeper(standalone_keeper_) + , coordination_settings(std::move(coordination_settings_)) { /// enable by default some feature flags feature_flags.enableFeatureFlag(KeeperFeatureFlag::FILTERED_LIST); @@ -37,26 +41,11 @@ void KeeperContext::initialize(const Poco::Util::AbstractConfiguration & config, { dispatcher = dispatcher_; - if (config.hasProperty("keeper_server.availability_zone")) + const auto keeper_az = PlacementInfo::PlacementInfo::instance().getAvailabilityZone(); + if (!keeper_az.empty()) { - auto keeper_az = config.getString("keeper_server.availability_zone.value", ""); - const auto auto_detect_for_cloud = config.getBool("keeper_server.availability_zone.enable_auto_detection_on_cloud", false); - if (keeper_az.empty() && auto_detect_for_cloud) - { - try - { - keeper_az = DB::S3::getRunningAvailabilityZone(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } - } - if (!keeper_az.empty()) - { - system_nodes_with_data[keeper_availability_zone_path] = keeper_az; - LOG_INFO(getLogger("KeeperContext"), "Initialize the KeeperContext with availability zone: '{}'", keeper_az); - } + system_nodes_with_data[keeper_availability_zone_path] = keeper_az; + LOG_INFO(getLogger("KeeperContext"), "Initialize the KeeperContext with availability zone: '{}'", keeper_az); } updateKeeperMemorySoftLimit(config); @@ -416,4 +405,9 @@ void KeeperContext::waitLocalLogsPreprocessedOrShutdown() local_logs_preprocessed_cv.wait(lock, [this]{ return shutdown_called || local_logs_preprocessed; }); } +const CoordinationSettingsPtr & KeeperContext::getCoordinationSettings() const +{ + return coordination_settings; +} + } diff --git a/src/Coordination/KeeperContext.h b/src/Coordination/KeeperContext.h index 891bef00446..a5cc2db49a2 100644 --- a/src/Coordination/KeeperContext.h +++ b/src/Coordination/KeeperContext.h @@ -1,7 +1,5 @@ #pragma once #include -#include -#include #include #include #include @@ -12,10 +10,19 @@ namespace DB class KeeperDispatcher; +struct CoordinationSettings; +using CoordinationSettingsPtr = std::shared_ptr; + +class DiskSelector; +class IDisk; +using DiskPtr = std::shared_ptr; + +class WriteBufferFromOwnString; + class KeeperContext { public: - explicit KeeperContext(bool standalone_keeper_); + KeeperContext(bool standalone_keeper_, CoordinationSettingsPtr coordination_settings_); enum class Phase : uint8_t { @@ -68,6 +75,8 @@ public: void waitLocalLogsPreprocessedOrShutdown(); + const CoordinationSettingsPtr & getCoordinationSettings() const; + private: /// local disk defined using path or disk name using Storage = std::variant; @@ -89,7 +98,7 @@ private: std::atomic local_logs_preprocessed = false; std::atomic shutdown_called = false; - Phase server_state{Phase::INIT}; + std::atomic server_state{Phase::INIT}; bool ignore_system_path_on_startup{false}; bool digest_enabled{true}; @@ -113,6 +122,8 @@ private: KeeperDispatcher * dispatcher{nullptr}; std::atomic memory_soft_limit = 0; + + CoordinationSettingsPtr coordination_settings; }; using KeeperContextPtr = std::shared_ptr; diff --git a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index 35bc953a705..4bd10352d3e 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ b/src/Coordination/KeeperDispatcher.cpp @@ -414,8 +414,8 @@ void KeeperDispatcher::initialize(const Poco::Util::AbstractConfiguration & conf { LOG_DEBUG(log, "Initializing storage dispatcher"); - 
keeper_context = std::make_shared(standalone_keeper); configuration_and_settings = KeeperConfigurationAndSettings::loadFromConfig(config, standalone_keeper); + keeper_context = std::make_shared(standalone_keeper, configuration_and_settings->coordination_settings); keeper_context->initialize(config, this); diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 722b1303cc8..e3fd14c0e1a 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -119,20 +119,18 @@ KeeperServer::KeeperServer( KeeperSnapshotManagerS3 & snapshot_manager_s3, KeeperStateMachine::CommitCallback commit_callback) : server_id(configuration_and_settings_->server_id) - , coordination_settings(configuration_and_settings_->coordination_settings) , log(getLogger("KeeperServer")) , is_recovering(config.getBool("keeper_server.force_recovery", false)) , keeper_context{std::move(keeper_context_)} , create_snapshot_on_exit(config.getBool("keeper_server.create_snapshot_on_exit", true)) , enable_reconfiguration(config.getBool("keeper_server.enable_reconfiguration", false)) { - if (coordination_settings->quorum_reads) + if (keeper_context->getCoordinationSettings()->quorum_reads) LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower."); state_machine = nuraft::cs_new( responses_queue_, snapshots_queue_, - coordination_settings, keeper_context, config.getBool("keeper_server.upload_snapshot_on_exit", true) ? &snapshot_manager_s3 : nullptr, commit_callback, @@ -143,7 +141,6 @@ KeeperServer::KeeperServer( "keeper_server", "state", config, - coordination_settings, keeper_context); } @@ -226,7 +223,7 @@ void KeeperServer::loadLatestConfig() { auto latest_snapshot_config = state_machine->getClusterConfig(); auto latest_log_store_config = state_manager->getLatestConfigFromLogStore(); - auto async_replication = coordination_settings->async_replication; + auto async_replication = keeper_context->getCoordinationSettings()->async_replication; if (latest_snapshot_config && latest_log_store_config) { @@ -293,6 +290,8 @@ void KeeperServer::forceRecovery() void KeeperServer::launchRaftServer(const Poco::Util::AbstractConfiguration & config, bool enable_ipv6) { + const auto & coordination_settings = keeper_context->getCoordinationSettings(); + nuraft::raft_params params; params.parallel_log_appending_ = true; params.heart_beat_interval_ @@ -427,6 +426,7 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo { state_machine->init(); + const auto & coordination_settings = keeper_context->getCoordinationSettings(); state_manager->loadLogStore(state_machine->last_commit_index() + 1, coordination_settings->reserved_log_items); auto log_store = state_manager->load_log_store(); @@ -446,7 +446,7 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo void KeeperServer::shutdownRaftServer() { - size_t timeout = coordination_settings->shutdown_timeout.totalSeconds(); + size_t timeout = keeper_context->getCoordinationSettings()->shutdown_timeout.totalSeconds(); if (!raft_instance) { @@ -870,7 +870,7 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ /// Node first became leader, and after that some other node became leader. /// BecameFresh for this node will not be called because it was already fresh /// when it was leader. 
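/// A minimal sketch of the refactoring pattern applied in KeeperServer above, assuming
/// simplified placeholder types: instead of every component caching its own
/// CoordinationSettingsPtr member, the shared settings object is stored once in
/// KeeperContext and read on demand through getCoordinationSettings(). ServerSketch and
/// the field values are illustrative stand-ins, not the real ClickHouse classes.
#include <cstddef>
#include <memory>

struct CoordinationSettings
{
    size_t fresh_log_gap = 200;   /// placeholder value
};
using CoordinationSettingsPtr = std::shared_ptr<CoordinationSettings>;

class KeeperContext
{
public:
    explicit KeeperContext(CoordinationSettingsPtr coordination_settings_)
        : coordination_settings(std::move(coordination_settings_)) {}

    const CoordinationSettingsPtr & getCoordinationSettings() const { return coordination_settings; }

private:
    CoordinationSettingsPtr coordination_settings;
};
using KeeperContextPtr = std::shared_ptr<KeeperContext>;

class ServerSketch
{
public:
    explicit ServerSketch(KeeperContextPtr keeper_context_) : keeper_context(std::move(keeper_context_)) {}

    bool isFresh(size_t leader_index, size_t our_index) const
    {
        /// Settings are fetched through the context at the point of use,
        /// so there is a single source of truth and no stale member copy.
        return leader_index < our_index + keeper_context->getCoordinationSettings()->fresh_log_gap;
    }

private:
    KeeperContextPtr keeper_context;
};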
- if (leader_index < our_index + coordination_settings->fresh_log_gap) + if (leader_index < our_index + keeper_context->getCoordinationSettings()->fresh_log_gap) set_initialized(); } return nuraft::cb_func::ReturnCode::Ok; @@ -905,7 +905,7 @@ void KeeperServer::waitInit() { std::unique_lock lock(initialized_mutex); - int64_t timeout = coordination_settings->startup_timeout.totalMilliseconds(); + int64_t timeout = keeper_context->getCoordinationSettings()->startup_timeout.totalMilliseconds(); if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag.load(); })) LOG_WARNING(log, "Failed to wait for RAFT initialization in {}ms, will continue in background", timeout); } @@ -977,6 +977,7 @@ KeeperServer::ConfigUpdateState KeeperServer::applyConfigUpdate( ClusterUpdateActions KeeperServer::getRaftConfigurationDiff(const Poco::Util::AbstractConfiguration & config) { + const auto & coordination_settings = keeper_context->getCoordinationSettings(); auto diff = state_manager->getRaftConfigurationDiff(config, coordination_settings); if (!diff.empty()) @@ -1004,6 +1005,7 @@ void KeeperServer::applyConfigUpdateWithReconfigDisabled(const ClusterUpdateActi std::this_thread::sleep_for(sleep_time * (i + 1)); }; + const auto & coordination_settings = keeper_context->getCoordinationSettings(); if (const auto * add = std::get_if(&action)) { for (size_t i = 0; i < coordination_settings->configuration_change_tries_count && !is_recovering; ++i) @@ -1059,6 +1061,7 @@ bool KeeperServer::waitForConfigUpdateWithReconfigDisabled(const ClusterUpdateAc auto became_leader = [&] { LOG_INFO(log, "Became leader, aborting"); return false; }; auto backoff = [&](size_t i) { std::this_thread::sleep_for(sleep_time * (i + 1)); }; + const auto & coordination_settings = keeper_context->getCoordinationSettings(); if (const auto* add = std::get_if(&action)) { for (size_t i = 0; i < coordination_settings->configuration_change_tries_count && !is_recovering; ++i) diff --git a/src/Coordination/KeeperServer.h b/src/Coordination/KeeperServer.h index ef298df3efc..dd54539a92b 100644 --- a/src/Coordination/KeeperServer.h +++ b/src/Coordination/KeeperServer.h @@ -22,8 +22,6 @@ class KeeperServer private: const int server_id; - CoordinationSettingsPtr coordination_settings; - nuraft::ptr state_machine; nuraft::ptr state_manager; diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index 091571b4a1a..4ae39487e0b 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -13,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -32,23 +33,15 @@ namespace ErrorCodes namespace { - constexpr std::string_view tmp_prefix = "tmp_"; - - void moveFileBetweenDisks(DiskPtr disk_from, const std::string & path_from, DiskPtr disk_to, const std::string & path_to) + void moveSnapshotFileBetweenDisks( + DiskPtr disk_from, + const std::string & path_from, + DiskPtr disk_to, + const std::string & path_to, + const KeeperContextPtr & keeper_context) { - /// we use empty file with prefix tmp_ to detect incomplete copies - /// if a copy is complete we don't care from which disk we use the same file - /// so it's okay if a failure happens after removing of tmp file but before we remove - /// the snapshot from the source disk - auto from_path = fs::path(path_from); - auto tmp_snapshot_name = from_path.parent_path() / 
(std::string{tmp_prefix} + from_path.filename().string()); - { - auto buf = disk_to->writeFile(tmp_snapshot_name); - buf->finalize(); - } - disk_from->copyFile(from_path, *disk_to, path_to, {}); - disk_to->removeFile(tmp_snapshot_name); - disk_from->removeFile(path_from); + moveFileBetweenDisks( + std::move(disk_from), path_from, std::move(disk_to), path_to, getLogger("KeeperSnapshotManager"), keeper_context); } uint64_t getSnapshotPathUpToLogIdx(const String & snapshot_path) @@ -582,9 +575,9 @@ KeeperSnapshotManager::KeeperSnapshotManager( std::vector snapshot_files; for (auto it = disk->iterateDirectory(""); it->isValid(); it->next()) { - if (it->name().starts_with(tmp_prefix)) + if (it->name().starts_with(tmp_keeper_file_prefix)) { - incomplete_files.emplace(it->name().substr(tmp_prefix.size()), it->path()); + incomplete_files.emplace(it->name().substr(tmp_keeper_file_prefix.size()), it->path()); continue; } @@ -774,7 +767,7 @@ void KeeperSnapshotManager::moveSnapshotsIfNeeded() { if (file_info.disk != latest_snapshot_disk) { - moveFileBetweenDisks(file_info.disk, file_info.path, latest_snapshot_disk, file_info.path); + moveSnapshotFileBetweenDisks(file_info.disk, file_info.path, latest_snapshot_disk, file_info.path, keeper_context); file_info.disk = latest_snapshot_disk; } } @@ -782,7 +775,7 @@ void KeeperSnapshotManager::moveSnapshotsIfNeeded() { if (file_info.disk != disk) { - moveFileBetweenDisks(file_info.disk, file_info.path, disk, file_info.path); + moveSnapshotFileBetweenDisks(file_info.disk, file_info.path, disk, file_info.path, keeper_context); file_info.disk = disk; } } diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index c82f8301eff..f83a49833a7 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -42,22 +42,20 @@ namespace ErrorCodes KeeperStateMachine::KeeperStateMachine( ResponsesQueue & responses_queue_, SnapshotsQueue & snapshots_queue_, - const CoordinationSettingsPtr & coordination_settings_, const KeeperContextPtr & keeper_context_, KeeperSnapshotManagerS3 * snapshot_manager_s3_, CommitCallback commit_callback_, const std::string & superdigest_) : commit_callback(commit_callback_) - , coordination_settings(coordination_settings_) , snapshot_manager( - coordination_settings->snapshots_to_keep, + keeper_context_->getCoordinationSettings()->snapshots_to_keep, keeper_context_, - coordination_settings->compress_snapshots_with_zstd_format, + keeper_context_->getCoordinationSettings()->compress_snapshots_with_zstd_format, superdigest_, - coordination_settings->dead_session_check_period_ms.totalMilliseconds()) + keeper_context_->getCoordinationSettings()->dead_session_check_period_ms.totalMilliseconds()) , responses_queue(responses_queue_) , snapshots_queue(snapshots_queue_) - , min_request_size_to_cache(coordination_settings_->min_request_size_for_cache) + , min_request_size_to_cache(keeper_context_->getCoordinationSettings()->min_request_size_for_cache) , last_committed_idx(0) , log(getLogger("KeeperStateMachine")) , superdigest(superdigest_) @@ -129,7 +127,7 @@ void KeeperStateMachine::init() if (!storage) storage = std::make_unique( - coordination_settings->dead_session_check_period_ms.totalMilliseconds(), superdigest, keeper_context); + keeper_context->getCoordinationSettings()->dead_session_check_period_ms.totalMilliseconds(), superdigest, keeper_context); } namespace diff --git a/src/Coordination/KeeperStateMachine.h b/src/Coordination/KeeperStateMachine.h index 
b11cd53c00e..f0a565aed8a 100644 --- a/src/Coordination/KeeperStateMachine.h +++ b/src/Coordination/KeeperStateMachine.h @@ -25,7 +25,6 @@ public: KeeperStateMachine( ResponsesQueue & responses_queue_, SnapshotsQueue & snapshots_queue_, - const CoordinationSettingsPtr & coordination_settings_, const KeeperContextPtr & keeper_context_, KeeperSnapshotManagerS3 * snapshot_manager_s3_, CommitCallback commit_callback_ = {}, @@ -139,8 +138,6 @@ private: SnapshotFileInfo latest_snapshot_info; nuraft::ptr latest_snapshot_buf = nullptr; - CoordinationSettingsPtr coordination_settings; - /// Main state machine logic KeeperStoragePtr storage TSA_PT_GUARDED_BY(storage_and_responses_lock); diff --git a/src/Coordination/KeeperStateManager.cpp b/src/Coordination/KeeperStateManager.cpp index 4fbb9b52e6e..3f9c7aa3e44 100644 --- a/src/Coordination/KeeperStateManager.cpp +++ b/src/Coordination/KeeperStateManager.cpp @@ -241,23 +241,20 @@ KeeperStateManager::KeeperStateManager( const std::string & config_prefix_, const std::string & server_state_file_name_, const Poco::Util::AbstractConfiguration & config, - const CoordinationSettingsPtr & coordination_settings, KeeperContextPtr keeper_context_) : my_server_id(my_server_id_) , secure(config.getBool(config_prefix_ + ".raft_configuration.secure", false)) , config_prefix(config_prefix_) - , configuration_wrapper(parseServersConfiguration(config, false, coordination_settings->async_replication)) + , configuration_wrapper(parseServersConfiguration(config, false, keeper_context_->getCoordinationSettings()->async_replication)) , log_store(nuraft::cs_new( - LogFileSettings - { - .force_sync = coordination_settings->force_sync, - .compress_logs = coordination_settings->compress_logs, - .rotate_interval = coordination_settings->rotate_log_storage_interval, - .max_size = coordination_settings->max_log_file_size, - .overallocate_size = coordination_settings->log_file_overallocate_size}, - FlushSettings - { - .max_flush_batch_size = coordination_settings->max_flush_batch_size, + LogFileSettings{ + .force_sync = keeper_context_->getCoordinationSettings()->force_sync, + .compress_logs = keeper_context_->getCoordinationSettings()->compress_logs, + .rotate_interval = keeper_context_->getCoordinationSettings()->rotate_log_storage_interval, + .max_size = keeper_context_->getCoordinationSettings()->max_log_file_size, + .overallocate_size = keeper_context_->getCoordinationSettings()->log_file_overallocate_size}, + FlushSettings{ + .max_flush_batch_size = keeper_context_->getCoordinationSettings()->max_flush_batch_size, }, keeper_context_)) , server_state_file_name(server_state_file_name_) diff --git a/src/Coordination/KeeperStateManager.h b/src/Coordination/KeeperStateManager.h index 02dd6b2ff53..60f6dbe7b62 100644 --- a/src/Coordination/KeeperStateManager.h +++ b/src/Coordination/KeeperStateManager.h @@ -23,7 +23,6 @@ public: const std::string & config_prefix_, const std::string & server_state_file_name_, const Poco::Util::AbstractConfiguration & config, - const CoordinationSettingsPtr & coordination_settings, KeeperContextPtr keeper_context_); /// Constructor for tests diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index c62ed90e378..d3101543362 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include @@ -26,7 +26,6 @@ #include #include -#include namespace ProfileEvents { diff --git a/src/Coordination/ZooKeeperDataReader.cpp 
b/src/Coordination/ZooKeeperDataReader.cpp index c7b1abf1d83..c205db942b9 100644 --- a/src/Coordination/ZooKeeperDataReader.cpp +++ b/src/Coordination/ZooKeeperDataReader.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include namespace DB diff --git a/src/Coordination/pathUtils.cpp b/src/Coordination/pathUtils.cpp deleted file mode 100644 index 25f8e25cf06..00000000000 --- a/src/Coordination/pathUtils.cpp +++ /dev/null @@ -1,37 +0,0 @@ -#include - -namespace DB -{ - -static size_t findLastSlash(StringRef path) -{ - if (path.size == 0) - return std::string::npos; - - for (size_t i = path.size - 1; i > 0; --i) - { - if (path.data[i] == '/') - return i; - } - - if (path.data[0] == '/') - return 0; - - return std::string::npos; -} - -StringRef parentNodePath(StringRef path) -{ - auto rslash_pos = findLastSlash(path); - if (rslash_pos > 0) - return StringRef{path.data, rslash_pos}; - return "/"; -} - -StringRef getBaseNodeName(StringRef path) -{ - size_t basename_start = findLastSlash(path); - return StringRef{path.data + basename_start + 1, path.size - basename_start - 1}; -} - -} diff --git a/src/Coordination/pathUtils.h b/src/Coordination/pathUtils.h deleted file mode 100644 index b2b79b14110..00000000000 --- a/src/Coordination/pathUtils.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include -#include - -namespace DB -{ - -StringRef parentNodePath(StringRef path); - -StringRef getBaseNodeName(StringRef path); - -} diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index bd9dc4c3fd3..763804ba389 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -1,8 +1,6 @@ #include #include -#include "Common/ZooKeeper/IKeeper.h" -#include "Core/Defines.h" #include "config.h" #if USE_NURAFT @@ -22,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -65,7 +63,7 @@ struct CompressionParam class CoordinationTest : public ::testing::TestWithParam { protected: - DB::KeeperContextPtr keeper_context = std::make_shared(true); + DB::KeeperContextPtr keeper_context = std::make_shared(true, std::make_shared()); LoggerPtr log{getLogger("CoordinationTest")}; void SetUp() override @@ -1758,7 +1756,6 @@ getLogEntryFromZKRequest(size_t term, int64_t session_id, int64_t zxid, const Co } void testLogAndStateMachine( - Coordination::CoordinationSettingsPtr settings, uint64_t total_logs, bool enable_compression, Coordination::KeeperContextPtr keeper_context) @@ -1766,6 +1763,7 @@ void testLogAndStateMachine( using namespace Coordination; using namespace DB; + const auto & settings = keeper_context->getCoordinationSettings(); ChangelogDirTest snapshots("./snapshots"); keeper_context->setSnapshotDisk(std::make_shared("SnapshotDisk", "./snapshots")); ChangelogDirTest logs("./logs"); @@ -1773,7 +1771,7 @@ void testLogAndStateMachine( ResponsesQueue queue(std::numeric_limits::max()); SnapshotsQueue snapshots_queue{1}; - auto state_machine = std::make_shared(queue, snapshots_queue, settings, keeper_context, nullptr); + auto state_machine = std::make_shared(queue, snapshots_queue, keeper_context, nullptr); state_machine->init(); DB::KeeperLogStore changelog( DB::LogFileSettings{ @@ -1816,7 +1814,7 @@ void testLogAndStateMachine( } SnapshotsQueue snapshots_queue1{1}; - auto restore_machine = std::make_shared(queue, snapshots_queue1, settings, keeper_context, nullptr); + auto restore_machine = std::make_shared(queue, snapshots_queue1, keeper_context, nullptr); 
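/// The helpers deleted from src/Coordination/pathUtils.cpp above keep their behaviour at
/// their new location (outside this hunk); the self-contained std::string_view rendition
/// below is for reference only and assumes ZooKeeper-style absolute paths that start
/// with '/'.
#include <iostream>
#include <string_view>

static size_t findLastSlash(std::string_view path)
{
    if (path.empty())
        return std::string_view::npos;

    for (size_t i = path.size() - 1; i > 0; --i)
        if (path[i] == '/')
            return i;

    return path[0] == '/' ? 0 : std::string_view::npos;
}

std::string_view parentNodePath(std::string_view path)
{
    auto rslash_pos = findLastSlash(path);
    if (rslash_pos != std::string_view::npos && rslash_pos > 0)
        return path.substr(0, rslash_pos);
    return "/";
}

std::string_view getBaseNodeName(std::string_view path)
{
    /// For "/a/b/c" the basename starts right after the last slash.
    return path.substr(findLastSlash(path) + 1);
}

int main()
{
    std::cout << parentNodePath("/a/b/c") << '\n';   // prints "/a/b"
    std::cout << parentNodePath("/a") << '\n';       // prints "/"
    std::cout << getBaseNodeName("/a/b/c") << '\n';  // prints "c"
}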
restore_machine->init(); EXPECT_EQ(restore_machine->last_commit_index(), total_logs - total_logs % settings->snapshot_distance); @@ -1863,63 +1861,72 @@ TEST_P(CoordinationTest, TestStateMachineAndLogStore) settings->snapshot_distance = 10; settings->reserved_log_items = 10; settings->rotate_log_storage_interval = 10; - testLogAndStateMachine(settings, 37, params.enable_compression, keeper_context); + auto local_keeper_context = std::make_shared(true, settings); + testLogAndStateMachine(37, params.enable_compression, local_keeper_context); } { CoordinationSettingsPtr settings = std::make_shared(); settings->snapshot_distance = 10; settings->reserved_log_items = 10; settings->rotate_log_storage_interval = 10; - testLogAndStateMachine(settings, 11, params.enable_compression, keeper_context); + auto local_keeper_context = std::make_shared(true, settings); + testLogAndStateMachine(11, params.enable_compression, local_keeper_context); } { CoordinationSettingsPtr settings = std::make_shared(); settings->snapshot_distance = 10; settings->reserved_log_items = 10; settings->rotate_log_storage_interval = 10; - testLogAndStateMachine(settings, 40, params.enable_compression, keeper_context); + auto local_keeper_context = std::make_shared(true, settings); + testLogAndStateMachine(40, params.enable_compression, local_keeper_context); } { CoordinationSettingsPtr settings = std::make_shared(); settings->snapshot_distance = 10; settings->reserved_log_items = 20; settings->rotate_log_storage_interval = 30; - testLogAndStateMachine(settings, 40, params.enable_compression, keeper_context); + auto local_keeper_context = std::make_shared(true, settings); + testLogAndStateMachine(40, params.enable_compression, local_keeper_context); } { CoordinationSettingsPtr settings = std::make_shared(); settings->snapshot_distance = 10; settings->reserved_log_items = 0; settings->rotate_log_storage_interval = 10; - testLogAndStateMachine(settings, 40, params.enable_compression, keeper_context); + auto local_keeper_context = std::make_shared(true, settings); + testLogAndStateMachine(40, params.enable_compression, local_keeper_context); } { CoordinationSettingsPtr settings = std::make_shared(); settings->snapshot_distance = 1; settings->reserved_log_items = 1; settings->rotate_log_storage_interval = 32; - testLogAndStateMachine(settings, 32, params.enable_compression, keeper_context); + auto local_keeper_context = std::make_shared(true, settings); + testLogAndStateMachine(32, params.enable_compression, local_keeper_context); } { CoordinationSettingsPtr settings = std::make_shared(); settings->snapshot_distance = 10; settings->reserved_log_items = 7; settings->rotate_log_storage_interval = 1; - testLogAndStateMachine(settings, 33, params.enable_compression, keeper_context); + auto local_keeper_context = std::make_shared(true, settings); + testLogAndStateMachine(33, params.enable_compression, local_keeper_context); } { CoordinationSettingsPtr settings = std::make_shared(); settings->snapshot_distance = 37; settings->reserved_log_items = 1000; settings->rotate_log_storage_interval = 5000; - testLogAndStateMachine(settings, 33, params.enable_compression, keeper_context); + auto local_keeper_context = std::make_shared(true, settings); + testLogAndStateMachine(33, params.enable_compression, local_keeper_context); } { CoordinationSettingsPtr settings = std::make_shared(); settings->snapshot_distance = 37; settings->reserved_log_items = 1000; settings->rotate_log_storage_interval = 5000; - testLogAndStateMachine(settings, 45, 
params.enable_compression, keeper_context); + auto local_keeper_context = std::make_shared(true, settings); + testLogAndStateMachine(45, params.enable_compression, local_keeper_context); } } @@ -1931,11 +1938,10 @@ TEST_P(CoordinationTest, TestEphemeralNodeRemove) ChangelogDirTest snapshots("./snapshots"); setSnapshotDirectory("./snapshots"); - CoordinationSettingsPtr settings = std::make_shared(); - ResponsesQueue queue(std::numeric_limits::max()); SnapshotsQueue snapshots_queue{1}; - auto state_machine = std::make_shared(queue, snapshots_queue, settings, keeper_context, nullptr); + + auto state_machine = std::make_shared(queue, snapshots_queue, keeper_context, nullptr); state_machine->init(); std::shared_ptr request_c = std::make_shared(); @@ -1965,11 +1971,10 @@ TEST_P(CoordinationTest, TestCreateNodeWithAuthSchemeForAclWhenAuthIsPrecommitte ChangelogDirTest snapshots("./snapshots"); setSnapshotDirectory("./snapshots"); - CoordinationSettingsPtr settings = std::make_shared(); ResponsesQueue queue(std::numeric_limits::max()); SnapshotsQueue snapshots_queue{1}; - auto state_machine = std::make_shared(queue, snapshots_queue, settings, keeper_context, nullptr); + auto state_machine = std::make_shared(queue, snapshots_queue, keeper_context, nullptr); state_machine->init(); String user_auth_data = "test_user:test_password"; @@ -2017,11 +2022,10 @@ TEST_P(CoordinationTest, TestSetACLWithAuthSchemeForAclWhenAuthIsPrecommitted) ChangelogDirTest snapshots("./snapshots"); setSnapshotDirectory("./snapshots"); - CoordinationSettingsPtr settings = std::make_shared(); ResponsesQueue queue(std::numeric_limits::max()); SnapshotsQueue snapshots_queue{1}; - auto state_machine = std::make_shared(queue, snapshots_queue, settings, keeper_context, nullptr); + auto state_machine = std::make_shared(queue, snapshots_queue, keeper_context, nullptr); state_machine->init(); String user_auth_data = "test_user:test_password"; diff --git a/src/Core/AccurateComparison.h b/src/Core/AccurateComparison.h index a201c136e3a..139ee4d88dc 100644 --- a/src/Core/AccurateComparison.h +++ b/src/Core/AccurateComparison.h @@ -152,7 +152,7 @@ bool notEqualsOp(A a, B b) } /// Converts numeric to an equal numeric of other type. -/// When `strict` is `true` check that result exactly same as input, otherwise just check overflow +/// When `strict` is `true` check that result exactly the same as input, otherwise just check overflow template inline bool NO_SANITIZE_UNDEFINED convertNumeric(From value, To & result) { diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 18a5f0dddf2..433195af9c3 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -834,7 +834,7 @@ class IColumn; M(UInt64, insert_keeper_retry_max_backoff_ms, 10000, "Max backoff timeout for keeper operations during insert", 0) \ M(Float, insert_keeper_fault_injection_probability, 0.0f, "Approximate probability of failure for a keeper request during insert. Valid value is in interval [0.0f, 1.0f]", 0) \ M(UInt64, insert_keeper_fault_injection_seed, 0, "0 - random seed, otherwise the setting value", 0) \ - M(Bool, force_aggregation_in_order, false, "Force use of aggregation in order on remote nodes during distributed aggregation. PLEASE, NEVER CHANGE THIS SETTING VALUE MANUALLY!", IMPORTANT) \ + M(Bool, force_aggregation_in_order, false, "The setting is used by the server itself to support distributed queries. Do not change it manually, because it will break normal operations. 
(Forces use of aggregation in order on remote nodes during distributed aggregation).", IMPORTANT) \ M(UInt64, http_max_request_param_data_size, 10_MiB, "Limit on size of request data used as a query parameter in predefined HTTP requests.", 0) \ M(Bool, function_json_value_return_type_allow_nullable, false, "Allow function JSON_VALUE to return nullable type.", 0) \ M(Bool, function_json_value_return_type_allow_complex, false, "Allow function JSON_VALUE to return complex type, such as: struct, array, map.", 0) \ @@ -872,7 +872,6 @@ class IColumn; M(UInt64, cache_warmer_threads, 4, "Only available in ClickHouse Cloud", 0) \ M(Int64, ignore_cold_parts_seconds, 0, "Only available in ClickHouse Cloud", 0) \ M(Int64, prefer_warmed_unmerged_parts_seconds, 0, "Only available in ClickHouse Cloud", 0) \ - M(Bool, enable_order_by_all, true, "Enable sorting expression ORDER BY ALL.", 0) \ M(Bool, iceberg_engine_ignore_schema_evolution, false, "Ignore schema evolution in Iceberg table engine and read all data using latest schema saved on table creation. Note that it can lead to incorrect result", 0) \ // End of COMMON_SETTINGS @@ -940,6 +939,7 @@ class IColumn; MAKE_OBSOLETE(M, Bool, allow_experimental_undrop_table_query, true) \ MAKE_OBSOLETE(M, Bool, allow_experimental_s3queue, true) \ MAKE_OBSOLETE(M, Bool, query_plan_optimize_primary_key, true) \ + MAKE_OBSOLETE(M, Bool, enable_order_by_all, true) \ /** The section above is for obsolete settings. Do not add anything there. */ @@ -1019,6 +1019,7 @@ class IColumn; M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \ M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \ M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \ + M(Bool, input_format_try_infer_exponent_floats, false, "Try to infer floats in exponential notation while schema inference in text formats", 0) \ M(Bool, output_format_markdown_escape_special_characters, false, "Escape special characters in Markdown", 0) \ M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \ M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. 
If turned off, default and null values are not serialized", 0) \ @@ -1146,6 +1147,8 @@ class IColumn; M(Bool, output_format_sql_insert_use_replace, false, "Use REPLACE statement instead of INSERT", 0) \ M(Bool, output_format_sql_insert_quote_names, true, "Quote column names with '`' characters", 0) \ \ + M(Bool, output_format_values_escape_quote_with_quote, false, "If true escape ' with '', otherwise quoted with \\'", 0) \ + \ M(Bool, output_format_bson_string_as_string, false, "Use BSON String type instead of Binary for String columns.", 0) \ M(Bool, input_format_bson_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip fields with unsupported types while schema inference for format BSON.", 0) \ \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 64650bf0ef5..e97a411e2c1 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -85,6 +85,8 @@ namespace SettingsChangesHistory static std::map settings_changes_history = { {"24.2", { + {"output_format_values_escape_quote_with_quote", false, false, "If true escape ' with '', otherwise quoted with \\'"}, + {"input_format_try_infer_exponent_floats", true, false, "Don't infer floats in exponential notation by default"}, {"async_insert_max_data_size", 1000000, 10485760, "The previous value appeared to be too small."}, {"async_insert_poll_timeout_ms", 10, 10, "Timeout in milliseconds for polling data from asynchronous insert queue"}, {"async_insert_use_adaptive_busy_timeout", true, true, "Use adaptive asynchronous insert timeout"}, diff --git a/src/Core/SettingsFields.cpp b/src/Core/SettingsFields.cpp index 80197cfbe22..f72b64fd56d 100644 --- a/src/Core/SettingsFields.cpp +++ b/src/Core/SettingsFields.cpp @@ -1,8 +1,7 @@ #include - #include +#include #include -#include #include #include #include @@ -13,6 +12,7 @@ #include + namespace DB { namespace ErrorCodes @@ -20,6 +20,7 @@ namespace ErrorCodes extern const int SIZE_OF_FIXED_STRING_DOESNT_MATCH; extern const int CANNOT_PARSE_BOOL; extern const int CANNOT_PARSE_NUMBER; + extern const int CANNOT_CONVERT_TYPE; } @@ -48,9 +49,51 @@ namespace T fieldToNumber(const Field & f) { if (f.getType() == Field::Types::String) + { return stringToNumber(f.get()); + } + else if (f.getType() == Field::Types::UInt64) + { + T result; + if (!accurate::convertNumeric(f.get(), result)) + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Field value {} is out of range of {} type", f, demangle(typeid(T).name())); + return result; + } + else if (f.getType() == Field::Types::Int64) + { + T result; + if (!accurate::convertNumeric(f.get(), result)) + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Field value {} is out of range of {} type", f, demangle(typeid(T).name())); + return result; + } + else if (f.getType() == Field::Types::Bool) + { + return T(f.get()); + } + else if (f.getType() == Field::Types::Float64) + { + Float64 x = f.get(); + if constexpr (std::is_floating_point_v) + { + return T(x); + } + else + { + if (!isFinite(x)) + { + /// Conversion of infinite values to integer is undefined. 
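/// A standalone rendition of the Float64 branch added to fieldToNumber() above: instead
/// of going through FieldVisitorConvertToNumber, non-finite values and values outside
/// the target integer range are rejected explicitly. numberFromFloatChecked is a
/// hypothetical name, and std::runtime_error stands in for DB::Exception with
/// CANNOT_CONVERT_TYPE.
#include <cmath>
#include <limits>
#include <stdexcept>
#include <type_traits>

template <typename T>
T numberFromFloatChecked(double x)
{
    if constexpr (std::is_floating_point_v<T>)
    {
        return static_cast<T>(x);
    }
    else
    {
        /// Conversion of infinite values to integer is undefined, so fail loudly.
        if (!std::isfinite(x))
            throw std::runtime_error("Cannot convert infinite value to integer type");

        if (x > static_cast<double>(std::numeric_limits<T>::max())
            || x < static_cast<double>(std::numeric_limits<T>::lowest()))
            throw std::runtime_error("Cannot convert out of range floating point value to integer type");

        return static_cast<T>(x);
    }
}

/// Example: numberFromFloatChecked<unsigned>(3.0) == 3, while
/// numberFromFloatChecked<unsigned>(1e20) and numberFromFloatChecked<unsigned>(INFINITY) throw.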
+ throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Cannot convert infinite value to integer type"); + } + else if (x > Float64(std::numeric_limits::max()) || x < Float64(std::numeric_limits::lowest())) + { + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Cannot convert out of range floating point value to integer type"); + } + else + return T(x); + } + } else - return applyVisitor(FieldVisitorConvertToNumber(), f); + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Invalid value {} of the setting, which needs {}", f, demangle(typeid(T).name())); } Map stringToMap(const String & str) @@ -174,7 +217,7 @@ namespace if (f.getType() == Field::Types::String) return stringToMaxThreads(f.get()); else - return applyVisitor(FieldVisitorConvertToNumber(), f); + return fieldToNumber(f); } } diff --git a/src/Daemon/SentryWriter.cpp b/src/Daemon/SentryWriter.cpp index ebfd18abeee..192e9952b9a 100644 --- a/src/Daemon/SentryWriter.cpp +++ b/src/Daemon/SentryWriter.cpp @@ -78,7 +78,7 @@ void SentryWriter::initialize(Poco::Util::LayeredConfiguration & config) if (enabled) { - server_data_path = config.getString("path", ""); + server_data_path = config.getString("path", DB::DBMS_DEFAULT_PATH); const std::filesystem::path & default_tmp_path = fs::path(config.getString("tmp_path", fs::temp_directory_path())) / "sentry"; const std::string & endpoint = config.getString("send_crash_reports.endpoint"); diff --git a/src/DataTypes/DataTypeArray.cpp b/src/DataTypes/DataTypeArray.cpp index 24cd759e2a5..6e5760933eb 100644 --- a/src/DataTypes/DataTypeArray.cpp +++ b/src/DataTypes/DataTypeArray.cpp @@ -69,6 +69,11 @@ String DataTypeArray::doGetPrettyName(size_t indent) const return s.str(); } +void DataTypeArray::forEachChild(const ChildCallback & callback) const +{ + callback(*nested); + nested->forEachChild(callback); +} static DataTypePtr create(const ASTPtr & arguments) { diff --git a/src/DataTypes/DataTypeArray.h b/src/DataTypes/DataTypeArray.h index 6a09b3b530d..4423f137e1a 100644 --- a/src/DataTypes/DataTypeArray.h +++ b/src/DataTypes/DataTypeArray.h @@ -43,6 +43,7 @@ public: MutableColumnPtr createColumn() const override; + void forEachChild(const ChildCallback & callback) const override; Field getDefault() const override; diff --git a/src/DataTypes/DataTypeLowCardinality.cpp b/src/DataTypes/DataTypeLowCardinality.cpp index 3e94b533c7a..5af1f28cbad 100644 --- a/src/DataTypes/DataTypeLowCardinality.cpp +++ b/src/DataTypes/DataTypeLowCardinality.cpp @@ -153,6 +153,12 @@ SerializationPtr DataTypeLowCardinality::doGetDefaultSerialization() const return std::make_shared(dictionary_type); } +void DataTypeLowCardinality::forEachChild(const ChildCallback & callback) const +{ + callback(*dictionary_type); + dictionary_type->forEachChild(callback); +} + static DataTypePtr create(const ASTPtr & arguments) { diff --git a/src/DataTypes/DataTypeLowCardinality.h b/src/DataTypes/DataTypeLowCardinality.h index 389e24ef2a9..cd926bb595c 100644 --- a/src/DataTypes/DataTypeLowCardinality.h +++ b/src/DataTypes/DataTypeLowCardinality.h @@ -60,6 +60,8 @@ public: static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type); static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type, MutableColumnPtr && keys); + void forEachChild(const ChildCallback & callback) const override; + private: SerializationPtr doGetDefaultSerialization() const override; diff --git a/src/DataTypes/DataTypeMap.cpp b/src/DataTypes/DataTypeMap.cpp index 1f246af74d3..4b85606ff26 100644 --- a/src/DataTypes/DataTypeMap.cpp +++ 
b/src/DataTypes/DataTypeMap.cpp @@ -143,6 +143,14 @@ DataTypePtr DataTypeMap::getNestedTypeWithUnnamedTuple() const return std::make_shared(std::make_shared(from_tuple.getElements())); } +void DataTypeMap::forEachChild(const DB::IDataType::ChildCallback & callback) const +{ + callback(*key_type); + key_type->forEachChild(callback); + callback(*value_type); + value_type->forEachChild(callback); +} + static DataTypePtr create(const ASTPtr & arguments) { if (!arguments || arguments->children.size() != 2) diff --git a/src/DataTypes/DataTypeMap.h b/src/DataTypes/DataTypeMap.h index 257888a8e44..7281cca1bb1 100644 --- a/src/DataTypes/DataTypeMap.h +++ b/src/DataTypes/DataTypeMap.h @@ -54,6 +54,8 @@ public: static bool checkKeyType(DataTypePtr key_type); + void forEachChild(const ChildCallback & callback) const override; + private: void assertKeyType() const; }; diff --git a/src/DataTypes/DataTypeNullable.cpp b/src/DataTypes/DataTypeNullable.cpp index 484d779551f..16d5d41e5e5 100644 --- a/src/DataTypes/DataTypeNullable.cpp +++ b/src/DataTypes/DataTypeNullable.cpp @@ -61,6 +61,12 @@ SerializationPtr DataTypeNullable::doGetDefaultSerialization() const return std::make_shared(nested_data_type->getDefaultSerialization()); } +void DataTypeNullable::forEachChild(const ChildCallback & callback) const +{ + callback(*nested_data_type); + nested_data_type->forEachChild(callback); +} + static DataTypePtr create(const ASTPtr & arguments) { diff --git a/src/DataTypes/DataTypeNullable.h b/src/DataTypes/DataTypeNullable.h index 7ad0e1ba5f1..b102c767993 100644 --- a/src/DataTypes/DataTypeNullable.h +++ b/src/DataTypes/DataTypeNullable.h @@ -43,6 +43,9 @@ public: bool canBePromoted() const override { return nested_data_type->canBePromoted(); } const DataTypePtr & getNestedType() const { return nested_data_type; } + + void forEachChild(const ChildCallback & callback) const override; + private: SerializationPtr doGetDefaultSerialization() const override; diff --git a/src/DataTypes/DataTypeTuple.cpp b/src/DataTypes/DataTypeTuple.cpp index 5c9d5a3366e..eb218d8efb7 100644 --- a/src/DataTypes/DataTypeTuple.cpp +++ b/src/DataTypes/DataTypeTuple.cpp @@ -376,6 +376,14 @@ SerializationInfoPtr DataTypeTuple::getSerializationInfo(const IColumn & column) return std::make_shared(std::move(infos), names, SerializationInfo::Settings{}); } +void DataTypeTuple::forEachChild(const ChildCallback & callback) const +{ + for (const auto & elem : elems) + { + callback(*elem); + elem->forEachChild(callback); + } +} static DataTypePtr create(const ASTPtr & arguments) { diff --git a/src/DataTypes/DataTypeTuple.h b/src/DataTypes/DataTypeTuple.h index db49b7f22d1..4e5a0c1b33c 100644 --- a/src/DataTypes/DataTypeTuple.h +++ b/src/DataTypes/DataTypeTuple.h @@ -70,6 +70,8 @@ public: String getNameByPosition(size_t i) const; bool haveExplicitNames() const { return have_explicit_names; } + + void forEachChild(const ChildCallback & callback) const override; }; } diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp index 456b4ea03b6..0543507a14d 100644 --- a/src/DataTypes/DataTypeVariant.cpp +++ b/src/DataTypes/DataTypeVariant.cpp @@ -175,6 +175,15 @@ SerializationPtr DataTypeVariant::doGetDefaultSerialization() const return std::make_shared(std::move(serializations), std::move(variant_names), SerializationVariant::getVariantsDeserializeTextOrder(variants), getName()); } +void DataTypeVariant::forEachChild(const DB::IDataType::ChildCallback & callback) const +{ + for (const auto & variant : variants) + { + 
callback(*variant); + variant->forEachChild(callback); + } +} + static DataTypePtr create(const ASTPtr & arguments) { if (!arguments || arguments->children.empty()) diff --git a/src/DataTypes/DataTypeVariant.h b/src/DataTypes/DataTypeVariant.h index d26ce4ea90f..2a2206f985a 100644 --- a/src/DataTypes/DataTypeVariant.h +++ b/src/DataTypes/DataTypeVariant.h @@ -54,6 +54,8 @@ public: /// Check if Variant has provided type in the list of variants and return its discriminator. std::optional tryGetVariantDiscriminator(const DataTypePtr & type) const; + void forEachChild(const ChildCallback & callback) const override; + private: std::string doGetName() const override; std::string doGetPrettyName(size_t indent) const override; diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index 48cc127746f..220658afda5 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -111,6 +111,10 @@ public: const SubcolumnCallback & callback, const SubstreamData & data); + /// Call callback for each nested type recursively. + using ChildCallback = std::function; + virtual void forEachChild(const ChildCallback &) const {} + Names getSubcolumnNames() const; virtual MutableSerializationInfoPtr createSerializationInfo(const SerializationInfo::Settings & settings) const; diff --git a/src/DataTypes/Serializations/SerializationString.cpp b/src/DataTypes/Serializations/SerializationString.cpp index 6bffa0ff72e..fd46206e9ad 100644 --- a/src/DataTypes/Serializations/SerializationString.cpp +++ b/src/DataTypes/Serializations/SerializationString.cpp @@ -334,9 +334,12 @@ bool SerializationString::tryDeserializeTextEscaped(IColumn & column, ReadBuffer return read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); return true; }); } -void SerializationString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const +void SerializationString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { - writeQuotedString(assert_cast(column).getDataAt(row_num), ostr); + if (settings.values.escape_quote_with_quote) + writeQuotedStringPostgreSQL(assert_cast(column).getDataAt(row_num).toView(), ostr); + else + writeQuotedString(assert_cast(column).getDataAt(row_num), ostr); } diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index 5d8c84b70bf..399ad870d60 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -62,7 +62,7 @@ void SerializationTuple::serializeBinary(const IColumn & column, size_t row_num, } -template +template static ReturnType addElementSafe(size_t num_elems, IColumn & column, F && impl) { static constexpr bool throw_exception = std::is_same_v; @@ -85,11 +85,7 @@ static ReturnType addElementSafe(size_t num_elems, IColumn & column, F && impl) try { - if constexpr (throw_exception) - { - impl(); - } - else if (!impl()) + if (!impl()) { restore_elements(); return ReturnType(false); @@ -125,10 +121,11 @@ static ReturnType addElementSafe(size_t num_elems, IColumn & column, F && impl) void SerializationTuple::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - addElementSafe(elems.size(), column, [&] + addElementSafe(elems.size(), column, [&] { for (size_t i = 0; i < elems.size(); ++i) elems[i]->deserializeBinary(extractElementColumn(column, i), istr, settings); + return 
true; }); } @@ -165,7 +162,7 @@ ReturnType SerializationTuple::deserializeTextImpl(IColumn & column, ReadBuffer if constexpr (throw_exception) assertChar(',', istr); else if (!checkChar(',', istr)) - return ReturnType(false); + return false; skipWhitespaceIfAny(istr); } @@ -203,16 +200,16 @@ ReturnType SerializationTuple::deserializeTextImpl(IColumn & column, ReadBuffer if constexpr (throw_exception) assertChar(')', istr); else if (!checkChar(')', istr)) - return ReturnType(false); + return false; if (whole && !istr.eof()) { if constexpr (throw_exception) throwUnexpectedDataAfterParsedValue(column, istr, settings, "Tuple"); - return ReturnType(false); + return false; } - return ReturnType(true); + return true; }; return addElementSafe(elems.size(), column, impl); @@ -323,6 +320,7 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(element_column, istr, settings, elems[element_pos]); else elems[element_pos]->deserializeTextJSON(element_column, istr, settings); + return true; } else { @@ -353,7 +351,7 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf { if constexpr (throw_exception) throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected number of elements in named tuple. Expected no more than {} (consider enabling input_format_json_ignore_unknown_keys_in_named_tuple setting)", elems.size()); - return ReturnType(false); + return false; } if (processed + skipped > 0) @@ -361,7 +359,7 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf if constexpr (throw_exception) assertChar(',', istr); else if (!checkChar(',', istr)) - return ReturnType(false); + return false; skipWhitespaceIfAny(istr); } @@ -369,13 +367,13 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf if constexpr (throw_exception) readDoubleQuotedString(name, istr); else if (!tryReadDoubleQuotedString(name, istr)) - return ReturnType(false); + return false; skipWhitespaceIfAny(istr); if constexpr (throw_exception) assertChar(':', istr); else if (!checkChar(':', istr)) - return ReturnType(false); + return false; skipWhitespaceIfAny(istr); const size_t element_pos = getPositionByName(name); @@ -386,7 +384,7 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf if constexpr (throw_exception) skipJSONField(istr, name); else if (!trySkipJSONField(istr, name)) - return ReturnType(false); + return false; skipWhitespaceIfAny(istr); ++skipped; @@ -396,7 +394,7 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf { if constexpr (throw_exception) throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK, "Tuple doesn't have element with name '{}', enable setting input_format_json_ignore_unknown_keys_in_named_tuple", name); - return ReturnType(false); + return false; } } @@ -418,7 +416,7 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf else { if (!deserialize_element(element_column, element_pos)) - return ReturnType(false); + return false; } skipWhitespaceIfAny(istr); @@ -428,7 +426,7 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf if constexpr (throw_exception) assertChar('}', istr); else if (!checkChar('}', istr)) - return ReturnType(false); + return false; /// Check if we have missing elements. 
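/// A condensed sketch of the addElementSafe<ReturnType>() contract after the change
/// above: the per-element lambda now always returns a bool-like value (throwing
/// deserializers simply end with "return true"), and any failure, whether a false return
/// or an exception, rolls the tuple's element columns back to a consistent size.
/// addElementSafeSketch and the vector of sizes are placeholders for the real column
/// machinery, so the signature differs from the actual function.
#include <algorithm>
#include <cstddef>
#include <type_traits>
#include <vector>

template <typename ReturnType, typename F>
ReturnType addElementSafeSketch(std::vector<size_t> & element_sizes, F && impl)
{
    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;

    auto restore_elements = [&]
    {
        /// Trim every element column back to a common row count so a half-parsed
        /// tuple never leaves its element columns with different sizes.
        if (element_sizes.empty())
            return;
        size_t min_size = *std::min_element(element_sizes.begin(), element_sizes.end());
        for (auto & size : element_sizes)
            size = min_size;
    };

    try
    {
        if (!impl())   /// success is reported uniformly, in both throwing and non-throwing mode
        {
            restore_elements();
            return ReturnType(false);
        }
    }
    catch (...)
    {
        restore_elements();
        if constexpr (throw_exception)
            throw;
        else
            return ReturnType(false);
    }

    return ReturnType(true);
}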
if (processed != elems.size()) @@ -446,7 +444,7 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf "JSON object doesn't contain tuple element {}. If you want to insert defaults in case of missing elements, " "enable setting input_format_json_defaults_for_missing_elements_in_named_tuple", elems[element_pos]->getElementName()); - return ReturnType(false); + return false; } auto & element_column = extractElementColumn(column, element_pos); @@ -454,7 +452,7 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf } } - return ReturnType(true); + return true; }; return addElementSafe(elems.size(), column, impl); @@ -465,7 +463,7 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf if constexpr (throw_exception) assertChar('[', istr); else if (!checkChar('[', istr)) - return ReturnType(false); + return false; skipWhitespaceIfAny(istr); auto impl = [&]() @@ -478,7 +476,7 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf if constexpr (throw_exception) assertChar(',', istr); else if (!checkChar(',', istr)) - return ReturnType(false); + return false; skipWhitespaceIfAny(istr); } @@ -487,16 +485,16 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf if constexpr (throw_exception) deserialize_element(element_column, i); else if (!deserialize_element(element_column, i)) - return ReturnType(false); + return false; } skipWhitespaceIfAny(istr); if constexpr (throw_exception) assertChar(']', istr); else if (!checkChar(']', istr)) - return ReturnType(false); + return false; - return ReturnType(true); + return true; }; return addElementSafe(elems.size(), column, impl); @@ -538,7 +536,7 @@ void SerializationTuple::serializeTextCSV(const IColumn & column, size_t row_num void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - addElementSafe(elems.size(), column, [&] + addElementSafe(elems.size(), column, [&] { const size_t size = elems.size(); for (size_t i = 0; i < size; ++i) @@ -556,6 +554,7 @@ void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, else elems[i]->deserializeTextCSV(element_column, istr, settings); } + return true; }); } diff --git a/src/Databases/DatabaseFilesystem.cpp b/src/Databases/DatabaseFilesystem.cpp index 55ae60469ed..bc6714a6471 100644 --- a/src/Databases/DatabaseFilesystem.cpp +++ b/src/Databases/DatabaseFilesystem.cpp @@ -146,9 +146,18 @@ StoragePtr DatabaseFilesystem::getTableImpl(const String & name, ContextPtr cont if (!checkTableFilePath(table_path, context_, throw_on_error)) return {}; - auto format = FormatFactory::instance().getFormatFromFileName(table_path, throw_on_error); - if (format.empty()) - return {}; + String format; + if (throw_on_error) + { + format = FormatFactory::instance().getFormatFromFileName(table_path); + } + else + { + auto format_maybe = FormatFactory::instance().tryGetFormatFromFileName(table_path); + if (!format_maybe) + return {}; + format = *format_maybe; + } auto ast_function_ptr = makeASTFunction("file", std::make_shared(table_path), std::make_shared(format)); diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index 2f448cd7036..ec380fa759d 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -407,7 +407,7 @@ public: virtual void stopReplication() { - throw Exception(ErrorCodes::LOGICAL_ERROR, "Database engine {} does not run a replication thread!", 
getEngineName()); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Database engine {} does not run a replication thread", getEngineName()); } virtual bool shouldReplicateQuery(const ContextPtr & /*query_context*/, const ASTPtr & /*query_ptr*/) const { return false; } diff --git a/src/Dictionaries/CacheDictionary.h b/src/Dictionaries/CacheDictionary.h index aae86a83f12..a52bcbc4ae4 100644 --- a/src/Dictionaries/CacheDictionary.h +++ b/src/Dictionaries/CacheDictionary.h @@ -78,22 +78,22 @@ public: double getLoadFactor() const override; - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + size_t getQueryCount() const override { return query_count.load(); } double getFoundRate() const override { - size_t queries = query_count.load(std::memory_order_relaxed); + size_t queries = query_count.load(); if (!queries) return 0; - return static_cast(found_count.load(std::memory_order_relaxed)) / queries; + return std::min(1.0, static_cast(found_count.load()) / queries); } double getHitRate() const override { - size_t queries = query_count.load(std::memory_order_relaxed); + size_t queries = query_count.load(); if (!queries) return 0; - return static_cast(hit_count.load(std::memory_order_acquire)) / queries; + return static_cast(hit_count.load()) / queries; } bool supportUpdates() const override { return false; } diff --git a/src/Dictionaries/DirectDictionary.h b/src/Dictionaries/DirectDictionary.h index 214c8ef8a13..73340904684 100644 --- a/src/Dictionaries/DirectDictionary.h +++ b/src/Dictionaries/DirectDictionary.h @@ -34,14 +34,14 @@ public: size_t getBytesAllocated() const override { return 0; } - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + size_t getQueryCount() const override { return query_count.load(); } double getFoundRate() const override { - size_t queries = query_count.load(std::memory_order_relaxed); + size_t queries = query_count.load(); if (!queries) return 0; - return static_cast(found_count.load(std::memory_order_relaxed)) / queries; + return std::min(1.0, static_cast(found_count.load()) / queries); } double getHitRate() const override { return 1.0; } diff --git a/src/Dictionaries/FlatDictionary.h b/src/Dictionaries/FlatDictionary.h index a54916c5cd1..aac55610351 100644 --- a/src/Dictionaries/FlatDictionary.h +++ b/src/Dictionaries/FlatDictionary.h @@ -41,14 +41,14 @@ public: size_t getBytesAllocated() const override { return bytes_allocated; } - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + size_t getQueryCount() const override { return query_count.load(); } double getFoundRate() const override { - size_t queries = query_count.load(std::memory_order_relaxed); + size_t queries = query_count.load(); if (!queries) return 0; - return static_cast(found_count.load(std::memory_order_relaxed)) / queries; + return std::min(1.0, static_cast(found_count.load()) / queries); } double getHitRate() const override { return 1.0; } diff --git a/src/Dictionaries/HashedArrayDictionary.h b/src/Dictionaries/HashedArrayDictionary.h index 86b21443e18..f18a8f4a474 100644 --- a/src/Dictionaries/HashedArrayDictionary.h +++ b/src/Dictionaries/HashedArrayDictionary.h @@ -57,14 +57,14 @@ public: size_t getBytesAllocated() const override { return bytes_allocated; } - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + size_t getQueryCount() const override { return query_count.load(); } double getFoundRate() const override { - 
size_t queries = query_count.load(std::memory_order_relaxed); + size_t queries = query_count.load(); if (!queries) return 0; - return static_cast(found_count.load(std::memory_order_relaxed)) / queries; + return std::min(1.0, static_cast(found_count.load()) / queries); } double getHitRate() const override { return 1.0; } diff --git a/src/Dictionaries/HashedDictionary.h b/src/Dictionaries/HashedDictionary.h index 0b8419dd242..ed80973fcf3 100644 --- a/src/Dictionaries/HashedDictionary.h +++ b/src/Dictionaries/HashedDictionary.h @@ -99,14 +99,14 @@ public: size_t getBytesAllocated() const override { return bytes_allocated; } - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + size_t getQueryCount() const override { return query_count.load(); } double getFoundRate() const override { - size_t queries = query_count.load(std::memory_order_relaxed); + size_t queries = query_count.load(); if (!queries) return 0; - return static_cast(found_count.load(std::memory_order_relaxed)) / queries; + return std::min(1.0, static_cast(found_count.load()) / queries); } double getHitRate() const override { return 1.0; } diff --git a/src/Dictionaries/IDictionary.h b/src/Dictionaries/IDictionary.h index f1834b4b129..d3e28682f35 100644 --- a/src/Dictionaries/IDictionary.h +++ b/src/Dictionaries/IDictionary.h @@ -109,6 +109,9 @@ public: virtual size_t getQueryCount() const = 0; + /// The percentage of time a lookup successfully found an entry. + /// When there were no lookups, it returns zero (instead of NaN). + /// The value is calculated non atomically and can be slightly off in the presence of concurrent lookups. virtual double getFoundRate() const = 0; virtual double getHitRate() const = 0; diff --git a/src/Dictionaries/IPAddressDictionary.h b/src/Dictionaries/IPAddressDictionary.h index d758e23043d..105bf7e340a 100644 --- a/src/Dictionaries/IPAddressDictionary.h +++ b/src/Dictionaries/IPAddressDictionary.h @@ -41,14 +41,14 @@ public: size_t getBytesAllocated() const override { return bytes_allocated; } - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + size_t getQueryCount() const override { return query_count.load(); } double getFoundRate() const override { - size_t queries = query_count.load(std::memory_order_relaxed); + size_t queries = query_count.load(); if (!queries) return 0; - return static_cast(found_count.load(std::memory_order_relaxed)) / queries; + return std::min(1.0, static_cast(found_count.load()) / queries); } double getHitRate() const override { return 1.0; } diff --git a/src/Dictionaries/PolygonDictionary.h b/src/Dictionaries/PolygonDictionary.h index a856d12b66c..48a1f0e56da 100644 --- a/src/Dictionaries/PolygonDictionary.h +++ b/src/Dictionaries/PolygonDictionary.h @@ -71,14 +71,14 @@ public: size_t getBytesAllocated() const override { return bytes_allocated; } - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + size_t getQueryCount() const override { return query_count.load(); } double getFoundRate() const override { - size_t queries = query_count.load(std::memory_order_relaxed); + size_t queries = query_count.load(); if (!queries) return 0; - return static_cast(found_count.load(std::memory_order_relaxed)) / queries; + return std::min(1.0, static_cast(found_count.load()) / queries); } double getHitRate() const override { return 1.0; } diff --git a/src/Dictionaries/RangeHashedDictionary.h b/src/Dictionaries/RangeHashedDictionary.h index 
c44bffe42e1..28db67038ca 100644 --- a/src/Dictionaries/RangeHashedDictionary.h +++ b/src/Dictionaries/RangeHashedDictionary.h @@ -85,14 +85,14 @@ public: size_t getBytesAllocated() const override { return bytes_allocated; } - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + size_t getQueryCount() const override { return query_count.load(); } double getFoundRate() const override { - size_t queries = query_count.load(std::memory_order_relaxed); + size_t queries = query_count.load(); if (!queries) return 0; - return static_cast(found_count.load(std::memory_order_relaxed)) / queries; + return std::min(1.0, static_cast(found_count.load()) / queries); } double getHitRate() const override { return 1.0; } diff --git a/src/Dictionaries/RegExpTreeDictionary.h b/src/Dictionaries/RegExpTreeDictionary.h index 78b7f441d34..68b6b603692 100644 --- a/src/Dictionaries/RegExpTreeDictionary.h +++ b/src/Dictionaries/RegExpTreeDictionary.h @@ -58,14 +58,14 @@ public: size_t getBytesAllocated() const override { return bytes_allocated; } - size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); } + size_t getQueryCount() const override { return query_count.load(); } double getFoundRate() const override { - const auto queries = query_count.load(std::memory_order_relaxed); + const auto queries = query_count.load(); if (!queries) return 0; - return static_cast(found_count.load(std::memory_order_relaxed)) / queries; + return std::min(1.0, static_cast(found_count.load()) / queries); } double getHitRate() const override { return 1.0; } diff --git a/src/Dictionaries/YAMLRegExpTreeDictionarySource.cpp b/src/Dictionaries/YAMLRegExpTreeDictionarySource.cpp index f1591943a12..b35e507b242 100644 --- a/src/Dictionaries/YAMLRegExpTreeDictionarySource.cpp +++ b/src/Dictionaries/YAMLRegExpTreeDictionarySource.cpp @@ -227,7 +227,7 @@ void parseMatchNode(UInt64 parent_id, UInt64 & id, const YAML::Node & node, Resu if (!match.contains(key_name)) { - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Yaml match rule must contain key {}", key_name); + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "YAML match rule must contain key {}", key_name); } for (const auto & [key, node_] : match) { diff --git a/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp b/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp index bd19540bf44..2373640704b 100644 --- a/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp +++ b/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp @@ -69,12 +69,10 @@ bool AsynchronousBoundedReadBuffer::hasPendingDataToRead() return false; if (file_offset_of_buffer_end > *read_until_position) - { throw Exception( ErrorCodes::LOGICAL_ERROR, - "Read beyond last offset ({} > {}, info: {})", - file_offset_of_buffer_end, *read_until_position, impl->getInfoForLog()); - } + "Read beyond last offset ({} > {}): file size = {}, info: {}", + file_offset_of_buffer_end, *read_until_position, impl->getFileSize(), impl->getInfoForLog()); } return true; @@ -126,14 +124,15 @@ void AsynchronousBoundedReadBuffer::setReadUntilPosition(size_t position) if (position < file_offset_of_buffer_end) { /// file has been read beyond new read until position already - if (working_buffer.size() >= file_offset_of_buffer_end - position) + if (available() >= file_offset_of_buffer_end - position) { - /// new read until position is inside working buffer + /// new read until position is after the current position in the working buffer file_offset_of_buffer_end = position; + 
working_buffer.resize(working_buffer.size() - (file_offset_of_buffer_end - position)); } else { - /// new read until position is before working buffer begin + /// new read until position is before the current position in the working buffer throw Exception( ErrorCodes::LOGICAL_ERROR, "Attempt to set read until position before already read data ({} > {}, info: {})", @@ -186,6 +185,7 @@ bool AsynchronousBoundedReadBuffer::nextImpl() return false; chassert(file_offset_of_buffer_end <= impl->getFileSize()); + size_t old_file_offset_of_buffer_end = file_offset_of_buffer_end; IAsynchronousReader::Result result; if (prefetch_future.valid()) @@ -221,6 +221,9 @@ bool AsynchronousBoundedReadBuffer::nextImpl() ProfileEvents::increment(ProfileEvents::RemoteFSUnprefetchedBytes, result.size); } + bytes_to_ignore = 0; + resetWorkingBuffer(); + size_t bytes_read = result.size - result.offset; if (bytes_read) { @@ -231,14 +234,26 @@ bool AsynchronousBoundedReadBuffer::nextImpl() } file_offset_of_buffer_end = impl->getFileOffsetOfBufferEnd(); - bytes_to_ignore = 0; /// In case of multiple files for the same file in clickhouse (i.e. log family) /// file_offset_of_buffer_end will not match getImplementationBufferOffset() /// so we use [impl->getImplementationBufferOffset(), impl->getFileSize()] chassert(file_offset_of_buffer_end <= impl->getFileSize()); - return bytes_read; + if (read_until_position && (file_offset_of_buffer_end > *read_until_position)) + { + size_t excessive_bytes_read = file_offset_of_buffer_end - *read_until_position; + + if (excessive_bytes_read > working_buffer.size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "File offset moved too far: old_file_offset = {}, new_file_offset = {}, read_until_position = {}, bytes_read = {}", + old_file_offset_of_buffer_end, file_offset_of_buffer_end, *read_until_position, bytes_read); + + working_buffer.resize(working_buffer.size() - excessive_bytes_read); + file_offset_of_buffer_end = *read_until_position; + } + + return !working_buffer.empty(); } diff --git a/src/Disks/IO/AsynchronousBoundedReadBuffer.h b/src/Disks/IO/AsynchronousBoundedReadBuffer.h index e5030f37b1d..6dc76352aca 100644 --- a/src/Disks/IO/AsynchronousBoundedReadBuffer.h +++ b/src/Disks/IO/AsynchronousBoundedReadBuffer.h @@ -95,7 +95,6 @@ private: IAsynchronousReader::Result readSync(char * data, size_t size); void resetPrefetch(FilesystemPrefetchState state); - }; } diff --git a/src/Disks/IO/ThreadPoolReader.h b/src/Disks/IO/ThreadPoolReader.h index 42bc9bf8bb4..b8aff9f22a2 100644 --- a/src/Disks/IO/ThreadPoolReader.h +++ b/src/Disks/IO/ThreadPoolReader.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include namespace DB diff --git a/src/Disks/IO/ThreadPoolRemoteFSReader.h b/src/Disks/IO/ThreadPoolRemoteFSReader.h index cd2bf223f33..abc251b2b10 100644 --- a/src/Disks/IO/ThreadPoolRemoteFSReader.h +++ b/src/Disks/IO/ThreadPoolRemoteFSReader.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include namespace DB { diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 049935ad60c..56c269a3fc5 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include "config.h" diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h index a6abe03bac9..5f63e5f6e8a 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h +++ 
b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 4cc49288af6..5771eb1ebe0 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Disks/VolumeJBOD.cpp b/src/Disks/VolumeJBOD.cpp index e437684b802..ec9e5ea0d39 100644 --- a/src/Disks/VolumeJBOD.cpp +++ b/src/Disks/VolumeJBOD.cpp @@ -85,7 +85,7 @@ VolumeJBOD::VolumeJBOD(const VolumeJBOD & volume_jbod, DiskSelectorPtr disk_selector) : VolumeJBOD(volume_jbod.name, config, config_prefix, disk_selector) { - are_merges_avoided_user_override = volume_jbod.are_merges_avoided_user_override.load(std::memory_order_relaxed); + are_merges_avoided_user_override = volume_jbod.are_merges_avoided_user_override.load(); last_used = volume_jbod.last_used.load(std::memory_order_relaxed); } diff --git a/src/Disks/tests/gtest_asynchronous_bounded_read_buffer.cpp b/src/Disks/tests/gtest_asynchronous_bounded_read_buffer.cpp new file mode 100644 index 00000000000..63a39fe39c7 --- /dev/null +++ b/src/Disks/tests/gtest_asynchronous_bounded_read_buffer.cpp @@ -0,0 +1,82 @@ +#include + +#include +#include +#include +#include +#include +#include + + +using namespace DB; +namespace fs = std::filesystem; + +class AsynchronousBoundedReadBufferTest : public ::testing::TestWithParam +{ +public: + AsynchronousBoundedReadBufferTest() { fs::create_directories(temp_folder.path()); } + + String makeTempFile(const String & contents) + { + String path = fmt::format("{}/{}", temp_folder.path(), counter); + ++counter; + + WriteBufferFromFile out{path}; + out.write(contents.data(), contents.size()); + out.finalize(); + + return path; + } + +private: + Poco::TemporaryFile temp_folder; + size_t counter = 0; +}; + +String getAlphabetWithDigits() +{ + String contents; + for (char c = 'a'; c <= 'z'; ++c) + contents += c; + for (char c = '0'; c <= '9'; ++c) + contents += c; + return contents; +} + + +TEST_F(AsynchronousBoundedReadBufferTest, setReadUntilPosition) +{ + String file_path = makeTempFile(getAlphabetWithDigits()); + ThreadPoolRemoteFSReader remote_fs_reader(4, 0); + + for (bool with_prefetch : {false, true}) + { + AsynchronousBoundedReadBuffer read_buffer(createReadBufferFromFileBase(file_path, {}), remote_fs_reader, {}); + read_buffer.setReadUntilPosition(20); + + auto try_read = [&](size_t count) + { + if (with_prefetch) + read_buffer.prefetch(Priority{0}); + + String str; + str.resize(count); + str.resize(read_buffer.read(str.data(), str.size())); + return str; + }; + + EXPECT_EQ(try_read(15), "abcdefghijklmno"); + EXPECT_EQ(try_read(15), "pqrst"); + EXPECT_EQ(try_read(15), ""); + + read_buffer.setReadUntilPosition(25); + + EXPECT_EQ(try_read(15), "uvwxy"); + EXPECT_EQ(try_read(15), ""); + + read_buffer.setReadUntilEnd(); + + EXPECT_EQ(try_read(15), "z0123456789"); + EXPECT_EQ(try_read(15), ""); + } +} diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 8c39b4b71e4..0654dd01e49 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -39,7 +39,7 @@ const FormatFactory::Creators & FormatFactory::getCreators(const String & name) throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown format {}", name); } -FormatSettings getFormatSettings(ContextPtr context) +FormatSettings 
getFormatSettings(const ContextPtr & context) { const auto & settings = context->getSettingsRef(); @@ -47,7 +47,7 @@ FormatSettings getFormatSettings(ContextPtr context) } template -FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) +FormatSettings getFormatSettings(const ContextPtr & context, const Settings & settings) { FormatSettings format_settings; @@ -181,6 +181,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.values.allow_data_after_semicolon = settings.input_format_values_allow_data_after_semicolon; format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions; format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions; + format_settings.values.escape_quote_with_quote = settings.output_format_values_escape_quote_with_quote; format_settings.with_names_use_header = settings.input_format_with_names_use_header; format_settings.with_types_use_header = settings.input_format_with_types_use_header; format_settings.write_statistics = settings.output_format_write_statistics; @@ -228,6 +229,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.try_infer_integers = settings.input_format_try_infer_integers; format_settings.try_infer_dates = settings.input_format_try_infer_dates; format_settings.try_infer_datetimes = settings.input_format_try_infer_datetimes; + format_settings.try_infer_exponent_floats = settings.input_format_try_infer_exponent_floats; format_settings.markdown.escape_special_characters = settings.output_format_markdown_escape_special_characters; format_settings.bson.output_string_as_string = settings.output_format_bson_string_as_string; format_settings.bson.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_bson_skip_fields_with_unsupported_types_in_schema_inference; @@ -255,16 +257,16 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) return format_settings; } -template FormatSettings getFormatSettings(ContextPtr context, const FormatFactorySettings & settings); +template FormatSettings getFormatSettings(const ContextPtr & context, const FormatFactorySettings & settings); -template FormatSettings getFormatSettings(ContextPtr context, const Settings & settings); +template FormatSettings getFormatSettings(const ContextPtr & context, const Settings & settings); InputFormatPtr FormatFactory::getInput( const String & name, ReadBuffer & _buf, const Block & sample, - ContextPtr context, + const ContextPtr & context, UInt64 max_block_size, const std::optional & _format_settings, std::optional _max_parsing_threads, @@ -427,7 +429,7 @@ std::unique_ptr FormatFactory::wrapReadBufferIfNeeded( return res; } -static void addExistingProgressToOutputFormat(OutputFormatPtr format, ContextPtr context) +static void addExistingProgressToOutputFormat(OutputFormatPtr format, const ContextPtr & context) { auto element_id = context->getProcessListElementSafe(); if (element_id) @@ -446,7 +448,7 @@ OutputFormatPtr FormatFactory::getOutputFormatParallelIfPossible( const String & name, WriteBuffer & buf, const Block & sample, - ContextPtr context, + const ContextPtr & context, const std::optional & _format_settings) const { const auto & output_getter = getCreators(name).output_creator; @@ -484,7 +486,7 @@ OutputFormatPtr FormatFactory::getOutputFormat( const String & name, WriteBuffer & buf, const Block & sample, - ContextPtr 
context, + const ContextPtr & context, const std::optional & _format_settings) const { const auto & output_getter = getCreators(name).output_creator; @@ -518,7 +520,7 @@ OutputFormatPtr FormatFactory::getOutputFormat( String FormatFactory::getContentType( const String & name, - ContextPtr context, + const ContextPtr & context, const std::optional & _format_settings) const { const auto & output_getter = getCreators(name).output_creator; @@ -537,7 +539,7 @@ String FormatFactory::getContentType( SchemaReaderPtr FormatFactory::getSchemaReader( const String & name, ReadBuffer & buf, - ContextPtr & context, + const ContextPtr & context, const std::optional & _format_settings) const { const auto & schema_reader_creator = dict.at(name).schema_reader_creator; @@ -553,7 +555,7 @@ SchemaReaderPtr FormatFactory::getSchemaReader( ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader( const String & name, - ContextPtr & context, + const ContextPtr & context, const std::optional & _format_settings) const { const auto & external_schema_reader_creator = dict.at(name).external_schema_reader_creator; @@ -607,7 +609,7 @@ void FormatFactory::markFormatHasNoAppendSupport(const String & name) registerAppendSupportChecker(name, [](const FormatSettings &){ return false; }); } -bool FormatFactory::checkIfFormatSupportAppend(const String & name, ContextPtr context, const std::optional & format_settings_) +bool FormatFactory::checkIfFormatSupportAppend(const String & name, const ContextPtr & context, const std::optional & format_settings_) { auto format_settings = format_settings_ ? *format_settings_ : getFormatSettings(context); auto & append_support_checker = dict[name].append_support_checker; @@ -630,10 +632,10 @@ void FormatFactory::registerFileExtension(const String & extension, const String file_extension_formats[boost::to_lower_copy(extension)] = format_name; } -String FormatFactory::getFormatFromFileName(String file_name, bool throw_if_not_found) +std::optional FormatFactory::tryGetFormatFromFileName(String file_name) { if (file_name == "stdin") - return getFormatFromFileDescriptor(STDIN_FILENO); + return tryGetFormatFromFileDescriptor(STDIN_FILENO); CompressionMethod compression_method = chooseCompressionMethod(file_name, ""); if (CompressionMethod::None != compression_method) @@ -645,43 +647,53 @@ String FormatFactory::getFormatFromFileName(String file_name, bool throw_if_not_ auto pos = file_name.find_last_of('.'); if (pos == String::npos) - { - if (throw_if_not_found) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the file format by it's extension"); - return ""; - } + return std::nullopt; String file_extension = file_name.substr(pos + 1, String::npos); boost::algorithm::to_lower(file_extension); auto it = file_extension_formats.find(file_extension); if (it == file_extension_formats.end()) - { - if (throw_if_not_found) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the file format by it's extension"); - return ""; - } + return std::nullopt; + return it->second; } -String FormatFactory::getFormatFromFileDescriptor(int fd) +String FormatFactory::getFormatFromFileName(String file_name) +{ + if (auto format = tryGetFormatFromFileName(file_name)) + return *format; + + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the format of the file {} by it's extension", file_name); +} + +std::optional FormatFactory::tryGetFormatFromFileDescriptor(int fd) { #ifdef OS_LINUX std::string proc_path = fmt::format("/proc/self/fd/{}", fd); char file_path[PATH_MAX] = {'\0'}; 
if (readlink(proc_path.c_str(), file_path, sizeof(file_path) - 1) != -1) - return getFormatFromFileName(file_path, false); - return ""; + return tryGetFormatFromFileName(file_path); + return std::nullopt; #elif defined(OS_DARWIN) char file_path[PATH_MAX] = {'\0'}; if (fcntl(fd, F_GETPATH, file_path) != -1) - return getFormatFromFileName(file_path, false); - return ""; + return tryGetFormatFromFileName(file_path); + return std::nullopt; #else (void)fd; - return ""; + return std::nullopt; #endif } +String FormatFactory::getFormatFromFileDescriptor(int fd) +{ + if (auto format = tryGetFormatFromFileDescriptor(fd)) + return *format; + + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the format of the data by the file descriptor {}", fd); +} + + void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine) { auto & target = dict[name].file_segmentation_engine_creator; @@ -767,7 +779,7 @@ void FormatFactory::registerAdditionalInfoForSchemaCacheGetter( target = std::move(additional_info_for_schema_cache_getter); } -String FormatFactory::getAdditionalInfoForSchemaCache(const String & name, ContextPtr context, const std::optional & format_settings_) +String FormatFactory::getAdditionalInfoForSchemaCache(const String & name, const ContextPtr & context, const std::optional & format_settings_) { const auto & additional_info_getter = getCreators(name).additional_info_for_schema_cache_getter; if (!additional_info_getter) @@ -812,7 +824,7 @@ bool FormatFactory::checkIfOutputFormatPrefersLargeBlocks(const String & name) c return target.prefers_large_blocks; } -bool FormatFactory::checkParallelizeOutputAfterReading(const String & name, ContextPtr context) const +bool FormatFactory::checkParallelizeOutputAfterReading(const String & name, const ContextPtr & context) const { if (name == "Parquet" && context->getSettingsRef().input_format_parquet_preserve_order) return false; @@ -827,6 +839,18 @@ void FormatFactory::checkFormatName(const String & name) const throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown format {}", name); } +std::vector FormatFactory::getAllInputFormats() const +{ + std::vector input_formats; + for (const auto & [format_name, creators] : dict) + { + if (creators.input_creator || creators.random_access_input_creator) + input_formats.push_back(format_name); + } + + return input_formats; +} + FormatFactory & FormatFactory::instance() { static FormatFactory ret; diff --git a/src/Formats/FormatFactory.h b/src/Formats/FormatFactory.h index 9670c690456..165a20f7c4d 100644 --- a/src/Formats/FormatFactory.h +++ b/src/Formats/FormatFactory.h @@ -48,10 +48,10 @@ using RowOutputFormatPtr = std::shared_ptr; template struct Memory; -FormatSettings getFormatSettings(ContextPtr context); +FormatSettings getFormatSettings(const ContextPtr & context); template -FormatSettings getFormatSettings(ContextPtr context, const T & settings); +FormatSettings getFormatSettings(const ContextPtr & context, const T & settings); /** Allows to create an IInputFormat or IOutputFormat by the name of the format. * Note: format and compression are independent things. 
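The hunk above replaces the boolean throw_if_not_found parameter with a paired try*/throwing API built on std::optional. A minimal sketch of that pattern, assuming illustrative stand-in names rather than the actual FormatFactory methods:

#include <optional>
#include <stdexcept>
#include <string>

/// Stand-in for tryGetFormatFromFileName: "no answer" becomes std::nullopt instead of an empty string.
std::optional<std::string> tryGetExtension(const std::string & file_name)
{
    auto pos = file_name.find_last_of('.');
    if (pos == std::string::npos)
        return std::nullopt;
    return file_name.substr(pos + 1);
}

/// Stand-in for the throwing counterpart: callers that cannot proceed without an answer get a clear error.
std::string getExtension(const std::string & file_name)
{
    if (auto ext = tryGetExtension(file_name))
        return *ext;
    throw std::runtime_error("Cannot determine the extension of " + file_name);
}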
@@ -161,7 +161,7 @@ public: const String & name, ReadBuffer & buf, const Block & sample, - ContextPtr context, + const ContextPtr & context, UInt64 max_block_size, const std::optional & format_settings = std::nullopt, std::optional max_parsing_threads = std::nullopt, @@ -178,30 +178,30 @@ public: const String & name, WriteBuffer & buf, const Block & sample, - ContextPtr context, + const ContextPtr & context, const std::optional & format_settings = std::nullopt) const; OutputFormatPtr getOutputFormat( const String & name, WriteBuffer & buf, const Block & sample, - ContextPtr context, + const ContextPtr & context, const std::optional & _format_settings = std::nullopt) const; String getContentType( const String & name, - ContextPtr context, + const ContextPtr & context, const std::optional & format_settings = std::nullopt) const; SchemaReaderPtr getSchemaReader( const String & name, ReadBuffer & buf, - ContextPtr & context, + const ContextPtr & context, const std::optional & format_settings = std::nullopt) const; ExternalSchemaReaderPtr getExternalSchemaReader( const String & name, - ContextPtr & context, + const ContextPtr & context, const std::optional & format_settings = std::nullopt) const; void registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine); @@ -216,7 +216,7 @@ public: /// registerAppendSupportChecker with append_support_checker that always returns true. void markFormatHasNoAppendSupport(const String & name); - bool checkIfFormatSupportAppend(const String & name, ContextPtr context, const std::optional & format_settings_ = std::nullopt); + bool checkIfFormatSupportAppend(const String & name, const ContextPtr & context, const std::optional & format_settings_ = std::nullopt); /// Register format by its name. void registerInputFormat(const String & name, InputCreator input_creator); @@ -225,8 +225,10 @@ public: /// Register file extension for format void registerFileExtension(const String & extension, const String & format_name); - String getFormatFromFileName(String file_name, bool throw_if_not_found = false); + String getFormatFromFileName(String file_name); + std::optional tryGetFormatFromFileName(String file_name); String getFormatFromFileDescriptor(int fd); + std::optional tryGetFormatFromFileDescriptor(int fd); /// Register schema readers for format its name. 
void registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator); @@ -244,16 +246,18 @@ public: bool checkIfFormatHasAnySchemaReader(const String & name) const; bool checkIfOutputFormatPrefersLargeBlocks(const String & name) const; - bool checkParallelizeOutputAfterReading(const String & name, ContextPtr context) const; + bool checkParallelizeOutputAfterReading(const String & name, const ContextPtr & context) const; void registerAdditionalInfoForSchemaCacheGetter(const String & name, AdditionalInfoForSchemaCacheGetter additional_info_for_schema_cache_getter); - String getAdditionalInfoForSchemaCache(const String & name, ContextPtr context, const std::optional & format_settings_ = std::nullopt); + String getAdditionalInfoForSchemaCache(const String & name, const ContextPtr & context, const std::optional & format_settings_ = std::nullopt); const FormatsDictionary & getAllFormats() const { return dict; } + std::vector getAllInputFormats() const; + bool isInputFormat(const String & name) const; bool isOutputFormat(const String & name) const; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index bdd2dda5287..aa37216d381 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -46,6 +46,7 @@ struct FormatSettings bool try_infer_integers = false; bool try_infer_dates = false; bool try_infer_datetimes = false; + bool try_infer_exponent_floats = false; enum class DateTimeInputFormat { @@ -361,6 +362,7 @@ struct FormatSettings bool deduce_templates_of_expressions = true; bool accurate_types_of_literals = true; bool allow_data_after_semicolon = false; + bool escape_quote_with_quote = false; } values; enum class ORCCompression diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index 43931be3449..5badf4301bf 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB { @@ -14,7 +15,9 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern const int ONLY_NULLS_WHILE_READING_SCHEMA; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int CANNOT_DETECT_FORMAT; extern const int TYPE_MISMATCH; + extern const int LOGICAL_ERROR; } static std::optional getOrderedColumnsList(const NamesAndTypesList & columns_list, const Names & columns_order_hint) @@ -43,50 +46,87 @@ bool isRetryableSchemaInferenceError(int code) return code == ErrorCodes::EMPTY_DATA_PASSED || code == ErrorCodes::ONLY_NULLS_WHILE_READING_SCHEMA; } -ColumnsDescription readSchemaFromFormat( - const String & format_name, +/// Order of formats to try in automatic format detection. +/// If we can successfully detect some format, we won't try next ones. +static const std::vector & getFormatsOrderForDetection() +{ + static const std::vector formats_order = + { + "Parquet", + "ORC", + "Arrow", + "ArrowStream", + "Avro", + "AvroConfluent", + "Npy", + "Native", + "BSONEachRow", + "JSONCompact", + "Values", + "TSKV", + "JSONObjectEachRow", + "JSONColumns", + "JSONCompactColumns", + "JSONCompact", + "JSON", + }; + + return formats_order; +} + +/// The set of similar formats to try in automatic format detection. +/// We will try all formats from this set and then choose the best one +/// according to inferred schema. 
+static const std::vector & getSimilarFormatsSetForDetection() +{ + static const std::vector formats_order = + { + "TSV", + "CSV", + }; + + return formats_order; +} + +std::pair readSchemaFromFormatImpl( + std::optional format_name, const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, - bool retry, - ContextPtr & context, - std::unique_ptr & buf) + const ContextPtr & context) try { NamesAndTypesList names_and_types; SchemaInferenceMode mode = context->getSettingsRef().schema_inference_mode; - if (mode == SchemaInferenceMode::UNION && !FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name, context, format_settings)) + if (format_name && mode == SchemaInferenceMode::UNION && !FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(*format_name, context, format_settings)) { String additional_message; /// Better exception message for WithNames(AndTypes) formats. - if (format_name.ends_with("WithNames") || format_name.ends_with("WithNamesAndTypes")) + if (format_name->ends_with("WithNames") || format_name->ends_with("WithNamesAndTypes")) additional_message = " (formats -WithNames(AndTypes) support reading subset of columns only when setting input_format_with_names_use_header is enabled)"; - throw Exception(ErrorCodes::BAD_ARGUMENTS, "UNION schema inference mode is not supported for format {}, because it doesn't support reading subset of columns{}", format_name, additional_message); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "UNION schema inference mode is not supported for format {}, because it doesn't support reading subset of columns{}", *format_name, additional_message); } - if (FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format_name)) + if (format_name && FormatFactory::instance().checkIfFormatHasExternalSchemaReader(*format_name)) { - auto external_schema_reader = FormatFactory::instance().getExternalSchemaReader(format_name, context, format_settings); + auto external_schema_reader = FormatFactory::instance().getExternalSchemaReader(*format_name, context, format_settings); try { - names_and_types = external_schema_reader->readSchema(); + return {ColumnsDescription(external_schema_reader->readSchema()), *format_name}; } catch (Exception & e) { e.addMessage( - fmt::format("Cannot extract table structure from {} format file. You can specify the structure manually", format_name)); + fmt::format("The table structure cannot be extracted from a {} format file. You can specify the structure manually", *format_name)); throw; } } - else if (FormatFactory::instance().checkIfFormatHasSchemaReader(format_name)) - { - if (mode == SchemaInferenceMode::UNION) - retry = false; + if (!format_name || FormatFactory::instance().checkIfFormatHasSchemaReader(*format_name)) + { + IReadBufferIterator::Data iterator_data; std::vector> schemas_for_union_mode; - std::optional cached_columns; std::string exception_messages; - SchemaReaderPtr schema_reader; size_t max_rows_to_read = format_settings ? format_settings->max_rows_to_read_for_schema_inference : context->getSettingsRef().input_format_max_rows_to_read_for_schema_inference; size_t max_bytes_to_read = format_settings ? format_settings->max_bytes_to_read_for_schema_inference @@ -94,45 +134,71 @@ try size_t iterations = 0; while (true) { + /// When we finish working with current buffer we should put it back to iterator. 
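SCOPE_EXIT on the next added line is ClickHouse's scope-guard macro: the read buffer is handed back to the iterator on every exit path of the loop body, including exceptions. A hand-rolled sketch of the RAII idiom behind it, assuming only standard C++:

#include <utility>

template <typename F>
struct ScopeGuard
{
    F func;
    ~ScopeGuard() { func(); }   /// runs on scope exit, whether by return, break, continue or exception
};

template <typename F>
ScopeGuard<F> makeScopeGuard(F && func)
{
    return ScopeGuard<F>{std::forward<F>(func)};
}

/// Hypothetical usage mirroring the added line below:
/// auto guard = makeScopeGuard([&] { read_buffer_iterator.setPreviousReadBuffer(std::move(iterator_data.buf)); });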
+ SCOPE_EXIT(if (iterator_data.buf) read_buffer_iterator.setPreviousReadBuffer(std::move(iterator_data.buf))); bool is_eof = false; try { - read_buffer_iterator.setPreviousReadBuffer(std::move(buf)); - std::tie(buf, cached_columns) = read_buffer_iterator.next(); - if (cached_columns) + iterator_data = read_buffer_iterator.next(); + + /// Read buffer iterator can determine the data format if it's unknown. + /// For example by scanning schema cache or by finding new file with format extension. + if (!format_name && iterator_data.format_name) { + format_name = *iterator_data.format_name; + read_buffer_iterator.setFormatName(*iterator_data.format_name); + } + + if (iterator_data.cached_columns) + { + /// If we have schema in cache, we must also know the format. + if (!format_name) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Schema from cache was returned, but format name is unknown"); + if (mode == SchemaInferenceMode::DEFAULT) - return *cached_columns; - schemas_for_union_mode.emplace_back(cached_columns->getAll(), read_buffer_iterator.getLastFileName()); + { + read_buffer_iterator.setResultingSchema(*iterator_data.cached_columns); + return {*iterator_data.cached_columns, *format_name}; + } + + schemas_for_union_mode.emplace_back(iterator_data.cached_columns->getAll(), read_buffer_iterator.getLastFileName()); continue; } - if (!buf) + if (!iterator_data.buf) break; /// We just want to check for eof, but eof() can be pretty expensive. /// So we use getFileSize() when available, which has better worst case. /// (For remote files, typically eof() would read 1 MB from S3, which may be much /// more than what the schema reader and even data reader will read). - auto size = tryGetFileSizeFromReadBuffer(*buf); + auto size = tryGetFileSizeFromReadBuffer(*iterator_data.buf); if (size.has_value()) is_eof = *size == 0; else - is_eof = buf->eof(); + is_eof = iterator_data.buf->eof(); } catch (Exception & e) { - e.addMessage( - fmt::format("Cannot extract table structure from {} format file. You can specify the structure manually", format_name)); + if (format_name) + e.addMessage(fmt::format("The table structure cannot be extracted from a {} format file. You can specify the structure manually", *format_name)); + else + e.addMessage("The data format cannot be detected by the contents of the files. You can specify the format manually"); throw; } catch (...) 
{ auto exception_message = getCurrentExceptionMessage(false); + if (format_name) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file:\n{}.\nYou can specify the structure manually", + *format_name, + exception_message); + throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file:\n{}\nYou can specify the structure manually", - format_name, + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files:\n{}.\nYou can specify the format manually", exception_message); } @@ -140,91 +206,224 @@ try if (is_eof) { - auto exception_message = fmt::format("Cannot extract table structure from {} format file, file is empty", format_name); + String exception_message; + if (format_name) + exception_message = fmt::format("The table structure cannot be extracted from a {} format file: the file is empty", *format_name); + else + exception_message = fmt::format("The data format cannot be detected by the contents of the files: the file is empty"); - if (!retry) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "{}. You can specify the structure manually", exception_message); + if (mode == SchemaInferenceMode::UNION) + { + if (!format_name) + throw Exception(ErrorCodes::CANNOT_DETECT_FORMAT, "The data format cannot be detected by the contents of the files: the file is empty. You can specify the format manually"); - exception_messages += "\n" + exception_message; + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "{}. You can specify the structure manually", exception_message); + } + + if (!exception_messages.empty()) + exception_messages += "\n"; + exception_messages += exception_message; continue; } - try - { - schema_reader = FormatFactory::instance().getSchemaReader(format_name, *buf, context, format_settings); - schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read); - names_and_types = schema_reader->readSchema(); - auto num_rows = schema_reader->readNumberOrRows(); - if (num_rows) - read_buffer_iterator.setNumRowsToLastFile(*num_rows); + std::unique_ptr peekable_buf; /// Can be used in format detection. Should be destroyed after schema reader. - /// In default mode, we finish when schema is inferred successfully from any file. - if (mode == SchemaInferenceMode::DEFAULT) - break; - - if (!names_and_types.empty()) - read_buffer_iterator.setSchemaToLastFile(ColumnsDescription(names_and_types)); - schemas_for_union_mode.emplace_back(names_and_types, read_buffer_iterator.getLastFileName()); - } - catch (...) + if (format_name) { - auto exception_message = getCurrentExceptionMessage(false); - if (schema_reader && mode == SchemaInferenceMode::DEFAULT) + SchemaReaderPtr schema_reader; + + try { - size_t rows_read = schema_reader->getNumRowsRead(); - assert(rows_read <= max_rows_to_read); - max_rows_to_read -= schema_reader->getNumRowsRead(); - size_t bytes_read = buf->count(); - /// We could exceed max_bytes_to_read a bit to complete row parsing. 
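The bookkeeping being moved in this hunk charges each inspected file against the remaining schema-inference budget. Both counters are unsigned, and a row can legitimately overshoot the byte limit slightly, hence the clamp before subtraction. A condensed sketch of that arithmetic, with illustrative parameter names:

#include <algorithm>
#include <cassert>
#include <cstddef>

void consumeInferenceBudget(size_t & max_rows_to_read, size_t & max_bytes_to_read,
                            size_t rows_read, size_t bytes_read)
{
    assert(rows_read <= max_rows_to_read);                         /// guaranteed by the schema reader
    max_rows_to_read -= rows_read;
    max_bytes_to_read -= std::min(bytes_read, max_bytes_to_read);  /// clamp to avoid unsigned wrap-around
}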
- max_bytes_to_read -= std::min(bytes_read, max_bytes_to_read); - if (rows_read != 0 && (max_rows_to_read == 0 || max_bytes_to_read == 0)) - { - exception_message += "\nTo increase the maximum number of rows/bytes to read for structure determination, use setting " - "input_format_max_rows_to_read_for_schema_inference/input_format_max_bytes_to_read_for_schema_inference"; + schema_reader = FormatFactory::instance().getSchemaReader(*format_name, *iterator_data.buf, context, format_settings); + schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read); + names_and_types = schema_reader->readSchema(); + auto num_rows = schema_reader->readNumberOrRows(); + if (num_rows) + read_buffer_iterator.setNumRowsToLastFile(*num_rows); - if (iterations > 1) + /// In default mode, we finish when schema is inferred successfully from any file. + if (mode == SchemaInferenceMode::DEFAULT) + break; + + if (!names_and_types.empty()) + read_buffer_iterator.setSchemaToLastFile(ColumnsDescription(names_and_types)); + schemas_for_union_mode.emplace_back(names_and_types, read_buffer_iterator.getLastFileName()); + } + catch (...) + { + auto exception_message = getCurrentExceptionMessage(false); + if (schema_reader && mode == SchemaInferenceMode::DEFAULT) + { + size_t rows_read = schema_reader->getNumRowsRead(); + assert(rows_read <= max_rows_to_read); + max_rows_to_read -= schema_reader->getNumRowsRead(); + size_t bytes_read = iterator_data.buf->count(); + /// We could exceed max_bytes_to_read a bit to complete row parsing. + max_bytes_to_read -= std::min(bytes_read, max_bytes_to_read); + if (rows_read != 0 && (max_rows_to_read == 0 || max_bytes_to_read == 0)) { - exception_messages += "\n" + exception_message; + exception_message + += "\nTo increase the maximum number of rows/bytes to read for structure determination, use setting " + "input_format_max_rows_to_read_for_schema_inference/input_format_max_bytes_to_read_for_schema_inference"; + if (!exception_messages.empty()) + exception_messages += "\n"; + exception_messages += exception_message; break; } - retry = false; } - } - if (!retry || !isRetryableSchemaInferenceError(getCurrentExceptionCode())) - { - try - { - throw; - } - catch (Exception & e) - { - e.addMessage(fmt::format( - "Cannot extract table structure from {} format file. You can specify the structure manually", format_name)); - throw; - } - catch (...) + if (mode == SchemaInferenceMode::UNION || !isRetryableSchemaInferenceError(getCurrentExceptionCode())) { throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file. " - "Error: {}. You can specify the structure manually", - format_name, + "The table structure cannot be extracted from a {} format file. " + "Error:\n{}.\nYou can specify the structure manually", + *format_name, exception_message); } + + if (!exception_messages.empty()) + exception_messages += "\n"; + exception_messages += exception_message; + } + } + else + { + /// If the format is unknown we try some formats in order and try to apply their schema readers. + /// If we can successfully infer the schema in some format, most likely we can use this format to read this data. + + /// If read_buffer_iterator supports recreation of last buffer, we will recreate it for + /// each format. Otherwise we will use PeekableReadBuffer and will rollback to the + /// beginning of the file before each format. 
Using PeekableReadBuffer can lead + /// to high memory usage as it will save all the read data from the beginning of the file, + /// especially it will be noticeable for formats like Parquet/ORC/Arrow that do seeks to the + /// end of file. + bool support_buf_recreation = read_buffer_iterator.supportsLastReadBufferRecreation(); + if (!support_buf_recreation) + { + peekable_buf = std::make_unique(*iterator_data.buf); + peekable_buf->setCheckpoint(); + } + + /// First, try some formats in order. If we successfully inferred the schema for any format, + /// we will use this format. + for (const auto & format_to_detect : getFormatsOrderForDetection()) + { + try + { + SchemaReaderPtr schema_reader = FormatFactory::instance().getSchemaReader(format_to_detect, support_buf_recreation ? *iterator_data.buf : *peekable_buf, context, format_settings); + schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read); + names_and_types = schema_reader->readSchema(); + if (names_and_types.empty()) + continue; + + /// We successfully inferred schema from this file using current format. + format_name = format_to_detect; + read_buffer_iterator.setFormatName(format_to_detect); + + auto num_rows = schema_reader->readNumberOrRows(); + if (num_rows) + read_buffer_iterator.setNumRowsToLastFile(*num_rows); + + break; + } + catch (...) + { + /// We failed to infer the schema for this format. + /// Recreate read buffer or rollback to the beginning of the data + /// before trying next format. + if (support_buf_recreation) + { + read_buffer_iterator.setPreviousReadBuffer(std::move(iterator_data.buf)); + iterator_data.buf = read_buffer_iterator.recreateLastReadBuffer(); + } + else + { + peekable_buf->rollbackToCheckpoint(); + } + } } - exception_messages += "\n" + exception_message; + /// If no format was detected from first set of formats, we try second set. + /// In this set formats are similar and it can happen that data matches some of them. + /// We try to infer schema for all of the formats from this set and then choose the best + /// one according to the inferred schema. + if (!format_name) + { + std::unordered_map format_to_schema; + const auto & formats_set_to_detect = getSimilarFormatsSetForDetection(); + for (size_t i = 0; i != formats_set_to_detect.size(); ++i) + { + try + { + SchemaReaderPtr schema_reader = FormatFactory::instance().getSchemaReader( + formats_set_to_detect[i], support_buf_recreation ? *iterator_data.buf : *peekable_buf, context, format_settings); + schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read); + auto tmp_names_and_types = schema_reader->readSchema(); + /// If schema was inferred successfully for this format, remember it and try next format. + if (!tmp_names_and_types.empty()) + format_to_schema[formats_set_to_detect[i]] = tmp_names_and_types; + } + catch (...) // NOLINT(bugprone-empty-catch) + { + /// Try next format. + } + + if (i != formats_set_to_detect.size() - 1) + { + if (support_buf_recreation) + { + read_buffer_iterator.setPreviousReadBuffer(std::move(iterator_data.buf)); + iterator_data.buf = read_buffer_iterator.recreateLastReadBuffer(); + } + else + { + peekable_buf->rollbackToCheckpoint(); + } + } + } + + /// We choose the format with larger number of columns in inferred schema. 
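As an assumed illustration of this tie-break between similar formats: for an input line 1,2,3 the CSV schema reader infers three columns (c1 Int64, c2 Int64, c3 Int64), while the TSV reader sees no tab and infers a single String column holding the whole line, so CSV wins by column count.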
+ size_t max_number_of_columns = 0; + for (const auto & [format_to_detect, schema] : format_to_schema) + { + if (schema.size() > max_number_of_columns) + { + names_and_types = schema; + format_name = format_to_detect; + max_number_of_columns = schema.size(); + } + } + + if (format_name) + read_buffer_iterator.setFormatName(*format_name); + } + + if (mode == SchemaInferenceMode::UNION) + { + /// For UNION mode we need to know the schema of each file, + /// if we failed to detect the format, we failed to detect the schema of this file + /// in any format. It doesn't make sense to continue. + if (!format_name) + throw Exception(ErrorCodes::CANNOT_DETECT_FORMAT, "The data format cannot be detected by the contents of the files. You can specify the format manually"); + + read_buffer_iterator.setSchemaToLastFile(ColumnsDescription(names_and_types)); + schemas_for_union_mode.emplace_back(names_and_types, read_buffer_iterator.getLastFileName()); + } + + if (format_name && mode == SchemaInferenceMode::DEFAULT) + break; } } - /// If we got all schemas from cache, schema_reader can be uninitialized. - /// But we still need some stateless methods of ISchemaReader, - /// let's initialize it with empty buffer. + if (!format_name) + throw Exception(ErrorCodes::CANNOT_DETECT_FORMAT, "The data format cannot be detected by the contents of the files. You can specify the format manually"); + + /// We need some stateless methods of ISchemaReader, but during reading schema we + /// could not even create a schema reader (for example when we got schema from cache). + /// Let's create stateless schema reader from empty read buffer. EmptyReadBuffer empty; - if (!schema_reader) - schema_reader = FormatFactory::instance().getSchemaReader(format_name, empty, context, format_settings); + SchemaReaderPtr stateless_schema_reader = FormatFactory::instance().getSchemaReader(*format_name, empty, context, format_settings); if (mode == SchemaInferenceMode::UNION) { @@ -251,7 +450,7 @@ try /// If types are not the same, try to transform them according /// to the format to find common type. auto new_type_copy = type; - schema_reader->transformTypesFromDifferentFilesIfNeeded(it->second, new_type_copy); + stateless_schema_reader->transformTypesFromDifferentFilesIfNeeded(it->second, new_type_copy); /// If types are not the same after transform, we cannot do anything, throw an exception. if (!it->second->equals(*new_type_copy)) @@ -273,11 +472,23 @@ try } if (names_and_types.empty()) + { + if (iterations <= 1) + { + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file. " + "Error:\n{}.\nYou can specify the structure manually", + *format_name, + exception_messages); + } + throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "All attempts to extract table structure from files failed. " - "Errors:{}\nYou can specify the structure manually", + "Errors:\n{}\nYou can specify the structure manually", exception_messages); + } /// If we have "INSERT SELECT" query then try to order /// columns as they are ordered in table schema for formats @@ -285,7 +496,7 @@ try /// It will allow to execute simple data loading with query /// "INSERT INTO table SELECT * FROM ..." 
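The block that follows reorders the inferred columns to match the insertion table, so that INSERT INTO table SELECT * FROM file(...) lines up positionally. A self-contained sketch of that reordering idea, using stand-in types (the real helper is getOrderedColumnsList near the top of this file):

#include <map>
#include <optional>
#include <string>
#include <utility>
#include <vector>

using Column = std::pair<std::string, std::string>;  /// (name, type); illustrative stand-in for NameAndTypePair

std::optional<std::vector<Column>> orderByTableHint(
    const std::vector<Column> & inferred, const std::vector<std::string> & table_column_order)
{
    std::map<std::string, std::string> by_name(inferred.begin(), inferred.end());

    /// Keep the inferred order (return nullopt) on duplicates or when the name sets cannot match.
    if (by_name.size() != inferred.size() || inferred.size() != table_column_order.size())
        return std::nullopt;

    std::vector<Column> ordered;
    for (const auto & name : table_column_order)
    {
        auto it = by_name.find(name);
        if (it == by_name.end())
            return std::nullopt;
        ordered.emplace_back(it->first, it->second);
    }
    return ordered;
}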
const auto & insertion_table = context->getInsertionTable(); - if (schema_reader && !schema_reader->hasStrictOrderOfColumns() && !insertion_table.empty()) + if (!stateless_schema_reader->hasStrictOrderOfColumns() && !insertion_table.empty()) { auto storage = DatabaseCatalog::instance().getTable(insertion_table, context); auto metadata = storage->getInMemoryMetadataPtr(); @@ -294,22 +505,22 @@ try if (ordered_list) names_and_types = *ordered_list; } + + /// Some formats like CSVWithNames can contain empty column names. We don't support empty column names and further processing can fail with an exception. Let's just remove columns with empty names from the structure. + names_and_types.erase( + std::remove_if(names_and_types.begin(), names_and_types.end(), [](const NameAndTypePair & pair) { return pair.name.empty(); }), + names_and_types.end()); + + auto columns = ColumnsDescription(names_and_types); + if (mode == SchemaInferenceMode::DEFAULT) + read_buffer_iterator.setResultingSchema(columns); + return {columns, *format_name}; } - else - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "{} file format doesn't support schema inference. You must specify the structure manually", - format_name); - /// Some formats like CSVWithNames can contain empty column names. We don't support empty column names and further processing can fail with an exception. Let's just remove columns with empty names from the structure. - names_and_types.erase( - std::remove_if(names_and_types.begin(), names_and_types.end(), [](const NameAndTypePair & pair) { return pair.name.empty(); }), - names_and_types.end()); - - auto columns = ColumnsDescription(names_and_types); - if (mode == SchemaInferenceMode::DEFAULT) - read_buffer_iterator.setResultingSchema(columns); - return columns; + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "{} file format doesn't support schema inference. You must specify the structure manually", + *format_name); } catch (Exception & e) { @@ -319,16 +530,21 @@ catch (Exception & e) throw; } - ColumnsDescription readSchemaFromFormat( const String & format_name, const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, - bool retry, - ContextPtr & context) + const ContextPtr & context) { - std::unique_ptr buf_out; - return readSchemaFromFormat(format_name, format_settings, read_buffer_iterator, retry, context, buf_out); + return readSchemaFromFormatImpl(format_name, format_settings, read_buffer_iterator, context).first; +} + +std::pair detectFormatAndReadSchema( + const std::optional & format_settings, + IReadBufferIterator & read_buffer_iterator, + const ContextPtr & context) +{ + return readSchemaFromFormatImpl(std::nullopt, format_settings, read_buffer_iterator, context); } SchemaCache::Key getKeyForSchemaCache( diff --git a/src/Formats/ReadSchemaUtils.h b/src/Formats/ReadSchemaUtils.h index 6aa8f3f9c4c..bb5e068f696 100644 --- a/src/Formats/ReadSchemaUtils.h +++ b/src/Formats/ReadSchemaUtils.h @@ -7,29 +7,68 @@ namespace DB { +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + struct IReadBufferIterator { virtual ~IReadBufferIterator() = default; - virtual void setPreviousReadBuffer(std::unique_ptr /* buffer */) {} - /// Return read buffer of the next file or cached schema. /// In DEFAULT schema inference mode cached schema can be from any file. /// In UNION mode cached schema can be only from current file. 
/// When there is no files to process, return pair (nullptr, nullopt) - virtual std::pair, std::optional> next() = 0; + struct Data + { + /// Read buffer of the next file. Can be nullptr if there are no more files + /// or when schema was found in cache. + std::unique_ptr buf; + + /// Schema from cache. + /// In DEFAULT schema inference mode cached schema can be from any file. + /// In UNION mode cached schema can be only from current file. + std::optional cached_columns; + + /// Format of the file if known. + std::optional format_name; + }; + + virtual Data next() = 0; + + /// Set read buffer returned in previous iteration. + virtual void setPreviousReadBuffer(std::unique_ptr /* buffer */) {} + + /// Set number of rows to last file extracted during schema inference. + /// Used for caching number of rows from files metadata during schema inference. virtual void setNumRowsToLastFile(size_t /*num_rows*/) {} /// Set schema inferred from last file. Used for UNION mode to cache schema /// per file. virtual void setSchemaToLastFile(const ColumnsDescription & /*columns*/) {} + /// Set resulting inferred schema. Used for DEFAULT mode to cache schema /// for all files. virtual void setResultingSchema(const ColumnsDescription & /*columns*/) {} + /// Set auto detected format name. + virtual void setFormatName(const String & /*format_name*/) {} + /// Get last processed file name for better exception messages. virtual String getLastFileName() const { return ""; } + + /// Return true if method recreateLastReadBuffer is implemented. + virtual bool supportsLastReadBufferRecreation() const { return false; } + + /// Recreate last read buffer to read data from the same file again. + /// Used to detect format from the file content to avoid + /// copying data. + virtual std::unique_ptr recreateLastReadBuffer() + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method recreateLastReadBuffer is not implemented"); + } }; struct SingleReadBufferIterator : public IReadBufferIterator @@ -39,12 +78,22 @@ public: { } - std::pair, std::optional> next() override + Data next() override { if (done) - return {nullptr, {}}; + return {nullptr, {}, std::nullopt}; done = true; - return {std::move(buf), {}}; + return Data{std::move(buf), {}, std::nullopt}; + } + + void setPreviousReadBuffer(std::unique_ptr buf_) override + { + buf = std::move(buf_); + } + + std::unique_ptr releaseBuffer() + { + return std::move(buf); } private: @@ -73,17 +122,16 @@ ColumnsDescription readSchemaFromFormat( const String & format_name, const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, - bool retry, - ContextPtr & context); + const ContextPtr & context); -/// If ReadBuffer is created, it will be written to buf_out. -ColumnsDescription readSchemaFromFormat( - const String & format_name, +/// Try to detect the format of the data and it's schema. +/// It runs schema inference for some set of formats on the same file. +/// If schema reader of some format successfully inferred the schema from +/// some file, we consider that the data is in this format. 
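A hypothetical call site for the declaration that follows, contrasted with the entry point used when the format is already known (buffer construction and error handling omitted; the iterator is reused here only for brevity):

/// Format unknown: the schema and the detected format name come back together.
SingleReadBufferIterator iterator(std::move(read_buffer));
auto [columns, format_name] = detectFormatAndReadSchema(format_settings, iterator, context);

/// Alternative path when the format is known up front: only the schema is inferred.
ColumnsDescription csv_columns = readSchemaFromFormat("CSV", format_settings, iterator, context);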
+std::pair detectFormatAndReadSchema( const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, - bool retry, - ContextPtr & context, - std::unique_ptr & buf_out); + const ContextPtr & context); SchemaCache::Key getKeyForSchemaCache(const String & source, const String & format, const std::optional & format_settings, const ContextPtr & context); SchemaCache::Keys getKeysForSchemaCache(const Strings & sources, const String & format, const std::optional & format_settings, const ContextPtr & context); diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp index 2cfcff75edd..06b52e7a7a2 100644 --- a/src/Formats/SchemaInferenceUtils.cpp +++ b/src/Formats/SchemaInferenceUtils.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -865,6 +866,13 @@ namespace return std::make_shared(nested_types); } + bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings) + { + if (settings.try_infer_exponent_floats) + return tryReadFloatText(value, buf); + return tryReadFloatTextNoExponent(value, buf); + } + DataTypePtr tryInferNumber(ReadBuffer & buf, const FormatSettings & settings) { if (buf.eof()) @@ -903,7 +911,7 @@ namespace buf.position() = number_start; } - if (tryReadFloatText(tmp_float, buf)) + if (tryReadFloat(tmp_float, buf, settings)) { if (read_int && buf.position() == int_end) return std::make_shared(); @@ -937,7 +945,7 @@ namespace peekable_buf.rollbackToCheckpoint(true); } - if (tryReadFloatText(tmp_float, peekable_buf)) + if (tryReadFloat(tmp_float, peekable_buf, settings)) { /// Float parsing reads no fewer bytes than integer parsing, /// so position of the buffer is either the same, or further. @@ -949,7 +957,7 @@ namespace return std::make_shared(); } } - else if (tryReadFloatText(tmp_float, buf)) + else if (tryReadFloat(tmp_float, buf, settings)) { return std::make_shared(); } @@ -1390,7 +1398,7 @@ DataTypePtr tryInferNumberFromString(std::string_view field, const FormatSetting buf.position() = buf.buffer().begin(); Float64 tmp; - if (tryReadFloatText(tmp, buf) && buf.eof()) + if (tryReadFloat(tmp, buf, settings) && buf.eof()) return std::make_shared(); return nullptr; diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt index a06e898b7c5..ac3e3671ae0 100644 --- a/src/Functions/CMakeLists.txt +++ b/src/Functions/CMakeLists.txt @@ -9,6 +9,11 @@ extract_into_parent_list(clickhouse_functions_sources dbms_sources FunctionHelpers.cpp extractTimeZoneFromFunctionArguments.cpp FunctionsLogical.cpp + if.cpp + multiIf.cpp + multiMatchAny.cpp + checkHyperscanRegexp.cpp + array/has.cpp CastOverloadResolver.cpp ) extract_into_parent_list(clickhouse_functions_headers dbms_headers diff --git a/src/Functions/FunctionBinaryArithmetic.h b/src/Functions/FunctionBinaryArithmetic.h index 1b2519d1ec5..d253095ca01 100644 --- a/src/Functions/FunctionBinaryArithmetic.h +++ b/src/Functions/FunctionBinaryArithmetic.h @@ -147,11 +147,32 @@ private: /// it's not correct for Decimal public: static constexpr bool allow_decimal = IsOperation::allow_decimal; + using DecimalResultDataType = Switch< + Case, + Case && IsDataTypeDecimal && UseLeftDecimal, LeftDataType>, + Case && IsDataTypeDecimal, RightDataType>, + Case && IsIntegralOrExtended, LeftDataType>, + Case && IsIntegralOrExtended, RightDataType>, + + /// e.g Decimal +-*/ Float, least(Decimal, Float), greatest(Decimal, Float) = Float64 + Case && IsFloatingPoint, DataTypeFloat64>, + Case && IsFloatingPoint, DataTypeFloat64>, + + 
Case::bit_hamming_distance && IsIntegral && IsIntegral, DataTypeUInt8>, + Case::bit_hamming_distance && IsFixedString && IsFixedString, DataTypeUInt16>, + Case::bit_hamming_distance && IsString && IsString, DataTypeUInt64>, + + /// Decimal Real is not supported (traditional DBs convert Decimal Real to Real) + Case && !IsIntegralOrExtendedOrDecimal, InvalidType>, + Case && !IsIntegralOrExtendedOrDecimal, InvalidType>>; + /// Appropriate result type for binary operator on numeric types. "Date" can also mean /// DateTime, but if both operands are Dates, their type must be the same (e.g. Date - DateTime is invalid). using ResultDataType = Switch< + /// Result must be Integer + Case::div_int || IsOperation::div_int_or_zero, DataTypeFromFieldType>, /// Decimal cases - Case || IsDataTypeDecimal), InvalidType>, + Case || IsDataTypeDecimal, DecimalResultDataType>, Case< IsDataTypeDecimal && IsDataTypeDecimal && UseLeftDecimal, LeftDataType>, @@ -622,7 +643,11 @@ private: if constexpr (op_case == OpCase::RightConstant) { if ((*right_nullmap)[0]) + { + for (size_t i = 0; i < size; ++i) + c[i] = ResultType(); return; + } for (size_t i = 0; i < size; ++i) c[i] = apply_func(undec(a[i]), undec(b)); @@ -1665,7 +1690,9 @@ public: if constexpr (!std::is_same_v) { - if constexpr (IsDataTypeDecimal && IsDataTypeDecimal) + if constexpr (is_div_int || is_div_int_or_zero) + type_res = std::make_shared(); + else if constexpr (IsDataTypeDecimal && IsDataTypeDecimal) { if constexpr (is_division) { @@ -1685,13 +1712,19 @@ public: ResultDataType result_type = decimalResultType(left, right); type_res = std::make_shared(result_type.getPrecision(), result_type.getScale()); } - else if constexpr ((IsDataTypeDecimal && IsFloatingPoint) || - (IsDataTypeDecimal && IsFloatingPoint)) + else if constexpr (((IsDataTypeDecimal && IsFloatingPoint) || + (IsDataTypeDecimal && IsFloatingPoint))) + { type_res = std::make_shared(); + } else if constexpr (IsDataTypeDecimal) + { type_res = std::make_shared(left.getPrecision(), left.getScale()); + } else if constexpr (IsDataTypeDecimal) + { type_res = std::make_shared(right.getPrecision(), right.getScale()); + } else if constexpr (std::is_same_v) { // Special case for DateTime: binary OPS should reuse timezone @@ -2000,6 +2033,7 @@ ColumnPtr executeStringInteger(const ColumnsWithTypeAndName & arguments, const A using LeftDataType = std::decay_t; using RightDataType = std::decay_t; using ResultDataType = typename BinaryOperationTraits::ResultDataType; + using DecimalResultType = typename BinaryOperationTraits::DecimalResultDataType; if constexpr (std::is_same_v) return nullptr; @@ -2051,6 +2085,35 @@ ColumnPtr executeStringInteger(const ColumnsWithTypeAndName & arguments, const A col_left_size, right_nullmap); } + /// Here we check if we have `intDiv` or `intDivOrZero` and at least one of the arguments is decimal, because in this case originally we had result as decimal, so we need to convert result into integer after calculations + else if constexpr (!decimal_with_float && (is_div_int || is_div_int_or_zero) && (IsDataTypeDecimal || IsDataTypeDecimal)) + { + + if constexpr (!std::is_same_v) + { + DataTypePtr type_res; + if constexpr (IsDataTypeDecimal && IsDataTypeDecimal) + { + DecimalResultType result_type = decimalResultType(left, right); + type_res = std::make_shared(result_type.getPrecision(), result_type.getScale()); + } + else if constexpr (IsDataTypeDecimal) + type_res = std::make_shared(left.getPrecision(), left.getScale()); + else + type_res = 
std::make_shared(right.getPrecision(), right.getScale()); + + auto res = executeNumericWithDecimal( + left, right, + col_left_const, col_right_const, + col_left, col_right, + col_left_size, + right_nullmap); + + auto col = ColumnWithTypeAndName(res, type_res, name); + return castColumn(col, std::make_shared()); + } + return nullptr; + } else // can't avoid else and another indentation level, otherwise the compiler would try to instantiate // ColVecResult for Decimals which would lead to a compile error. { diff --git a/src/Functions/FunctionsJSON.h b/src/Functions/FunctionsJSON.h index 31a99475b63..2539fa1aeb4 100644 --- a/src/Functions/FunctionsJSON.h +++ b/src/Functions/FunctionsJSON.h @@ -5,7 +5,7 @@ #include -#include +#include #include #include diff --git a/src/Functions/FunctionsLogical.cpp b/src/Functions/FunctionsLogical.cpp index d01fdc99076..d0795941e1f 100644 --- a/src/Functions/FunctionsLogical.cpp +++ b/src/Functions/FunctionsLogical.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -776,4 +777,21 @@ ColumnPtr FunctionUnaryLogical::executeImpl(const ColumnsWithTypeAnd return res; } +FunctionOverloadResolverPtr createInternalFunctionOrOverloadResolver() +{ + return std::make_unique(std::make_shared()); +} +FunctionOverloadResolverPtr createInternalFunctionAndOverloadResolver() +{ + return std::make_unique(std::make_shared()); +} +FunctionOverloadResolverPtr createInternalFunctionXorOverloadResolver() +{ + return std::make_unique(std::make_shared()); +} +FunctionOverloadResolverPtr createInternalFunctionNotOverloadResolver() +{ + return std::make_unique(std::make_shared()); +} + } diff --git a/src/Functions/IsOperation.h b/src/Functions/IsOperation.h index 8ea53c865ce..b2c7a27d375 100644 --- a/src/Functions/IsOperation.h +++ b/src/Functions/IsOperation.h @@ -61,7 +61,7 @@ struct IsOperation static constexpr bool bit_hamming_distance = IsSameOperation::value; static constexpr bool division = div_floating || div_int || div_int_or_zero || modulo; - + // NOTE: allow_decimal should not fully contain `division` because of divInt static constexpr bool allow_decimal = plus || minus || multiply || division || least || greatest; }; diff --git a/src/Functions/addressToLine.cpp b/src/Functions/addressToLine.cpp index 771c85cabf6..bb5edf2a07a 100644 --- a/src/Functions/addressToLine.cpp +++ b/src/Functions/addressToLine.cpp @@ -17,7 +17,7 @@ namespace DB namespace { -class FunctionAddressToLine: public FunctionAddressToLineBase +class FunctionAddressToLine : public FunctionAddressToLineBase { public: static constexpr auto name = "addressToLine"; diff --git a/src/Functions/array/arrayFold.cpp b/src/Functions/array/arrayFold.cpp index 44fe95624a6..63c14f475fc 100644 --- a/src/Functions/array/arrayFold.cpp +++ b/src/Functions/array/arrayFold.cpp @@ -32,6 +32,12 @@ public: size_t getNumberOfArguments() const override { return 0; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + /// Avoid the default adaptors since they modify the inputs and that makes knowing the lambda argument types + /// (getLambdaArgumentTypes) more complex, as it requires knowing what the adaptors will do + /// It's much simpler to avoid the adapters + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + void getLambdaArgumentTypes(DataTypes & arguments) const override { if (arguments.size() < 3) diff --git 
a/src/Functions/array/arrayReduce.cpp b/src/Functions/array/arrayReduce.cpp index 5a6a99ef785..d47d1ae98cc 100644 --- a/src/Functions/array/arrayReduce.cpp +++ b/src/Functions/array/arrayReduce.cpp @@ -1,14 +1,15 @@ -#include -#include -#include -#include -#include -#include -#include #include #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -48,6 +49,11 @@ public: bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } bool useDefaultImplementationForConstants() const override { return true; } + /// As we parse the function name and deal with arrays we don't want to default NULL handler, which will hide + /// nullability from us (which also means hidden from the aggregate functions) + bool useDefaultImplementationForNulls() const override { return false; } + /// Same for low cardinality. We want to return exactly what the aggregate function returns, no meddling + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0}; } DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override; @@ -115,7 +121,8 @@ ColumnPtr FunctionArrayReduce::executeImpl(const ColumnsWithTypeAndName & argume const IAggregateFunction & agg_func = *aggregate_function; std::unique_ptr arena = std::make_unique(); - /// Aggregate functions do not support constant columns. Therefore, we materialize them. + /// Aggregate functions do not support constant or lowcardinality columns. Therefore, we materialize them and + /// keep a reference so they are alive until we finish using their nested columns (array data/offset) std::vector materialized_columns; const size_t num_arguments_columns = arguments.size() - 1; @@ -126,6 +133,12 @@ ColumnPtr FunctionArrayReduce::executeImpl(const ColumnsWithTypeAndName & argume for (size_t i = 0; i < num_arguments_columns; ++i) { const IColumn * col = arguments[i + 1].column.get(); + auto col_no_lowcardinality = recursiveRemoveLowCardinality(arguments[i + 1].column); + if (col_no_lowcardinality != arguments[i + 1].column) + { + materialized_columns.emplace_back(col_no_lowcardinality); + col = col_no_lowcardinality.get(); + } const ColumnArray::Offsets * offsets_i = nullptr; if (const ColumnArray * arr = checkAndGetColumn(col)) diff --git a/src/Functions/array/has.cpp b/src/Functions/array/has.cpp index f08a4f29d2d..a17dcdcfbf9 100644 --- a/src/Functions/array/has.cpp +++ b/src/Functions/array/has.cpp @@ -9,4 +9,10 @@ struct NameHas { static constexpr auto name = "has"; }; using FunctionHas = FunctionArrayIndex; REGISTER_FUNCTION(Has) { factory.registerFunction(); } + +FunctionOverloadResolverPtr createInternalFunctionHasOverloadResolver() +{ + return std::make_unique(std::make_shared()); +} + } diff --git a/src/Functions/array/has.h b/src/Functions/array/has.h new file mode 100644 index 00000000000..226662d4051 --- /dev/null +++ b/src/Functions/array/has.h @@ -0,0 +1,12 @@ +#pragma once +#include + +namespace DB +{ + +class IFunctionOverloadResolver; +using FunctionOverloadResolverPtr = std::shared_ptr; + +FunctionOverloadResolverPtr createInternalFunctionHasOverloadResolver(); + +} diff --git a/src/Functions/countMatches.h b/src/Functions/countMatches.h index 5e02915de56..e9880e6e93f 100644 --- a/src/Functions/countMatches.h +++ b/src/Functions/countMatches.h @@ -3,6 +3,7 @@ #include #include #include 
+#include #include #include #include @@ -15,9 +16,7 @@ namespace DB namespace ErrorCodes { - extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ILLEGAL_COLUMN; - extern const int LOGICAL_ERROR; } using Pos = const char *; @@ -35,45 +34,46 @@ public: DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { - if (!isStringOrFixedString(arguments[1].type)) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of second argument (pattern) of function {}. Must be String/FixedString.", - arguments[1].type->getName(), getName()); - if (!isStringOrFixedString(arguments[0].type)) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of first argument (haystack) of function {}. Must be String/FixedString.", - arguments[0].type->getName(), getName()); - const auto * column = arguments[1].column.get(); - if (!column || !checkAndGetColumnConstStringOrFixedString(column)) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, - "The second argument of function {} should be a constant string with the pattern", - getName()); + FunctionArgumentDescriptors args{ + {"haystack", &isStringOrFixedString, nullptr, "String or FixedString"}, + {"pattern", &isString, isColumnConst, "constant String"} + }; + validateFunctionArgumentTypes(*this, arguments, args); return std::make_shared(); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override { - const ColumnConst * column_pattern = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); - const OptimizedRegularExpression re = Regexps::createRegexp(column_pattern->getValue()); + const IColumn * col_pattern = arguments[1].column.get(); + const ColumnConst * col_pattern_const = checkAndGetColumnConst(col_pattern); + if (col_pattern_const == nullptr) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Pattern argument is not const"); + + const OptimizedRegularExpression re = Regexps::createRegexp(col_pattern_const->getValue()); + + const IColumn * col_haystack = arguments[0].column.get(); OptimizedRegularExpression::MatchVec matches; - const IColumn * column_haystack = arguments[0].column.get(); - - if (const ColumnString * col_str = checkAndGetColumn(column_haystack)) + if (const ColumnConst * col_haystack_const = checkAndGetColumnConstStringOrFixedString(col_haystack)) { - auto result_column = ColumnUInt64::create(); + std::string_view str = col_haystack_const->getDataColumn().getDataAt(0).toView(); + uint64_t matches_count = countMatches(str, re, matches); + return result_type->createColumnConst(input_rows_count, matches_count); + } + else if (const ColumnString * col_haystack_string = checkAndGetColumn(col_haystack)) + { + auto col_res = ColumnUInt64::create(); - const ColumnString::Chars & src_chars = col_str->getChars(); - const ColumnString::Offsets & src_offsets = col_str->getOffsets(); + const ColumnString::Chars & src_chars = col_haystack_string->getChars(); + const ColumnString::Offsets & src_offsets = col_haystack_string->getOffsets(); - ColumnUInt64::Container & vec_res = result_column->getData(); + ColumnUInt64::Container & vec_res = col_res->getData(); vec_res.resize(input_rows_count); - size_t size = src_offsets.size(); ColumnString::Offset current_src_offset = 0; - for (size_t i = 0; i < size; ++i) + for (size_t i = 0; i < input_rows_count; ++i) { Pos pos = reinterpret_cast(&src_chars[current_src_offset]); current_src_offset = src_offsets[i]; @@ -83,16 +83,25 @@ public: vec_res[i] = 
countMatches(str, re, matches); } - return result_column; + return col_res; } - else if (const ColumnConst * col_const_str = checkAndGetColumnConstStringOrFixedString(column_haystack)) + else if (const ColumnFixedString * col_haystack_fixedstring = checkAndGetColumn(col_haystack)) { - std::string_view str = col_const_str->getDataColumn().getDataAt(0).toView(); - uint64_t matches_count = countMatches(str, re, matches); - return result_type->createColumnConst(input_rows_count, matches_count); + auto col_res = ColumnUInt64::create(); + + ColumnUInt64::Container & vec_res = col_res->getData(); + vec_res.resize(input_rows_count); + + for (size_t i = 0; i < input_rows_count; ++i) + { + std::string_view str = col_haystack_fixedstring->getDataAt(i).toView(); + vec_res[i] = countMatches(str, re, matches); + } + + return col_res; } else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Error in FunctionCountMatches::getReturnTypeImpl()"); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Could not cast haystack argument to String or FixedString"); } static uint64_t countMatches(std::string_view src, const OptimizedRegularExpression & re, OptimizedRegularExpression::MatchVec & matches) @@ -116,7 +125,7 @@ public: if (!matches[0].length) break; pos += matches[0].offset + matches[0].length; - match_count++; + ++match_count; } return match_count; diff --git a/src/Functions/divide/divide.cpp b/src/Functions/divide/divide.cpp index 6262d42a666..0708964c7d4 100644 --- a/src/Functions/divide/divide.cpp +++ b/src/Functions/divide/divide.cpp @@ -1,5 +1,5 @@ #include "divide.h" -#include +#include #if defined(__x86_64__) namespace SSE2 @@ -26,9 +26,9 @@ template void divideImpl(const A * __restrict a_pos, B b, ResultType * __restrict c_pos, size_t size) { #if defined(__x86_64__) - if (DB::Cpu::CpuFlagsCache::have_AVX2) + if (DB::CPU::CPUFlagsCache::have_AVX2) AVX2::divideImpl(a_pos, b, c_pos, size); - else if (DB::Cpu::CpuFlagsCache::have_SSE2) + else if (DB::CPU::CPUFlagsCache::have_SSE2) SSE2::divideImpl(a_pos, b, c_pos, size); #else Generic::divideImpl(a_pos, b, c_pos, size); diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index 7306dc4173e..70aced8842a 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -1413,4 +1413,9 @@ REGISTER_FUNCTION(If) factory.registerFunction({}, FunctionFactory::CaseInsensitive); } +FunctionOverloadResolverPtr createInternalFunctionIfOverloadResolver(bool allow_experimental_variant_type, bool use_variant_as_common_type) +{ + return std::make_unique(std::make_shared(allow_experimental_variant_type && use_variant_as_common_type)); +} + } diff --git a/src/Functions/if.h b/src/Functions/if.h new file mode 100644 index 00000000000..09a7a6a3e78 --- /dev/null +++ b/src/Functions/if.h @@ -0,0 +1,12 @@ +#pragma once +#include + +namespace DB +{ + +class IFunctionOverloadResolver; +using FunctionOverloadResolverPtr = std::shared_ptr; + +FunctionOverloadResolverPtr createInternalFunctionIfOverloadResolver(bool allow_experimental_variant_type, bool use_variant_as_common_type); + +} diff --git a/src/Functions/logical.h b/src/Functions/logical.h new file mode 100644 index 00000000000..d2d07f6cec7 --- /dev/null +++ b/src/Functions/logical.h @@ -0,0 +1,15 @@ +#pragma once +#include + +namespace DB +{ + +class IFunctionOverloadResolver; +using FunctionOverloadResolverPtr = std::shared_ptr; + +FunctionOverloadResolverPtr createInternalFunctionOrOverloadResolver(); +FunctionOverloadResolverPtr createInternalFunctionAndOverloadResolver(); +FunctionOverloadResolverPtr 
createInternalFunctionXorOverloadResolver(); +FunctionOverloadResolverPtr createInternalFunctionNotOverloadResolver(); + +} diff --git a/src/Functions/multiIf.cpp b/src/Functions/multiIf.cpp index cb946b55c73..af7afb75e1a 100644 --- a/src/Functions/multiIf.cpp +++ b/src/Functions/multiIf.cpp @@ -40,9 +40,17 @@ class FunctionMultiIf final : public FunctionIfBase { public: static constexpr auto name = "multiIf"; - static FunctionPtr create(ContextPtr context_) { return std::make_shared(context_); } + static FunctionPtr create(ContextPtr context_) + { + const auto & settings = context_->getSettingsRef(); + return std::make_shared(settings.allow_execute_multiif_columnar, settings.allow_experimental_variant_type, settings.use_variant_as_common_type); + } - explicit FunctionMultiIf(ContextPtr context_) : context(context_) { } + explicit FunctionMultiIf(bool allow_execute_multiif_columnar_, bool allow_experimental_variant_type_, bool use_variant_as_common_type_) + : allow_execute_multiif_columnar(allow_execute_multiif_columnar_) + , allow_experimental_variant_type(allow_experimental_variant_type_) + , use_variant_as_common_type(use_variant_as_common_type_) + {} String getName() const override { return name; } bool isVariadic() const override { return true; } @@ -118,7 +126,7 @@ public: types_of_branches.emplace_back(arg); }); - if (context->getSettingsRef().allow_experimental_variant_type && context->getSettingsRef().use_variant_as_common_type) + if (allow_experimental_variant_type && use_variant_as_common_type) return getLeastSupertypeOrVariant(types_of_branches); return getLeastSupertype(types_of_branches); @@ -240,10 +248,9 @@ public: } } - const auto & settings = context->getSettingsRef(); const WhichDataType which(removeNullable(result_type)); bool execute_multiif_columnar - = settings.allow_execute_multiif_columnar && !contains_short && (which.isInt() || which.isUInt() || which.isFloat()); + = allow_execute_multiif_columnar && !contains_short && (which.isInt() || which.isUInt() || which.isFloat()); size_t rows = input_rows_count; if (!execute_multiif_columnar) @@ -507,7 +514,9 @@ private: executeColumnIfNeeded(arguments[i], true); } - ContextPtr context; + const bool allow_execute_multiif_columnar; + const bool allow_experimental_variant_type; + const bool use_variant_as_common_type; }; } @@ -521,6 +530,11 @@ REGISTER_FUNCTION(MultiIf) factory.registerFunction("caseWithoutExpression"); } +FunctionOverloadResolverPtr createInternalMultiIfOverloadResolver(bool allow_execute_multiif_columnar, bool allow_experimental_variant_type, bool use_variant_as_common_type) +{ + return std::make_unique(std::make_shared(allow_execute_multiif_columnar, allow_experimental_variant_type, use_variant_as_common_type)); +} + } diff --git a/src/Functions/multiIf.h b/src/Functions/multiIf.h new file mode 100644 index 00000000000..617d63b89bc --- /dev/null +++ b/src/Functions/multiIf.h @@ -0,0 +1,12 @@ +#pragma once +#include + +namespace DB +{ + +class IFunctionOverloadResolver; +using FunctionOverloadResolverPtr = std::shared_ptr; + +FunctionOverloadResolverPtr createInternalMultiIfOverloadResolver(bool allow_execute_multiif_columnar, bool allow_experimental_variant_type, bool use_variant_as_common_type); + +} diff --git a/src/Functions/multiMatchAny.cpp b/src/Functions/multiMatchAny.cpp index 6e6abe61898..054a60fce2d 100644 --- a/src/Functions/multiMatchAny.cpp +++ b/src/Functions/multiMatchAny.cpp @@ -22,4 +22,9 @@ REGISTER_FUNCTION(MultiMatchAny) factory.registerFunction(); } +FunctionOverloadResolverPtr 
createInternalMultiMatchAnyOverloadResolver(bool allow_hyperscan, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length, bool reject_expensive_hyperscan_regexps) +{ + return std::make_unique(std::make_shared(allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length, reject_expensive_hyperscan_regexps)); +} + } diff --git a/src/Functions/multiMatchAny.h b/src/Functions/multiMatchAny.h new file mode 100644 index 00000000000..4548ec1d593 --- /dev/null +++ b/src/Functions/multiMatchAny.h @@ -0,0 +1,12 @@ +#pragma once +#include + +namespace DB +{ + +class IFunctionOverloadResolver; +using FunctionOverloadResolverPtr = std::shared_ptr; + +FunctionOverloadResolverPtr createInternalMultiMatchAnyOverloadResolver(bool allow_hyperscan, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length, bool reject_expensive_hyperscan_regexps); + +} diff --git a/src/Functions/serverConstants.cpp b/src/Functions/serverConstants.cpp index 9f1a3584df8..fd8fb22455b 100644 --- a/src/Functions/serverConstants.cpp +++ b/src/Functions/serverConstants.cpp @@ -51,12 +51,12 @@ namespace }; - class FunctionTcpPort : public FunctionConstantBase + class FunctionTCPPort : public FunctionConstantBase { public: static constexpr auto name = "tcpPort"; - static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } - explicit FunctionTcpPort(ContextPtr context) : FunctionConstantBase(context->getTCPPort(), context->isDistributed()) {} + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + explicit FunctionTCPPort(ContextPtr context) : FunctionConstantBase(context->getTCPPort(), context->isDistributed()) {} }; @@ -153,9 +153,9 @@ REGISTER_FUNCTION(ServerUUID) factory.registerFunction(); } -REGISTER_FUNCTION(TcpPort) +REGISTER_FUNCTION(TCPPort) { - factory.registerFunction(); + factory.registerFunction(); } REGISTER_FUNCTION(Timezone) diff --git a/src/Functions/vectorFunctions.cpp b/src/Functions/vectorFunctions.cpp index 33b0e9f6039..de4a6fb0a5c 100644 --- a/src/Functions/vectorFunctions.cpp +++ b/src/Functions/vectorFunctions.cpp @@ -1,9 +1,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -1364,11 +1364,11 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - if (getReturnTypeImpl(arguments)->isNullable()) - { - return DataTypeNullable(std::make_shared()) - .createColumnConstWithDefaultValue(input_rows_count); - } + /// TODO: cosineDistance does not support nullable arguments + /// https://github.com/ClickHouse/ClickHouse/pull/27933#issuecomment-916670286 + auto return_type = getReturnTypeImpl(arguments); + if (return_type->isNullable()) + return return_type->createColumnConstWithDefaultValue(input_rows_count); FunctionDotProduct dot(context); ColumnWithTypeAndName dot_result{dot.executeImpl(arguments, DataTypePtr(), input_rows_count), diff --git a/src/Functions/widthBucket.cpp b/src/Functions/widthBucket.cpp index e95f7c05756..62ed460ca9d 100644 --- a/src/Functions/widthBucket.cpp +++ b/src/Functions/widthBucket.cpp @@ -44,7 +44,7 @@ class FunctionWidthBucket : public IFunction { throw Exception( ErrorCodes::LOGICAL_ERROR, - "Logical error in function {}: argument {} has unexpected type or size!", + "Logical error in function {}: argument {} has unexpected type or size.", getName(), argument_index); } @@ -157,7 +157,7 @@ class FunctionWidthBucket : public IFunction if 
(are_all_const_cols) { throw Exception( - ErrorCodes::LOGICAL_ERROR, "Logical error in function {}: unexpected combination of argument types!", getName()); + ErrorCodes::LOGICAL_ERROR, "Logical error in function {}: unexpected combination of argument types.", getName()); } auto result_column = ColumnVector::create(); diff --git a/src/IO/Archives/IArchiveReader.h b/src/IO/Archives/IArchiveReader.h index 84a1dc21f5b..ee516d2655b 100644 --- a/src/IO/Archives/IArchiveReader.h +++ b/src/IO/Archives/IArchiveReader.h @@ -56,6 +56,7 @@ public: /// It's possible to convert a file enumerator to a read buffer and vice versa. virtual std::unique_ptr readFile(std::unique_ptr enumerator) = 0; virtual std::unique_ptr nextFile(std::unique_ptr read_buffer) = 0; + virtual std::unique_ptr currentFile(std::unique_ptr read_buffer) = 0; virtual std::vector getAllFiles() = 0; virtual std::vector getAllFiles(NameFilter filter) = 0; diff --git a/src/IO/Archives/LibArchiveReader.cpp b/src/IO/Archives/LibArchiveReader.cpp index 94e68045575..a9ce401138b 100644 --- a/src/IO/Archives/LibArchiveReader.cpp +++ b/src/IO/Archives/LibArchiveReader.cpp @@ -335,6 +335,15 @@ std::unique_ptr LibArchiveReader::nextFile(std return std::make_unique(std::move(handle)); } +std::unique_ptr LibArchiveReader::currentFile(std::unique_ptr read_buffer) +{ + if (!dynamic_cast(read_buffer.get())) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong ReadBuffer passed to nextFile()"); + auto read_buffer_from_libarchive = std::unique_ptr(static_cast(read_buffer.release())); + auto handle = std::move(*read_buffer_from_libarchive).releaseHandle(); + return std::make_unique(std::move(handle)); +} + std::vector LibArchiveReader::getAllFiles() { return getAllFiles({}); diff --git a/src/IO/Archives/LibArchiveReader.h b/src/IO/Archives/LibArchiveReader.h index 3dadd710089..c4b08d8ddf7 100644 --- a/src/IO/Archives/LibArchiveReader.h +++ b/src/IO/Archives/LibArchiveReader.h @@ -40,6 +40,7 @@ public: /// It's possible to convert a file enumerator to a read buffer and vice versa. std::unique_ptr readFile(std::unique_ptr enumerator) override; std::unique_ptr nextFile(std::unique_ptr read_buffer) override; + std::unique_ptr currentFile(std::unique_ptr read_buffer) override; std::vector getAllFiles() override; std::vector getAllFiles(NameFilter filter) override; diff --git a/src/IO/Archives/ZipArchiveReader.cpp b/src/IO/Archives/ZipArchiveReader.cpp index 8c9c37e4ae0..2a9b7a43519 100644 --- a/src/IO/Archives/ZipArchiveReader.cpp +++ b/src/IO/Archives/ZipArchiveReader.cpp @@ -583,6 +583,15 @@ std::unique_ptr ZipArchiveReader::nextFile(std return std::make_unique(std::move(handle)); } +std::unique_ptr ZipArchiveReader::currentFile(std::unique_ptr read_buffer) +{ + if (!dynamic_cast(read_buffer.get())) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong ReadBuffer passed to nextFile()"); + auto read_buffer_from_zip = std::unique_ptr(static_cast(read_buffer.release())); + auto handle = std::move(*read_buffer_from_zip).releaseHandle(); + return std::make_unique(std::move(handle)); +} + std::vector ZipArchiveReader::getAllFiles() { return getAllFiles({}); diff --git a/src/IO/Archives/ZipArchiveReader.h b/src/IO/Archives/ZipArchiveReader.h index a8788064fec..4b1910839eb 100644 --- a/src/IO/Archives/ZipArchiveReader.h +++ b/src/IO/Archives/ZipArchiveReader.h @@ -47,6 +47,7 @@ public: /// It's possible to convert a file enumerator to a read buffer and vice versa. 
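/// [Illustrative sketch, not part of the diff] The new currentFile() is the counterpart of
/// nextFile(): both turn a read buffer back into a file enumerator, but nextFile() advances to
/// the next archive entry while currentFile() stays on the entry the buffer was reading.
/// The template arguments (FileEnumerator / ReadBuffer / ReadBufferFromFileBase) are assumed
/// here, since the angle-bracket contents are elided in the listing above.
#include <unistd.h>
#include <IO/Archives/IArchiveReader.h>
#include <IO/Archives/createArchiveReader.h>
#include <IO/WriteBufferFromFileDescriptor.h>
#include <IO/copyData.h>

using namespace DB;

void dumpFirstEntryTwice(const String & path_to_archive)
{
    auto reader = createArchiveReader(path_to_archive);
    auto enumerator = reader->firstFile();
    if (!enumerator)
        return;

    auto in = reader->readFile(std::move(enumerator));
    WriteBufferFromFileDescriptor out(STDOUT_FILENO);
    copyData(*in, out);

    /// nextFile(std::move(in)) would position the enumerator on the second entry;
    /// currentFile(std::move(in)) re-creates it for the entry we just read, so the
    /// same file can be opened again for a second pass.
    enumerator = reader->currentFile(std::move(in));
    auto in_again = reader->readFile(std::move(enumerator));
    copyData(*in_again, out);
    out.finalize();
}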
std::unique_ptr readFile(std::unique_ptr enumerator) override; std::unique_ptr nextFile(std::unique_ptr read_buffer) override; + std::unique_ptr currentFile(std::unique_ptr read_buffer) override; std::vector getAllFiles() override; std::vector getAllFiles(NameFilter filter) override; diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h index 83814f42693..1433f8d18ba 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/IO/ConnectionTimeouts.cpp b/src/IO/ConnectionTimeouts.cpp index ecc0d64580b..f2db3169400 100644 --- a/src/IO/ConnectionTimeouts.cpp +++ b/src/IO/ConnectionTimeouts.cpp @@ -20,7 +20,7 @@ ConnectionTimeouts ConnectionTimeouts::getTCPTimeoutsWithoutFailover(const Setti .withConnectionTimeout(settings.connect_timeout) .withSendTimeout(settings.send_timeout) .withReceiveTimeout(settings.receive_timeout) - .withTcpKeepAliveTimeout(settings.tcp_keep_alive_timeout) + .withTCPKeepAliveTimeout(settings.tcp_keep_alive_timeout) .withHandshakeTimeout(settings.handshake_timeout_ms) .withHedgedConnectionTimeout(settings.hedged_connection_timeout_ms) .withReceiveDataTimeout(settings.receive_data_timeout_ms); @@ -40,8 +40,8 @@ ConnectionTimeouts ConnectionTimeouts::getHTTPTimeouts(const Settings & settings .withConnectionTimeout(settings.http_connection_timeout) .withSendTimeout(settings.http_send_timeout) .withReceiveTimeout(settings.http_receive_timeout) - .withHttpKeepAliveTimeout(http_keep_alive_timeout) - .withTcpKeepAliveTimeout(settings.tcp_keep_alive_timeout) + .withHTTPKeepAliveTimeout(http_keep_alive_timeout) + .withTCPKeepAliveTimeout(settings.tcp_keep_alive_timeout) .withHandshakeTimeout(settings.handshake_timeout_ms); } diff --git a/src/IO/ConnectionTimeouts.h b/src/IO/ConnectionTimeouts.h index 6967af08204..7fe97b5ec36 100644 --- a/src/IO/ConnectionTimeouts.h +++ b/src/IO/ConnectionTimeouts.h @@ -16,8 +16,8 @@ struct Settings; M(secure_connection_timeout, withSecureConnectionTimeout) \ M(send_timeout, withSendTimeout) \ M(receive_timeout, withReceiveTimeout) \ - M(tcp_keep_alive_timeout, withTcpKeepAliveTimeout) \ - M(http_keep_alive_timeout, withHttpKeepAliveTimeout) \ + M(tcp_keep_alive_timeout, withTCPKeepAliveTimeout) \ + M(http_keep_alive_timeout, withHTTPKeepAliveTimeout) \ M(hedged_connection_timeout, withHedgedConnectionTimeout) \ M(receive_data_timeout, withReceiveDataTimeout) \ M(handshake_timeout, withHandshakeTimeout) \ diff --git a/src/IO/ParallelReadBuffer.h b/src/IO/ParallelReadBuffer.h index e76b40f77b7..daac1190399 100644 --- a/src/IO/ParallelReadBuffer.h +++ b/src/IO/ParallelReadBuffer.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include namespace DB diff --git a/src/IO/S3/Client.cpp b/src/IO/S3/Client.cpp index 7f0ede72740..182e7ad18cd 100644 --- a/src/IO/S3/Client.cpp +++ b/src/IO/S3/Client.cpp @@ -27,7 +27,6 @@ #include -#include namespace ProfileEvents { @@ -48,7 +47,6 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int TOO_MANY_REDIRECTS; - extern const int BAD_ARGUMENTS; } namespace S3 @@ -106,19 +104,6 @@ void verifyClientConfiguration(const Aws::Client::ClientConfiguration & client_c assert_cast(*client_config.retryStrategy); } -void validateCredentials(const Aws::Auth::AWSCredentials& auth_credentials) -{ - if (auth_credentials.GetAWSAccessKeyId().empty()) - { - return; - } - /// 
Follow https://docs.aws.amazon.com/IAM/latest/APIReference/API_AccessKey.html - if (!std::all_of(auth_credentials.GetAWSAccessKeyId().begin(), auth_credentials.GetAWSAccessKeyId().end(), isWordCharASCII)) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Access key id has an invalid character"); - } -} - void addAdditionalAMZHeadersToCanonicalHeadersList( Aws::AmazonWebServiceRequest & request, const HTTPHeaderEntries & extra_headers @@ -144,7 +129,6 @@ std::unique_ptr Client::create( const ClientSettings & client_settings) { verifyClientConfiguration(client_configuration); - validateCredentials(credentials_provider->GetAWSCredentials()); return std::unique_ptr( new Client(max_redirects_, std::move(sse_kms_config_), credentials_provider, client_configuration, sign_payloads, client_settings)); } diff --git a/src/IO/S3/Credentials.cpp b/src/IO/S3/Credentials.cpp index e64f54b99ad..80366510b53 100644 --- a/src/IO/S3/Credentials.cpp +++ b/src/IO/S3/Credentials.cpp @@ -22,7 +22,6 @@ namespace ErrorCodes # include # include -# include # include # include @@ -31,9 +30,7 @@ namespace ErrorCodes # include # include -# include -# include # include # include # include @@ -755,7 +752,7 @@ S3CredentialsProviderChain::S3CredentialsProviderChain( configuration.put_request_throttler, Aws::Http::SchemeMapper::ToString(Aws::Http::Scheme::HTTP)); - /// See MakeDefaultHttpResourceClientConfiguration(). + /// See MakeDefaultHTTPResourceClientConfiguration(). /// This is part of EC2 metadata client, but unfortunately it can't be accessed from outside /// of contrib/aws/aws-cpp-sdk-core/source/internal/AWSHttpResourceClient.cpp aws_client_configuration.maxConnections = 2; diff --git a/src/IO/S3/PocoHTTPClient.cpp b/src/IO/S3/PocoHTTPClient.cpp index 21acdfd69f2..dbb93e63143 100644 --- a/src/IO/S3/PocoHTTPClient.cpp +++ b/src/IO/S3/PocoHTTPClient.cpp @@ -146,9 +146,9 @@ ConnectionTimeouts getTimeoutsFromConfiguration(const PocoHTTPClientConfiguratio .withConnectionTimeout(Poco::Timespan(client_configuration.connectTimeoutMs * 1000)) .withSendTimeout(Poco::Timespan(client_configuration.requestTimeoutMs * 1000)) .withReceiveTimeout(Poco::Timespan(client_configuration.requestTimeoutMs * 1000)) - .withTcpKeepAliveTimeout(Poco::Timespan( + .withTCPKeepAliveTimeout(Poco::Timespan( client_configuration.enableTcpKeepAlive ? 
client_configuration.tcpKeepAliveIntervalMs * 1000 : 0)) - .withHttpKeepAliveTimeout(Poco::Timespan( + .withHTTPKeepAliveTimeout(Poco::Timespan( client_configuration.http_keep_alive_timeout_ms * 1000)); /// flag indicating whether keep-alive is enabled is set to each session upon creation } diff --git a/src/IO/S3/copyS3File.h b/src/IO/S3/copyS3File.h index 607be51ed25..093d26ba7bb 100644 --- a/src/IO/S3/copyS3File.h +++ b/src/IO/S3/copyS3File.h @@ -5,7 +5,7 @@ #if USE_AWS_S3 #include -#include +#include #include #include #include diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 5039059f522..56e3e0df21b 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -1,7 +1,9 @@ #include #include +#include #include + #include "config.h" #if USE_AWS_S3 @@ -124,6 +126,15 @@ AuthSettings AuthSettings::loadFromConfig(const std::string & config_elem, const HTTPHeaderEntries headers = getHTTPHeaders(config_elem, config); ServerSideEncryptionKMSConfig sse_kms_config = getSSEKMSConfig(config_elem, config); + std::unordered_set users; + Poco::Util::AbstractConfiguration::Keys keys; + config.keys(config_elem, keys); + for (const auto & key : keys) + { + if (startsWith(key, "user")) + users.insert(config.getString(config_elem + "." + key)); + } + return AuthSettings { std::move(access_key_id), std::move(secret_access_key), std::move(session_token), @@ -134,10 +145,16 @@ AuthSettings AuthSettings::loadFromConfig(const std::string & config_elem, const use_environment_credentials, use_insecure_imds_request, expiration_window_seconds, - no_sign_request + no_sign_request, + std::move(users) }; } +bool AuthSettings::canBeUsedByUser(const String & user) const +{ + return users.empty() || users.contains(user); +} + bool AuthSettings::hasUpdates(const AuthSettings & other) const { AuthSettings copy = *this; @@ -173,6 +190,8 @@ void AuthSettings::updateFrom(const AuthSettings & from) if (from.no_sign_request.has_value()) no_sign_request = from.no_sign_request; + + users.insert(from.users.begin(), from.users.end()); } } diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 6ee8d96ed09..b3e01bd6132 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -6,6 +6,7 @@ #include #include +#include #include "config.h" @@ -92,9 +93,13 @@ struct AuthSettings std::optional expiration_window_seconds; std::optional no_sign_request; + std::unordered_set users; + bool hasUpdates(const AuthSettings & other) const; void updateFrom(const AuthSettings & from); + bool canBeUsedByUser(const String & user) const; + private: bool operator==(const AuthSettings & other) const = default; }; diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index 230f39b074e..5dc269990a1 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/IO/readFloatText.cpp b/src/IO/readFloatText.cpp index d1143f7c62c..17ccc1b25b7 100644 --- a/src/IO/readFloatText.cpp +++ b/src/IO/readFloatText.cpp @@ -67,4 +67,7 @@ template void readFloatText(Float64 &, ReadBuffer &); template bool tryReadFloatText(Float32 &, ReadBuffer &); template bool tryReadFloatText(Float64 &, ReadBuffer &); +template bool tryReadFloatTextNoExponent(Float32 &, ReadBuffer &); +template bool tryReadFloatTextNoExponent(Float64 &, ReadBuffer &); + } diff --git a/src/IO/readFloatText.h b/src/IO/readFloatText.h index 23e904f305a..51964636389 100644 --- a/src/IO/readFloatText.h +++ b/src/IO/readFloatText.h @@ -324,7 +324,7 @@ static inline void 
readUIntTextUpToNSignificantDigits(T & x, ReadBuffer & buf) } -template +template ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) { static_assert(std::is_same_v || std::is_same_v, "Argument for readFloatTextImpl must be float or double"); @@ -395,30 +395,33 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) after_point_exponent = (read_digits > significant_digits ? -significant_digits : static_cast(-read_digits)) - after_point_num_leading_zeros; } - if (checkChar('e', in) || checkChar('E', in)) + if constexpr (allow_exponent) { - if (in.eof()) + if (checkChar('e', in) || checkChar('E', in)) { - if constexpr (throw_exception) - throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: nothing after exponent"); - else - return false; - } + if (in.eof()) + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER, "Cannot read floating point value: nothing after exponent"); + else + return false; + } - bool exponent_negative = false; - if (*in.position() == '-') - { - exponent_negative = true; - ++in.position(); - } - else if (*in.position() == '+') - { - ++in.position(); - } + bool exponent_negative = false; + if (*in.position() == '-') + { + exponent_negative = true; + ++in.position(); + } + else if (*in.position() == '+') + { + ++in.position(); + } - readUIntTextUpToNSignificantDigits<4>(exponent, in); - if (exponent_negative) - exponent = -exponent; + readUIntTextUpToNSignificantDigits<4>(exponent, in); + if (exponent_negative) + exponent = -exponent; + } } if (after_point) @@ -604,4 +607,7 @@ template bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { retu template void readFloatText(T & x, ReadBuffer & in) { readFloatTextFast(x, in); } template bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); } +/// Don't read exponent part of the number. 
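/// [Illustrative sketch, not part of the diff] tryReadFloatTextNoExponent() reads the sign,
/// integer and fractional parts exactly like tryReadFloatText(), but stops in front of an
/// 'e'/'E' suffix instead of interpreting it as an exponent. The concrete input below is
/// made up for illustration.
#include <IO/ReadBufferFromString.h>
#include <IO/readFloatText.h>

using namespace DB;

bool exampleNoExponent()
{
    ReadBufferFromString in("1.25e3");
    Float64 x = 0;
    bool ok = tryReadFloatTextNoExponent(x, in);
    /// ok is true and x is 1.25; the buffer is left positioned at 'e'.
    /// tryReadFloatText() on the same input would consume "e3" and yield 1250.
    return ok;
}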
+template bool tryReadFloatTextNoExponent(T & x, ReadBuffer & in) { return readFloatTextFastImpl(x, in); } + } diff --git a/src/Interpreters/ClusterDiscovery.cpp b/src/Interpreters/ClusterDiscovery.cpp index 52b74597c4b..d432488964d 100644 --- a/src/Interpreters/ClusterDiscovery.cpp +++ b/src/Interpreters/ClusterDiscovery.cpp @@ -319,7 +319,7 @@ bool ClusterDiscovery::updateCluster(ClusterInfo & cluster_info) if (cluster_info.current_cluster_is_invisible) { - LOG_DEBUG(log, "cluster '{}' is invisible!", cluster_info.name); + LOG_DEBUG(log, "Cluster '{}' is invisible.", cluster_info.name); return true; } diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 0e5897e7306..55a4df10206 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -94,6 +94,7 @@ #include #include #include +#include #include #include #include @@ -907,7 +908,7 @@ Strings Context::getWarnings() const if (CurrentMetrics::get(CurrentMetrics::AttachedTable) > static_cast(shared->max_table_num_to_warn)) common_warnings.emplace_back(fmt::format("The number of attached tables is more than {}", shared->max_table_num_to_warn)); if (CurrentMetrics::get(CurrentMetrics::AttachedDatabase) > static_cast(shared->max_database_num_to_warn)) - common_warnings.emplace_back(fmt::format("The number of attached databases is more than {}", shared->max_table_num_to_warn)); + common_warnings.emplace_back(fmt::format("The number of attached databases is more than {}", shared->max_database_num_to_warn)); if (CurrentMetrics::get(CurrentMetrics::PartsActive) > static_cast(shared->max_part_num_to_warn)) common_warnings.emplace_back(fmt::format("The number of active parts is more than {}", shared->max_part_num_to_warn)); } @@ -1533,7 +1534,7 @@ void Context::addExternalTable(const String & table_name, TemporaryTableHolder & std::lock_guard lock(mutex); if (external_tables_mapping.end() != external_tables_mapping.find(table_name)) - throw Exception(ErrorCodes::TABLE_ALREADY_EXISTS, "Temporary table {} already exists.", backQuoteIfNeed(table_name)); + throw Exception(ErrorCodes::TABLE_ALREADY_EXISTS, "Temporary table {} already exists", backQuoteIfNeed(table_name)); external_tables_mapping.emplace(table_name, std::make_shared(std::move(temporary_table))); } @@ -1931,6 +1932,35 @@ StoragePtr Context::executeTableFunction(const ASTPtr & table_expression, const } +StoragePtr Context::buildParametrizedViewStorage(const ASTPtr & table_expression, const String & database_name, const String & table_name) +{ + if (table_name.empty()) + return nullptr; + + StoragePtr original_view = DatabaseCatalog::instance().tryGetTable({database_name, table_name}, getQueryContext()); + if (!original_view || !original_view->isView()) + return nullptr; + auto * storage_view = original_view->as(); + if (!storage_view || !storage_view->isParameterizedView()) + return nullptr; + + auto query = original_view->getInMemoryMetadataPtr()->getSelectQuery().inner_query->clone(); + NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression); + StorageView::replaceQueryParametersIfParametrizedView(query, parameterized_view_values); + + ASTCreateQuery create; + create.select = query->as(); + auto sample_block = InterpreterSelectQueryAnalyzer::getSampleBlock(query, shared_from_this()); + auto res = std::make_shared(StorageID(database_name, table_name), + create, + ColumnsDescription(sample_block.getNamesAndTypesList()), + /* comment */ "", + /* is_parameterized_view */ true); + res->startup(); + return res; +} + + 
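/// [Illustrative sketch, not part of the diff] How a caller might use the new
/// Context::buildParametrizedViewStorage(). The table expression AST (e.g. parsed from
/// "db.param_view(price = 100)") is assumed to come from query analysis; the wrapper
/// function name below is hypothetical.
#include <Interpreters/Context.h>
#include <Parsers/IAST_fwd.h>
#include <Storages/IStorage_fwd.h>

using namespace DB;

StoragePtr tryResolveParameterizedView(
    ContextMutablePtr context, const ASTPtr & table_expression,
    const String & database_name, const String & view_name)
{
    /// Returns nullptr unless database_name.view_name is a parameterized view; otherwise the
    /// view's inner SELECT is cloned, the parameter values from table_expression are
    /// substituted, and a started-up StorageView wrapper with the inferred header is returned.
    return context->buildParametrizedViewStorage(table_expression, database_name, view_name);
}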
void Context::addViewSource(const StoragePtr & storage) { if (view_source) @@ -4154,12 +4184,12 @@ void Context::setMaxTableSizeToDrop(size_t max_size) size_t Context::getMaxTableSizeToDrop() const { - return shared->max_table_size_to_drop.load(std::memory_order_relaxed); + return shared->max_table_size_to_drop.load(); } void Context::checkTableCanBeDropped(const String & database, const String & table, const size_t & table_size) const { - size_t max_table_size_to_drop = shared->max_table_size_to_drop.load(std::memory_order_relaxed); + size_t max_table_size_to_drop = shared->max_table_size_to_drop.load(); checkCanBeDropped(database, table, table_size, max_table_size_to_drop); } @@ -4177,12 +4207,12 @@ void Context::setMaxPartitionSizeToDrop(size_t max_size) size_t Context::getMaxPartitionSizeToDrop() const { - return shared->max_partition_size_to_drop.load(std::memory_order_relaxed); + return shared->max_partition_size_to_drop.load(); } void Context::checkPartitionCanBeDropped(const String & database, const String & table, const size_t & partition_size) const { - size_t max_partition_size_to_drop = shared->max_partition_size_to_drop.load(std::memory_order_relaxed); + size_t max_partition_size_to_drop = shared->max_partition_size_to_drop.load(); checkCanBeDropped(database, table, partition_size, max_partition_size_to_drop); } @@ -4483,7 +4513,7 @@ void Context::setClientConnectionId(uint32_t connection_id_) client_info.connection_id = connection_id_; } -void Context::setHttpClientInfo(ClientInfo::HTTPMethod http_method, const String & http_user_agent, const String & http_referer) +void Context::setHTTPClientInfo(ClientInfo::HTTPMethod http_method, const String & http_user_agent, const String & http_referer) { client_info.http_method = http_method; client_info.http_user_agent = http_user_agent; diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 8d40ccb301b..a7908d45a9b 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -630,7 +630,7 @@ public: void setClientInterface(ClientInfo::Interface interface); void setClientVersion(UInt64 client_version_major, UInt64 client_version_minor, UInt64 client_version_patch, unsigned client_tcp_protocol_version); void setClientConnectionId(uint32_t connection_id); - void setHttpClientInfo(ClientInfo::HTTPMethod http_method, const String & http_user_agent, const String & http_referer); + void setHTTPClientInfo(ClientInfo::HTTPMethod http_method, const String & http_user_agent, const String & http_referer); void setForwardedFor(const String & forwarded_for); void setQueryKind(ClientInfo::QueryKind query_kind); void setQueryKindInitial(); @@ -718,6 +718,8 @@ public: /// Overload for the new analyzer. Structure inference is performed in QueryAnalysisPass. 
StoragePtr executeTableFunction(const ASTPtr & table_expression, const TableFunctionPtr & table_function_ptr); + StoragePtr buildParametrizedViewStorage(const ASTPtr & table_expression, const String & database_name, const String & table_name); + void addViewSource(const StoragePtr & storage); StoragePtr getViewSource() const; diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 90eec421abf..543d8b16791 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -148,9 +148,8 @@ void DDLLogEntry::parse(const String & data) String settings_str; rb >> "settings: " >> settings_str >> "\n"; ParserSetQuery parser{true}; - constexpr UInt64 max_size = 4096; constexpr UInt64 max_depth = 16; - ASTPtr settings_ast = parseQuery(parser, settings_str, max_size, max_depth); + ASTPtr settings_ast = parseQuery(parser, settings_str, Context::getGlobalContextInstance()->getSettingsRef().max_query_size, max_depth); settings.emplace(std::move(settings_ast->as()->changes)); } } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index c491ee30321..08dda0fe811 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -50,6 +50,7 @@ #include #include #include +#include #include @@ -82,7 +83,8 @@ #include #include - +#include +#include namespace DB { @@ -98,7 +100,6 @@ namespace ErrorCodes extern const int DATABASE_ALREADY_EXISTS; extern const int BAD_ARGUMENTS; extern const int BAD_DATABASE_FOR_TEMPORARY_TABLE; - extern const int SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY; extern const int ILLEGAL_SYNTAX_FOR_DATA_TYPE; extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_INDEX; @@ -692,6 +693,7 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( if (!attach && !is_restore_from_backup && context_->getSettingsRef().flatten_nested) res.flattenNested(); + if (res.getAllPhysical().empty()) throw Exception(ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED, "Cannot CREATE table without physical columns"); @@ -796,6 +798,9 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti } else if (create.select) { + if (create.isParameterizedView()) + return properties; + Block as_select_sample; if (getContext()->getSettingsRef().allow_experimental_analyzer) @@ -820,11 +825,7 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti * for example: LIMIT, OFFSET, functions parameters, functions constant only arguments. */ - SelectQueryOptions options; - if (create.isParameterizedView()) - options = options.createParameterizedView(); - - InterpreterSelectWithUnionQuery interpreter(create.select->clone(), getContext(), options); + InterpreterSelectWithUnionQuery interpreter(create.select->clone(), getContext(), SelectQueryOptions()); as_select_sample = interpreter.getSampleBlock(); } @@ -910,66 +911,13 @@ void InterpreterCreateQuery::validateTableStructure(const ASTCreateQuery & creat const auto & settings = getContext()->getSettingsRef(); - /// Check low cardinality types in creating table if it was not allowed in setting - if (!create.attach && !settings.allow_suspicious_low_cardinality_types && !create.is_materialized_view) + /// If it's not attach and not materialized view to existing table, + /// we need to validate data types (check for experimental or suspicious types). 
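/// [Illustrative sketch, not part of the diff] The per-type checks that used to live inline in
/// validateTableStructure() are now delegated to validateDataType(), which also recurses into
/// nested types via IDataType::forEachChild(), so e.g. Array(LowCardinality(UInt64)) is rejected
/// just like a top-level LowCardinality(UInt64). Standalone usage might look like this:
#include <Core/Settings.h>
#include <DataTypes/DataTypeFactory.h>
#include <Interpreters/parseColumnsListForTableFunction.h>

using namespace DB;

void checkColumnType(const String & type_name, const Settings & settings)
{
    DataTypePtr type = DataTypeFactory::instance().get(type_name);
    DataTypeValidationSettings validation_settings(settings);
    /// Throws SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY or ILLEGAL_COLUMN for disallowed
    /// (possibly nested) types, matching what CREATE TABLE now does.
    validateDataType(type, validation_settings);
}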
+ if (!create.attach && !create.is_materialized_view) { + DataTypeValidationSettings validation_settings(settings); for (const auto & name_and_type_pair : properties.columns.getAllPhysical()) - { - if (const auto * current_type_ptr = typeid_cast(name_and_type_pair.type.get())) - { - if (!isStringOrFixedString(*removeNullable(current_type_ptr->getDictionaryType()))) - throw Exception(ErrorCodes::SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY, - "Creating columns of type {} is prohibited by default " - "due to expected negative impact on performance. " - "It can be enabled with the \"allow_suspicious_low_cardinality_types\" setting.", - current_type_ptr->getName()); - } - } - } - - if (!create.attach && !settings.allow_experimental_object_type) - { - for (const auto & [name, type] : properties.columns.getAllPhysical()) - { - if (type->hasDynamicSubcolumns()) - { - throw Exception(ErrorCodes::ILLEGAL_COLUMN, - "Cannot create table with column '{}' which type is '{}' " - "because experimental Object type is not allowed. " - "Set setting allow_experimental_object_type = 1 in order to allow it", - name, type->getName()); - } - } - } - if (!create.attach && !settings.allow_suspicious_fixed_string_types) - { - for (const auto & [name, type] : properties.columns.getAllPhysical()) - { - auto basic_type = removeLowCardinalityAndNullable(type); - if (const auto * fixed_string = typeid_cast(basic_type.get())) - { - if (fixed_string->getN() > MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, - "Cannot create table with column '{}' which type is '{}' " - "because fixed string with size > {} is suspicious. " - "Set setting allow_suspicious_fixed_string_types = 1 in order to allow it", - name, type->getName(), MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS); - } - } - } - if (!create.attach && !settings.allow_experimental_variant_type) - { - for (const auto & [name, type] : properties.columns.getAllPhysical()) - { - if (isVariant(type)) - { - throw Exception(ErrorCodes::ILLEGAL_COLUMN, - "Cannot create table with column '{}' which type is '{}' " - "because experimental Variant type is not allowed. 
" - "Set setting allow_experimental_variant_type = 1 in order to allow it", - name, type->getName()); - } - } + validateDataType(name_and_type_pair.type, validation_settings); } } diff --git a/src/Interpreters/InterpreterShowFunctionsQuery.cpp b/src/Interpreters/InterpreterShowFunctionsQuery.cpp index e83f61eac53..829670d7929 100644 --- a/src/Interpreters/InterpreterShowFunctionsQuery.cpp +++ b/src/Interpreters/InterpreterShowFunctionsQuery.cpp @@ -25,13 +25,13 @@ String InterpreterShowFunctionsQuery::getRewrittenQuery() const auto & query = query_ptr->as(); - DatabasePtr systemDb = DatabaseCatalog::instance().getSystemDatabase(); + DatabasePtr system_db = DatabaseCatalog::instance().getSystemDatabase(); String rewritten_query = fmt::format( R"( SELECT * FROM {}.{})", - systemDb->getDatabaseName(), + system_db->getDatabaseName(), functions_table); if (!query.like.empty()) diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 9a80553f149..19449cd9e28 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -1083,7 +1083,7 @@ void InterpreterSystemQuery::syncReplica(ASTSystemQuery & query) auto sync_timeout = getContext()->getSettingsRef().receive_timeout.totalMilliseconds(); if (!storage_replicated->waitForProcessingQueue(sync_timeout, query.sync_replica_mode, query.src_replicas)) { - LOG_ERROR(log, "SYNC REPLICA {}: Timed out!", table_id.getNameForLogs()); + LOG_ERROR(log, "SYNC REPLICA {}: Timed out.", table_id.getNameForLogs()); throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "SYNC REPLICA {}: command timed out. " \ "See the 'receive_timeout' setting", table_id.getNameForLogs()); } diff --git a/src/Interpreters/RewriteUniqToCountVisitor.cpp b/src/Interpreters/RewriteUniqToCountVisitor.cpp index ddec6fe063e..a2e3a790c27 100644 --- a/src/Interpreters/RewriteUniqToCountVisitor.cpp +++ b/src/Interpreters/RewriteUniqToCountVisitor.cpp @@ -156,7 +156,11 @@ void RewriteUniqToCountMatcher::visit(ASTPtr & ast, Data & /*data*/) }; if (match_subquery_with_distinct() || match_subquery_with_group_by()) + { + auto main_alias = expr_list->children[0]->tryGetAlias(); expr_list->children[0] = makeASTFunction("count"); + expr_list->children[0]->setAlias(main_alias); + } } } diff --git a/src/Interpreters/S3QueueLog.cpp b/src/Interpreters/S3QueueLog.cpp index 967becb6e0f..3ed58de0f87 100644 --- a/src/Interpreters/S3QueueLog.cpp +++ b/src/Interpreters/S3QueueLog.cpp @@ -28,7 +28,9 @@ ColumnsDescription S3QueueLogElement::getColumnsDescription() {"hostname", std::make_shared(std::make_shared())}, {"event_date", std::make_shared()}, {"event_time", std::make_shared()}, - {"table_uuid", std::make_shared()}, + {"database", std::make_shared()}, + {"table", std::make_shared()}, + {"uuid", std::make_shared()}, {"file_name", std::make_shared()}, {"rows_processed", std::make_shared()}, {"status", status_datatype}, @@ -45,7 +47,9 @@ void S3QueueLogElement::appendToBlock(MutableColumns & columns) const columns[i++]->insert(getFQDNOrHostName()); columns[i++]->insert(DateLUT::instance().toDayNum(event_time).toUnderType()); columns[i++]->insert(event_time); - columns[i++]->insert(table_uuid); + columns[i++]->insert(database); + columns[i++]->insert(table); + columns[i++]->insert(uuid); columns[i++]->insert(file_name); columns[i++]->insert(rows_processed); columns[i++]->insert(status); diff --git a/src/Interpreters/S3QueueLog.h b/src/Interpreters/S3QueueLog.h index e0362bf9716..b6bc138d42c 100644 --- 
a/src/Interpreters/S3QueueLog.h +++ b/src/Interpreters/S3QueueLog.h @@ -12,7 +12,11 @@ namespace DB struct S3QueueLogElement { time_t event_time{}; - std::string table_uuid; + + std::string database; + std::string table; + std::string uuid; + std::string file_name; size_t rows_processed = 0; diff --git a/src/Interpreters/Session.cpp b/src/Interpreters/Session.cpp index df97a09f686..b52f8a507e3 100644 --- a/src/Interpreters/Session.cpp +++ b/src/Interpreters/Session.cpp @@ -429,11 +429,11 @@ void Session::setClientConnectionId(uint32_t connection_id) prepared_client_info->connection_id = connection_id; } -void Session::setHttpClientInfo(ClientInfo::HTTPMethod http_method, const String & http_user_agent, const String & http_referer) +void Session::setHTTPClientInfo(ClientInfo::HTTPMethod http_method, const String & http_user_agent, const String & http_referer) { if (session_context) { - session_context->setHttpClientInfo(http_method, http_user_agent, http_referer); + session_context->setHTTPClientInfo(http_method, http_user_agent, http_referer); } else { diff --git a/src/Interpreters/Session.h b/src/Interpreters/Session.h index cde000d89fa..334560a33c8 100644 --- a/src/Interpreters/Session.h +++ b/src/Interpreters/Session.h @@ -65,7 +65,7 @@ public: void setClientInterface(ClientInfo::Interface interface); void setClientVersion(UInt64 client_version_major, UInt64 client_version_minor, UInt64 client_version_patch, unsigned client_tcp_protocol_version); void setClientConnectionId(uint32_t connection_id); - void setHttpClientInfo(ClientInfo::HTTPMethod http_method, const String & http_user_agent, const String & http_referer); + void setHTTPClientInfo(ClientInfo::HTTPMethod http_method, const String & http_user_agent, const String & http_referer); void setForwardedFor(const String & forwarded_for); void setQuotaClientKey(const String & quota_key); void setConnectionClientVersion(UInt64 client_version_major, UInt64 client_version_minor, UInt64 client_version_patch, unsigned client_tcp_protocol_version); diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index ecd021328e7..d01bdd16c53 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -73,7 +73,6 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int UNKNOWN_IDENTIFIER; - extern const int UNEXPECTED_EXPRESSION; } namespace @@ -262,8 +261,7 @@ struct ExistsExpressionData select_with_union_query->list_of_selects->children.push_back(std::move(select_query)); select_with_union_query->children.push_back(select_with_union_query->list_of_selects); - auto new_subquery = std::make_shared(); - new_subquery->children.push_back(select_with_union_query); + auto new_subquery = std::make_shared(std::move(select_with_union_query)); auto function = makeASTFunction("in", std::make_shared(1u), new_subquery); func = *function; @@ -787,16 +785,6 @@ void expandOrderByAll(ASTSelectQuery * select_query) for (const auto & expr : select_query->select()->children) { - if (auto * identifier = expr->as(); identifier != nullptr) - if (Poco::toUpper(identifier->name()) == "ALL" || Poco::toUpper(identifier->alias) == "ALL") - throw Exception(ErrorCodes::UNEXPECTED_EXPRESSION, - "Cannot use ORDER BY ALL to sort a column with name 'all', please disable setting `enable_order_by_all` and try again"); - - if (auto * function = expr->as(); function != nullptr) - if (Poco::toUpper(function->alias) == "ALL") - throw 
Exception(ErrorCodes::UNEXPECTED_EXPRESSION, - "Cannot use ORDER BY ALL to sort a column with name 'all', please disable setting `enable_order_by_all` and try again"); - auto elem = std::make_shared(); elem->direction = all_elem->direction; elem->nulls_direction = all_elem->nulls_direction; @@ -1323,8 +1311,8 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect( if (select_query->group_by_all) expandGroupByAll(select_query); - // expand ORDER BY ALL - if (settings.enable_order_by_all && select_query->order_by_all) + // expand ORDER BY * + if (select_query->order_by_all) expandOrderByAll(select_query); /// Remove unneeded columns according to 'required_result_columns'. diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index c3b8405659a..346180c3613 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -493,10 +493,12 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID { /// Promote data type to avoid overflows. Note that overflows in the largest data type are still possible. /// But don't promote Float32, since we want to keep the exact same value + /// Also don't promote domain types (like bool) because we would otherwise use the serializer of the promoted type (e.g. UInt64 for + /// bool, which does not allow 'true' and 'false' as input values) const IDataType * type_to_parse = &type; DataTypePtr holder; - if (type.canBePromoted() && !which_type.isFloat32()) + if (type.canBePromoted() && !which_type.isFloat32() && !type.getCustomSerialization()) { holder = type.promoteNumericType(); type_to_parse = holder.get(); diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index 8b36790a269..f2aa51bd6de 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -105,6 +106,10 @@ namespace ErrorCodes extern const int SUPPORT_IS_DISABLED; } +namespace FailPoints +{ + extern const char execute_query_calling_empty_set_result_func_on_exception[]; +} static void checkASTSizeLimits(const IAST & ast, const Settings & settings) { @@ -716,6 +721,27 @@ static std::tuple executeQueryImpl( ParserQuery parser(end, settings.allow_settings_after_format_in_insert); /// TODO: parser should fail early when max_query_size limit is reached. ast = parseQuery(parser, begin, end, "", max_query_size, settings.max_parser_depth); + +#if 0 + /// Verify that AST formatting is consistent: + /// If you format AST, parse it back, and format it again, you get the same string. 
+ + String formatted1 = ast->formatWithPossiblyHidingSensitiveData(0, true, true); + + ASTPtr ast2 = parseQuery(parser, + formatted1.data(), + formatted1.data() + formatted1.size(), + "", max_query_size, settings.max_parser_depth); + + chassert(ast2); + + String formatted2 = ast2->formatWithPossiblyHidingSensitiveData(0, true, true); + + if (formatted1 != formatted2) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Inconsistent AST formatting: the query:\n{}\nWas parsed and formatted back as:\n{}", + formatted1, formatted2); +#endif } const char * query_end = end; @@ -927,8 +953,6 @@ static std::tuple executeQueryImpl( reason = "asynchronous insert queue is not configured"; else if (insert_query->select) reason = "insert query has select"; - else if (settings.deduplicate_blocks_in_dependent_materialized_views) - reason = "dependent materialized views block deduplication is enabled"; else if (insert_query->hasInlinedData()) async_insert = true; @@ -1361,7 +1385,7 @@ void executeQuery( BlockIO streams; OutputFormatPtr output_format; - auto update_format_for_exception_if_needed = [&]() + auto update_format_on_exception_if_needed = [&]() { if (!output_format) { @@ -1374,10 +1398,19 @@ void executeQuery( /// Force an update of the headers before we start writing result_details.content_type = output_format->getContentType(); result_details.format = format_name; + + fiu_do_on(FailPoints::execute_query_calling_empty_set_result_func_on_exception, { + // it will throw std::bad_function_call + set_result_details = nullptr; + set_result_details(result_details); + }); + if (set_result_details) { - set_result_details(result_details); + /// reset set_result_details func to avoid calling in SCOPE_EXIT() + auto set_result_details_copy = set_result_details; set_result_details = nullptr; + set_result_details_copy(result_details); } } } @@ -1397,7 +1430,7 @@ void executeQuery( { if (handle_exception_in_output_format) { - update_format_for_exception_if_needed(); + update_format_on_exception_if_needed(); if (output_format) handle_exception_in_output_format(*output_format); } @@ -1498,13 +1531,17 @@ void executeQuery( } catch (...) 
{ + /// first execute on exception callback, it includes updating query_log + /// otherwise closing record ('ExceptionWhileProcessing') can be not appended in query_log + /// due to possible exceptions in functions called below (passed as parameter here) + streams.onException(); + if (handle_exception_in_output_format) { - update_format_for_exception_if_needed(); + update_format_on_exception_if_needed(); if (output_format) handle_exception_in_output_format(*output_format); } - streams.onException(); throw; } diff --git a/src/Interpreters/parseColumnsListForTableFunction.cpp b/src/Interpreters/parseColumnsListForTableFunction.cpp index 551a883d093..1499568cec9 100644 --- a/src/Interpreters/parseColumnsListForTableFunction.cpp +++ b/src/Interpreters/parseColumnsListForTableFunction.cpp @@ -8,7 +8,6 @@ #include #include - namespace DB { @@ -20,57 +19,64 @@ namespace ErrorCodes } -void validateDataType(const DataTypePtr & type, const DataTypeValidationSettings & settings) +void validateDataType(const DataTypePtr & type_to_check, const DataTypeValidationSettings & settings) { - if (!settings.allow_suspicious_low_cardinality_types) + auto validate_callback = [&](const IDataType & data_type) { - if (const auto * lc_type = typeid_cast(type.get())) + if (!settings.allow_suspicious_low_cardinality_types) { - if (!isStringOrFixedString(*removeNullable(lc_type->getDictionaryType()))) - throw Exception( - ErrorCodes::SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY, - "Creating columns of type {} is prohibited by default due to expected negative impact on performance. " - "It can be enabled with the \"allow_suspicious_low_cardinality_types\" setting.", - lc_type->getName()); + if (const auto * lc_type = typeid_cast(&data_type)) + { + if (!isStringOrFixedString(*removeNullable(lc_type->getDictionaryType()))) + throw Exception( + ErrorCodes::SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY, + "Creating columns of type {} is prohibited by default due to expected negative impact on performance. " + "It can be enabled with the \"allow_suspicious_low_cardinality_types\" setting.", + lc_type->getName()); + } } - } - if (!settings.allow_experimental_object_type) - { - if (type->hasDynamicSubcolumns()) + if (!settings.allow_experimental_object_type) { - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "Cannot create column with type '{}' because experimental Object type is not allowed. " - "Set setting allow_experimental_object_type = 1 in order to allow it", type->getName()); - } - } - - if (!settings.allow_suspicious_fixed_string_types) - { - auto basic_type = removeLowCardinalityAndNullable(type); - if (const auto * fixed_string = typeid_cast(basic_type.get())) - { - if (fixed_string->getN() > MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS) + if (data_type.hasDynamicSubcolumns()) + { throw Exception( ErrorCodes::ILLEGAL_COLUMN, - "Cannot create column with type '{}' because fixed string with size > {} is suspicious. " - "Set setting allow_suspicious_fixed_string_types = 1 in order to allow it", - type->getName(), - MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS); + "Cannot create column with type '{}' because experimental Object type is not allowed. " + "Set setting allow_experimental_object_type = 1 in order to allow it", + data_type.getName()); + } } - } - if (!settings.allow_experimental_variant_type) - { - if (isVariant(type)) + if (!settings.allow_suspicious_fixed_string_types) { - throw Exception( - ErrorCodes::ILLEGAL_COLUMN, - "Cannot create column with type '{}' because experimental Variant type is not allowed. 
" - "Set setting allow_experimental_variant_type = 1 in order to allow it", type->getName()); + if (const auto * fixed_string = typeid_cast(&data_type)) + { + if (fixed_string->getN() > MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS) + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Cannot create column with type '{}' because fixed string with size > {} is suspicious. " + "Set setting allow_suspicious_fixed_string_types = 1 in order to allow it", + data_type.getName(), + MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS); + } } - } + + if (!settings.allow_experimental_variant_type) + { + if (isVariant(data_type)) + { + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Cannot create column with type '{}' because experimental Variant type is not allowed. " + "Set setting allow_experimental_variant_type = 1 in order to allow it", + data_type.getName()); + } + } + }; + + validate_callback(*type_to_check); + type_to_check->forEachChild(validate_callback); } ColumnsDescription parseColumnsListFromString(const std::string & structure, const ContextPtr & context) diff --git a/src/Parsers/ASTCreateQuery.h b/src/Parsers/ASTCreateQuery.h index 49a0140625c..b1209e72b61 100644 --- a/src/Parsers/ASTCreateQuery.h +++ b/src/Parsers/ASTCreateQuery.h @@ -96,6 +96,7 @@ public: bool is_populate{false}; bool is_create_empty{false}; /// CREATE TABLE ... EMPTY AS SELECT ... bool replace_view{false}; /// CREATE OR REPLACE VIEW + bool has_uuid{false}; // CREATE TABLE x UUID '...' ASTColumns * columns_list = nullptr; diff --git a/src/Parsers/ASTFunction.cpp b/src/Parsers/ASTFunction.cpp index e7f7b48091a..ae9b8ddbe85 100644 --- a/src/Parsers/ASTFunction.cpp +++ b/src/Parsers/ASTFunction.cpp @@ -835,34 +835,37 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format const auto * literal = arguments->children[0]->as(); const auto * function = arguments->children[0]->as(); - bool negate = name == "negate"; bool is_tuple = literal && literal->value.getType() == Field::Types::Tuple; // do not add parentheses for tuple literal, otherwise extra parens will be added `-((3, 7, 3), 1)` -> `-(((3, 7, 3), 1))` bool literal_need_parens = literal && !is_tuple; + // negate always requires parentheses, otherwise -(-1) will be printed as --1 - bool negate_need_parens = negate && (literal_need_parens || (function && function->name == "negate")); - // We don't need parentheses around a single literal. - bool need_parens = !literal && frame.need_parens && !negate_need_parens; + bool inside_parens = name == "negate" && (literal_need_parens || (function && function->name == "negate")); + + /// We DO need parentheses around a single literal + /// For example, SELECT (NOT 0) + (NOT 0) cannot be transformed into SELECT NOT 0 + NOT 0, since + /// this is equal to SELECT NOT (0 + NOT 0) + bool outside_parens = frame.need_parens && !inside_parens; // do not add extra parentheses for functions inside negate, i.e. -(-toUInt64(-(1))) - if (negate_need_parens) + if (inside_parens) nested_need_parens.need_parens = false; - if (need_parens) + if (outside_parens) settings.ostr << '('; settings.ostr << (settings.hilite ? hilite_operator : "") << func[1] << (settings.hilite ? 
hilite_none : ""); - if (negate_need_parens) + if (inside_parens) settings.ostr << '('; arguments->formatImpl(settings, state, nested_need_parens); written = true; - if (negate_need_parens) + if (inside_parens) settings.ostr << ')'; - if (need_parens) + if (outside_parens) settings.ostr << ')'; break; @@ -1034,7 +1037,15 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format } } - if (!written && name == "lambda"sv) + const auto & first_argument = arguments->children[0]; + const ASTIdentifier * first_argument_identifier = first_argument->as(); + const ASTFunction * first_argument_function = first_argument->as(); + bool first_argument_is_tuple = first_argument_function && first_argument_function->name == "tuple"; + + /// Only these types of arguments are accepted by the parser of the '->' operator. + bool acceptable_first_argument_for_lambda_expression = first_argument_identifier || first_argument_is_tuple; + + if (!written && name == "lambda"sv && acceptable_first_argument_for_lambda_expression) { /// Special case: zero elements tuple in lhs of lambda is printed as (). /// Special case: one-element tuple in lhs of lambda is printed as its element. @@ -1042,19 +1053,17 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format if (frame.need_parens) settings.ostr << '('; - const auto * first_arg_func = arguments->children[0]->as(); - if (first_arg_func - && first_arg_func->name == "tuple" - && first_arg_func->arguments - && (first_arg_func->arguments->children.size() == 1 || first_arg_func->arguments->children.empty())) + if (first_argument_is_tuple + && first_argument_function->arguments + && (first_argument_function->arguments->children.size() == 1 || first_argument_function->arguments->children.empty())) { - if (first_arg_func->arguments->children.size() == 1) - first_arg_func->arguments->children[0]->formatImpl(settings, state, nested_need_parens); + if (first_argument_function->arguments->children.size() == 1) + first_argument_function->arguments->children[0]->formatImpl(settings, state, nested_need_parens); else settings.ostr << "()"; } else - arguments->children[0]->formatImpl(settings, state, nested_need_parens); + first_argument->formatImpl(settings, state, nested_need_parens); settings.ostr << (settings.hilite ? hilite_operator : "") << " -> " << (settings.hilite ? hilite_none : ""); arguments->children[1]->formatImpl(settings, state, nested_need_parens); diff --git a/src/Parsers/ASTSelectQuery.cpp b/src/Parsers/ASTSelectQuery.cpp index 2115de1c124..d38e0933981 100644 --- a/src/Parsers/ASTSelectQuery.cpp +++ b/src/Parsers/ASTSelectQuery.cpp @@ -165,7 +165,7 @@ void ASTSelectQuery::formatImpl(const FormatSettings & s, FormatState & state, F if (order_by_all) { - s.ostr << (s.hilite ? hilite_keyword : "") << s.nl_or_ws << indent_str << "ORDER BY ALL" << (s.hilite ? hilite_none : ""); + s.ostr << (s.hilite ? hilite_keyword : "") << s.nl_or_ws << indent_str << "ORDER BY *" << (s.hilite ? hilite_none : ""); auto * elem = orderBy()->children[0]->as(); s.ostr << (s.hilite ? 
hilite_keyword : "") diff --git a/src/Parsers/ASTSelectWithUnionQuery.cpp b/src/Parsers/ASTSelectWithUnionQuery.cpp index 48b4ae3c38d..c377e4bd66b 100644 --- a/src/Parsers/ASTSelectWithUnionQuery.cpp +++ b/src/Parsers/ASTSelectWithUnionQuery.cpp @@ -71,8 +71,7 @@ void ASTSelectWithUnionQuery::formatQueryImpl(const FormatSettings & settings, F } else { - auto sub_query = std::make_shared(); - sub_query->children.push_back(*it); + auto sub_query = std::make_shared(*it); sub_query->formatImpl(settings, state, frame); } } diff --git a/src/Parsers/ASTSubquery.h b/src/Parsers/ASTSubquery.h index ef277a63126..e92a88b04dd 100644 --- a/src/Parsers/ASTSubquery.h +++ b/src/Parsers/ASTSubquery.h @@ -26,6 +26,13 @@ public: return clone; } + ASTSubquery() = default; + + ASTSubquery(ASTPtr child) + { + children.emplace_back(std::move(child)); + } + void updateTreeHashImpl(SipHash & hash_state, bool ignore_aliases) const override; String getAliasOrColumnName() const override; String tryGetAlias() const override; diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index 42400a0f13b..62c480e0f6b 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -123,7 +123,7 @@ bool ParserSubquery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) throw Exception(ErrorCodes::BAD_ARGUMENTS, "EXPLAIN in a subquery cannot have a table function or table override"); /// Replace subquery `(EXPLAIN SELECT ...)` - /// with `(SELECT * FROM viewExplain("", "", SELECT ...))` + /// with `(SELECT * FROM viewExplain('', '', (SELECT ...)))` String kind_str = ASTExplainQuery::toString(explain_query.getKind()); @@ -141,7 +141,7 @@ bool ParserSubquery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) auto view_explain = makeASTFunction("viewExplain", std::make_shared(kind_str), std::make_shared(settings_str), - explained_ast); + std::make_shared(explained_ast)); result_node = buildSelectFromTableFunction(view_explain); } else @@ -161,8 +161,7 @@ bool ParserSubquery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) return false; ++pos; - node = std::make_shared(); - node->children.push_back(result_node); + node = std::make_shared(std::move(result_node)); return true; } diff --git a/src/Parsers/ExpressionListParsers.cpp b/src/Parsers/ExpressionListParsers.cpp index 1e9383f96ae..6d267a7d215 100644 --- a/src/Parsers/ExpressionListParsers.cpp +++ b/src/Parsers/ExpressionListParsers.cpp @@ -225,8 +225,7 @@ static bool modifyAST(ASTPtr ast, SubqueryFunctionType type) select_with_union_query->list_of_selects->children.push_back(std::move(select_query)); select_with_union_query->children.push_back(select_with_union_query->list_of_selects); - auto new_subquery = std::make_shared(); - new_subquery->children.push_back(select_with_union_query); + auto new_subquery = std::make_shared(std::move(select_with_union_query)); ast->children[0]->children.back() = std::move(new_subquery); return true; @@ -1582,8 +1581,7 @@ public: if (!ParserToken(TokenType::ClosingRoundBracket).ignore(pos, expected)) return false; - auto subquery = std::make_shared(); - subquery->children.push_back(std::move(node)); + auto subquery = std::make_shared(std::move(node)); elements = {makeASTFunction("exists", subquery)}; finished = true; diff --git a/src/Parsers/Kusto/KustoFunctions/KQLFunctionFactory.cpp b/src/Parsers/Kusto/KustoFunctions/KQLFunctionFactory.cpp index adac892b49d..044cc2e0622 100644 --- a/src/Parsers/Kusto/KustoFunctions/KQLFunctionFactory.cpp +++ 
b/src/Parsers/Kusto/KustoFunctions/KQLFunctionFactory.cpp @@ -359,7 +359,7 @@ std::unique_ptr KQLFunctionFactory::get(String & kql_functio return std::make_unique(); case KQLFunctionValue::extract_json: - return std::make_unique(); + return std::make_unique(); case KQLFunctionValue::has_any_index: return std::make_unique(); @@ -389,7 +389,7 @@ std::unique_ptr KQLFunctionFactory::get(String & kql_functio return std::make_unique(); case KQLFunctionValue::parse_json: - return std::make_unique(); + return std::make_unique(); case KQLFunctionValue::parse_url: return std::make_unique(); diff --git a/src/Parsers/Kusto/KustoFunctions/KQLStringFunctions.cpp b/src/Parsers/Kusto/KustoFunctions/KQLStringFunctions.cpp index 0f9ca67d6dc..afb8809c69e 100644 --- a/src/Parsers/Kusto/KustoFunctions/KQLStringFunctions.cpp +++ b/src/Parsers/Kusto/KustoFunctions/KQLStringFunctions.cpp @@ -240,7 +240,7 @@ bool ExtractAll::convertImpl(String & out, IParser::Pos & pos) return true; } -bool ExtractJson::convertImpl(String & out, IParser::Pos & pos) +bool ExtractJSON::convertImpl(String & out, IParser::Pos & pos) { String datatype = "String"; ParserKeyword s_kql("typeof"); @@ -431,7 +431,7 @@ bool ParseCSV::convertImpl(String & out, IParser::Pos & pos) return true; } -bool ParseJson::convertImpl(String & out, IParser::Pos & pos) +bool ParseJSON::convertImpl(String & out, IParser::Pos & pos) { const String fn_name = getKQLFunctionName(pos); if (fn_name.empty()) diff --git a/src/Parsers/Kusto/KustoFunctions/KQLStringFunctions.h b/src/Parsers/Kusto/KustoFunctions/KQLStringFunctions.h index 492a59263ec..9b0c6327e01 100644 --- a/src/Parsers/Kusto/KustoFunctions/KQLStringFunctions.h +++ b/src/Parsers/Kusto/KustoFunctions/KQLStringFunctions.h @@ -62,7 +62,7 @@ protected: bool convertImpl(String & out, IParser::Pos & pos) override; }; -class ExtractJson : public IParserKQLFunction +class ExtractJSON : public IParserKQLFunction { protected: const char * getName() const override { return "extract_json(), extractjson()"; } @@ -125,7 +125,7 @@ protected: bool convertImpl(String & out, IParser::Pos & pos) override; }; -class ParseJson : public IParserKQLFunction +class ParseJSON : public IParserKQLFunction { protected: const char * getName() const override { return "parse_json()"; } diff --git a/src/Parsers/Kusto/ParserKQLQuery.cpp b/src/Parsers/Kusto/ParserKQLQuery.cpp index 47986943662..30e9921e744 100644 --- a/src/Parsers/Kusto/ParserKQLQuery.cpp +++ b/src/Parsers/Kusto/ParserKQLQuery.cpp @@ -576,20 +576,19 @@ bool ParserKQLSubquery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) if (!ParserKQLTableFunction().parse(pos, select_node, expected)) return false; - ASTPtr node_subquery = std::make_shared(); - node_subquery->children.push_back(select_node); + ASTPtr node_subquery = std::make_shared(std::move(select_node)); ASTPtr node_table_expr = std::make_shared(); node_table_expr->as()->subquery = node_subquery; node_table_expr->children.emplace_back(node_subquery); - ASTPtr node_table_in_select_query_emlement = std::make_shared(); - node_table_in_select_query_emlement->as()->table_expression = node_table_expr; + ASTPtr node_table_in_select_query_element = std::make_shared(); + node_table_in_select_query_element->as()->table_expression = node_table_expr; ASTPtr res = std::make_shared(); - res->children.emplace_back(node_table_in_select_query_emlement); + res->children.emplace_back(node_table_in_select_query_element); node = res; return true; @@ -618,20 +617,19 @@ bool ParserSimpleCHSubquery::parseImpl(Pos & pos, ASTPtr 
& node, Expected & expe ASTSelectQuery::Expression::TABLES, parent_select_node->as()->tables()); } - ASTPtr node_subquery = std::make_shared(); - node_subquery->children.push_back(sub_select_node); + ASTPtr node_subquery = std::make_shared(std::move(sub_select_node)); ASTPtr node_table_expr = std::make_shared(); node_table_expr->as()->subquery = node_subquery; node_table_expr->children.emplace_back(node_subquery); - ASTPtr node_table_in_select_query_emlement = std::make_shared(); - node_table_in_select_query_emlement->as()->table_expression = node_table_expr; + ASTPtr node_table_in_select_query_element = std::make_shared(); + node_table_in_select_query_element->as()->table_expression = node_table_expr; ASTPtr res = std::make_shared(); - res->children.emplace_back(node_table_in_select_query_emlement); + res->children.emplace_back(node_table_in_select_query_element); node = res; return true; diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp index 1f6f68c9d8e..27c6e6258e3 100644 --- a/src/Parsers/ParserCreateQuery.cpp +++ b/src/Parsers/ParserCreateQuery.cpp @@ -684,6 +684,7 @@ bool ParserCreateTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe query->database = table_id->getDatabase(); query->table = table_id->getTable(); query->uuid = table_id->uuid; + query->has_uuid = table_id->uuid != UUIDHelpers::Nil; if (query->database) query->children.push_back(query->database); @@ -783,6 +784,7 @@ bool ParserCreateTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe query->database = table_id->getDatabase(); query->table = table_id->getTable(); query->uuid = table_id->uuid; + query->has_uuid = table_id->uuid != UUIDHelpers::Nil; query->cluster = cluster_str; if (query->database) diff --git a/src/Parsers/ParserSelectQuery.cpp b/src/Parsers/ParserSelectQuery.cpp index 641e74b5f18..6397a2a2a55 100644 --- a/src/Parsers/ParserSelectQuery.cpp +++ b/src/Parsers/ParserSelectQuery.cpp @@ -1,21 +1,23 @@ -#include +#include + +#include +#include +#include +#include #include +#include #include -#include #include #include #include -#include +#include #include -#include +#include #include #include -#include -#include -#include -#include #include +#include namespace DB { @@ -290,9 +292,9 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) } else if (order_expression_list->children.size() == 1) { - /// ORDER BY ALL - auto * identifier = order_expression_list->children[0]->as()->children[0]->as(); - if (identifier != nullptr && Poco::toUpper(identifier->name()) == "ALL") + /// ORDER BY * + auto * asterisk = order_expression_list->children[0]->as()->children[0]->as(); + if (asterisk != nullptr) select_query->order_by_all = true; } } diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index 227ac86d3a5..e6a459d0e8a 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -801,14 +801,18 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres table_expression_query_info.prewhere_info->prewhere_actions = filter_info.actions; table_expression_query_info.prewhere_info->prewhere_column_name = filter_info.column_name; table_expression_query_info.prewhere_info->remove_prewhere_column = filter_info.do_remove_column; + table_expression_query_info.prewhere_info->need_filter = true; } - else + else if (!table_expression_query_info.prewhere_info->row_level_filter) { table_expression_query_info.prewhere_info->row_level_filter = filter_info.actions; 
table_expression_query_info.prewhere_info->row_level_column_name = filter_info.column_name; + table_expression_query_info.prewhere_info->need_filter = true; + } + else + { + where_filters.emplace_back(filter_info, std::move(description)); } - - table_expression_query_info.prewhere_info->need_filter = true; } else { diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp index 580aaa2b259..a06bacd7d3b 100644 --- a/src/Processors/Executors/PipelineExecutor.cpp +++ b/src/Processors/Executors/PipelineExecutor.cpp @@ -138,8 +138,8 @@ bool PipelineExecutor::executeStep(std::atomic_bool * yield_flag) initializeExecution(1, true); // Acquire slot until we are done - single_thread_slot = slots->tryAcquire(); - chassert(single_thread_slot && "Unable to allocate slot for the first thread, but we just allocated at least one slot"); + single_thread_cpu_slot = cpu_slots->tryAcquire(); + chassert(single_thread_cpu_slot && "Unable to allocate cpu slot for the first thread, but we just allocated at least one slot"); if (yield_flag && *yield_flag) return true; @@ -155,7 +155,7 @@ bool PipelineExecutor::executeStep(std::atomic_bool * yield_flag) if (node->exception) std::rethrow_exception(node->exception); - single_thread_slot.reset(); + single_thread_cpu_slot.reset(); finalizeExecution(); return false; @@ -333,8 +333,8 @@ void PipelineExecutor::initializeExecution(size_t num_threads, bool concurrency_ /// Allocate CPU slots from concurrency control size_t min_threads = concurrency_control ? 1uz : num_threads; - slots = ConcurrencyControl::instance().allocate(min_threads, num_threads); - use_threads = slots->grantedCount(); + cpu_slots = ConcurrencyControl::instance().allocate(min_threads, num_threads); + use_threads = cpu_slots->grantedCount(); Queue queue; graph->initializeExecution(queue); @@ -348,7 +348,7 @@ void PipelineExecutor::initializeExecution(size_t num_threads, bool concurrency_ void PipelineExecutor::spawnThreads() { - while (auto slot = slots->tryAcquire()) + while (auto slot = cpu_slots->tryAcquire()) { size_t thread_num = threads.fetch_add(1); @@ -405,7 +405,7 @@ void PipelineExecutor::executeImpl(size_t num_threads, bool concurrency_control) } else { - auto slot = slots->tryAcquire(); + auto slot = cpu_slots->tryAcquire(); executeSingleThread(0); } diff --git a/src/Processors/Executors/PipelineExecutor.h b/src/Processors/Executors/PipelineExecutor.h index 862a460f0ed..cb74b524163 100644 --- a/src/Processors/Executors/PipelineExecutor.h +++ b/src/Processors/Executors/PipelineExecutor.h @@ -68,8 +68,8 @@ private: ExecutorTasks tasks; /// Concurrency control related - ConcurrencyControl::AllocationPtr slots; - ConcurrencyControl::SlotPtr single_thread_slot; // slot for single-thread mode to work using executeStep() + SlotAllocationPtr cpu_slots; + AcquiredSlotPtr single_thread_cpu_slot; // cpu slot for single-thread mode to work using executeStep() std::unique_ptr pool; std::atomic_size_t threads = 0; diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index 79b7ca17a5a..45523700a5d 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -81,7 +81,7 @@ IIRowSchemaReader::IIRowSchemaReader(ReadBuffer & in_, const FormatSettings & fo { } -void IIRowSchemaReader::setContext(ContextPtr & context) +void IIRowSchemaReader::setContext(const ContextPtr & context) { ColumnsDescription columns; if (tryParseColumnsListFromString(hints_str, columns, context, 
hints_parsing_error)) diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h index 94df71a88b4..23c6606a6bd 100644 --- a/src/Processors/Formats/ISchemaReader.h +++ b/src/Processors/Formats/ISchemaReader.h @@ -34,7 +34,7 @@ public: virtual bool hasStrictOrderOfColumns() const { return true; } virtual bool needContext() const { return false; } - virtual void setContext(ContextPtr &) {} + virtual void setContext(const ContextPtr &) {} virtual void setMaxRowsAndBytesToRead(size_t, size_t) {} virtual size_t getNumRowsRead() const { return 0; } @@ -56,7 +56,7 @@ public: IIRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_ = nullptr); bool needContext() const override { return !hints_str.empty(); } - void setContext(ContextPtr & context) override; + void setContext(const ContextPtr & context) override; protected: void setMaxRowsAndBytesToRead(size_t max_rows, size_t max_bytes) override diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 8dc8fa516dc..8ef2cda5587 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -212,7 +212,7 @@ static AvroDeserializer::DeserializeFn createDecimalDeserializeFn(const avro::No }; } -static std::string nodeToJson(avro::NodePtr root_node) +static std::string nodeToJSON(avro::NodePtr root_node) { std::ostringstream ss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM ss.exceptions(std::ios::failbit); @@ -641,7 +641,7 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(const avro throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Type {} is not compatible with Avro {}:\n{}", - target_type->getName(), avro::toString(root_node->type()), nodeToJson(root_node)); + target_type->getName(), avro::toString(root_node->type()), nodeToJSON(root_node)); } AvroDeserializer::SkipFn AvroDeserializer::createSkipFn(const avro::NodePtr & root_node) diff --git a/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp b/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp index bf584b759f8..f91f7cf536b 100644 --- a/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp +++ b/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -28,7 +27,6 @@ #include #include #include -#include namespace DB diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp index 53cb5a77898..62d33d36206 100644 --- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp @@ -215,7 +215,7 @@ JSONColumnsSchemaReaderBase::JSONColumnsSchemaReaderBase( { } -void JSONColumnsSchemaReaderBase::setContext(ContextPtr & ctx) +void JSONColumnsSchemaReaderBase::setContext(const ContextPtr & ctx) { ColumnsDescription columns; if (tryParseColumnsListFromString(hints_str, columns, ctx, hints_parsing_error)) diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h index fe80d77cd87..ee7e79afc54 100644 --- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h @@ -84,7 +84,7 @@ public: void transformTypesFromDifferentFilesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) 
override; bool needContext() const override { return !hints_str.empty(); } - void setContext(ContextPtr & ctx) override; + void setContext(const ContextPtr & ctx) override; void setMaxRowsAndBytesToRead(size_t max_rows, size_t max_bytes) override { diff --git a/src/Processors/Formats/Impl/JSONRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONRowInputFormat.cpp index 23faa057715..67652a2cb0d 100644 --- a/src/Processors/Formats/Impl/JSONRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONRowInputFormat.cpp @@ -71,27 +71,36 @@ void JSONRowInputFormat::resetReadBuffer() JSONEachRowRowInputFormat::resetReadBuffer(); } -JSONRowSchemaReader::JSONRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) - : JSONRowSchemaReader(std::make_unique(in_), format_settings_) +JSONRowSchemaReader::JSONRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, bool fallback_to_json_each_row_) + : JSONRowSchemaReader(std::make_unique(in_), format_settings_, fallback_to_json_each_row_) { } -JSONRowSchemaReader::JSONRowSchemaReader(std::unique_ptr buf, const DB::FormatSettings & format_settings_) - : JSONEachRowSchemaReader(*buf, format_settings_), peekable_buf(std::move(buf)) +JSONRowSchemaReader::JSONRowSchemaReader(std::unique_ptr buf, const DB::FormatSettings & format_settings_, bool fallback_to_json_each_row_) + : JSONEachRowSchemaReader(*buf, format_settings_), peekable_buf(std::move(buf)), fallback_to_json_each_row(fallback_to_json_each_row_) { } NamesAndTypesList JSONRowSchemaReader::readSchema() { skipBOMIfExists(*peekable_buf); - PeekableReadBufferCheckpoint checkpoint(*peekable_buf); - /// Try to parse metadata, if failed, try to parse data as JSONEachRow format - NamesAndTypesList names_and_types; - if (JSONUtils::checkAndSkipObjectStart(*peekable_buf) && JSONUtils::tryReadMetadata(*peekable_buf, names_and_types)) - return names_and_types; - peekable_buf->rollbackToCheckpoint(true); - return JSONEachRowSchemaReader::readSchema(); + if (fallback_to_json_each_row) + { + PeekableReadBufferCheckpoint checkpoint(*peekable_buf); + /// Try to parse metadata, if failed, try to parse data as JSONEachRow format + NamesAndTypesList names_and_types; + if (JSONUtils::checkAndSkipObjectStart(*peekable_buf) && JSONUtils::tryReadMetadata(*peekable_buf, names_and_types)) + return names_and_types; + + peekable_buf->rollbackToCheckpoint(true); + return JSONEachRowSchemaReader::readSchema(); + } + else + { + JSONUtils::skipObjectStart(*peekable_buf); + return JSONUtils::readMetadata(*peekable_buf); + } } void registerInputFormatJSON(FormatFactory & factory) @@ -110,19 +119,19 @@ void registerInputFormatJSON(FormatFactory & factory) void registerJSONSchemaReader(FormatFactory & factory) { - auto register_schema_reader = [&](const String & format) + auto register_schema_reader = [&](const String & format, bool fallback_to_json_each_row) { factory.registerSchemaReader( - format, [](ReadBuffer & buf, const FormatSettings & format_settings) { return std::make_unique(buf, format_settings); }); + format, [fallback_to_json_each_row](ReadBuffer & buf, const FormatSettings & format_settings) { return std::make_unique(buf, format_settings, fallback_to_json_each_row); }); factory.registerAdditionalInfoForSchemaCacheGetter(format, [](const FormatSettings & settings) { return getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON); }); }; - register_schema_reader("JSON"); + register_schema_reader("JSON", true); /// JSONCompact has the same suffix with metadata. 
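The new fallback_to_json_each_row flag above makes the two registrations behave differently: the JSON schema reader still rolls back and infers the schema JSONEachRow-style when no leading metadata object is present, while JSONCompact (registered just below) now requires the metadata header. A simplified, self-contained sketch of that decision follows; the stubs are illustrative only, not the ClickHouse API, which works on a PeekableReadBuffer with checkpoint/rollback rather than a string.

```cpp
#include <map>
#include <optional>
#include <stdexcept>
#include <string>

// Illustrative stand-ins for the real schema machinery.
using Schema = std::map<std::string, std::string>;

std::optional<Schema> tryReadMetadata(const std::string & data)
{
    // Hypothetical stub: succeed only when the input starts with a metadata object.
    if (data.rfind("{\"meta\":", 0) == 0)
        return Schema{{"x", "UInt64"}};
    return std::nullopt;
}

Schema inferFromRows(const std::string &) { return Schema{{"x", "Int64"}}; } // stub

// Mirrors the shape of the new readSchema(): the flag chosen at registration time
// decides whether a missing metadata header falls back to JSONEachRow-style
// inference (JSON) or is treated as an error (JSONCompact).
Schema readSchemaSketch(const std::string & data, bool fallback_to_json_each_row)
{
    if (auto schema = tryReadMetadata(data))
        return *schema;
    if (fallback_to_json_each_row)
        return inferFromRows(data);
    throw std::runtime_error("expected a leading JSON metadata object");
}
```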
- register_schema_reader("JSONCompact"); + register_schema_reader("JSONCompact", false); } } diff --git a/src/Processors/Formats/Impl/JSONRowInputFormat.h b/src/Processors/Formats/Impl/JSONRowInputFormat.h index b2e1d8a3d6d..6db5cee380a 100644 --- a/src/Processors/Formats/Impl/JSONRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONRowInputFormat.h @@ -45,16 +45,17 @@ private: class JSONRowSchemaReader : public JSONEachRowSchemaReader { public: - JSONRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + JSONRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, bool fallback_to_json_each_row_); NamesAndTypesList readSchema() override; bool hasStrictOrderOfColumns() const override { return false; } private: - JSONRowSchemaReader(std::unique_ptr buf, const FormatSettings & format_settings_); + JSONRowSchemaReader(std::unique_ptr buf, const FormatSettings & format_settings_, bool fallback_to_json_each_row_); std::unique_ptr peekable_buf; + bool fallback_to_json_each_row; }; } diff --git a/src/Processors/Formats/Impl/Parquet/Write.cpp b/src/Processors/Formats/Impl/Parquet/Write.cpp index 02ca2734ff8..4d71e0102d8 100644 --- a/src/Processors/Formats/Impl/Parquet/Write.cpp +++ b/src/Processors/Formats/Impl/Parquet/Write.cpp @@ -409,7 +409,7 @@ PODArray & compress(PODArray & source, PODArray & scratch, Com #pragma clang diagnostic pop if (max_dest_size > std::numeric_limits::max()) - throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column of size {}", formatReadableSizeWithBinarySuffix(source.size())); + throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column of size {}", ReadableSize(source.size())); scratch.resize(max_dest_size); diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index a6e4600d83b..f5edfb7c9d4 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -609,7 +609,9 @@ void registerTemplateSchemaReader(FormatFactory & factory) { size_t index = 0; auto idx_getter = [&](const String &) -> std::optional { return index++; }; - auto row_format = fillRowFormat(settings, idx_getter, false); + ParsedTemplateFormatString row_format; + if (!settings.template_settings.row_format.empty()) + row_format = fillRowFormat(settings, idx_getter, false); std::unordered_set visited_escaping_rules; String result = fmt::format("row_format={}, resultset_format={}, row_between_delimiter={}", settings.template_settings.row_format, diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h index bf2765bfd1e..f82a8c8ab64 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h @@ -37,7 +37,7 @@ public: void resetReadBuffer() override; /// TODO: remove context somehow. 
- void setContext(ContextPtr & context_) { context = Context::createCopy(context_); } + void setContext(const ContextPtr & context_) { context = Context::createCopy(context_); } const BlockMissingValues & getMissingValues() const override { return block_missing_values; } diff --git a/src/Processors/QueryPlan/PartsSplitter.cpp b/src/Processors/QueryPlan/PartsSplitter.cpp index 363fdca22c5..0fc6ddd6408 100644 --- a/src/Processors/QueryPlan/PartsSplitter.cpp +++ b/src/Processors/QueryPlan/PartsSplitter.cpp @@ -54,7 +54,7 @@ public: Values getValue(size_t part_idx, size_t mark) const { - const auto & index = parts[part_idx].data_part->index; + const auto & index = parts[part_idx].data_part->getIndex(); Values values(index.size()); for (size_t i = 0; i < values.size(); ++i) { diff --git a/src/Server/CloudPlacementInfo.cpp b/src/Server/CloudPlacementInfo.cpp new file mode 100644 index 00000000000..0790f825a45 --- /dev/null +++ b/src/Server/CloudPlacementInfo.cpp @@ -0,0 +1,83 @@ +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace PlacementInfo +{ + +namespace +{ + std::string getConfigPath(std::string_view path) + { + return fmt::format("{}.{}", PLACEMENT_CONFIG_PREFIX, path); + } + + String loadAvailabilityZoneFromFile(const Poco::Util::AbstractConfiguration & config) + { + auto az_file = config.getString(getConfigPath("availability_zone_from_file"), DEFAULT_AZ_FILE_PATH); + + if (!std::filesystem::exists(az_file)) + return ""; + + String availability_zone_from_file; + + ReadBufferFromFile in(az_file); + readStringUntilEOF(availability_zone_from_file, in); + Poco::trimInPlace(availability_zone_from_file); + + return availability_zone_from_file; + } +} + + +PlacementInfo & PlacementInfo::instance() +{ + static PlacementInfo instance; + return instance; +} + +void PlacementInfo::initialize(const Poco::Util::AbstractConfiguration & config) +{ + use_imds = config.getBool(getConfigPath("use_imds"), false); + + if (use_imds) + { + availability_zone = S3::getRunningAvailabilityZone(); + } + else + { + availability_zone = config.getString(getConfigPath("availability_zone"), ""); + + if (availability_zone.empty()) + availability_zone = loadAvailabilityZoneFromFile(config); + + if (availability_zone.empty()) + LOG_WARNING(log, "Availability zone info not found"); + } + + LOG_DEBUG(log, "Loaded info: availability_zone: {}", availability_zone); + initialized = true; +} + +std::string PlacementInfo::getAvailabilityZone() const +{ + if (!initialized) + { + LOG_WARNING(log, "Placement info has not been loaded"); + return ""; + } + + return availability_zone; +} + +} +} diff --git a/src/Server/CloudPlacementInfo.h b/src/Server/CloudPlacementInfo.h new file mode 100644 index 00000000000..407f668142f --- /dev/null +++ b/src/Server/CloudPlacementInfo.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +namespace PlacementInfo +{ + +static constexpr auto PLACEMENT_CONFIG_PREFIX = "placement"; +static constexpr auto DEFAULT_AZ_FILE_PATH = "/run/instance-metadata/node-zone"; + +/// A singleton providing information on where in cloud server is running. 
+class PlacementInfo : private boost::noncopyable +{ +public: + static PlacementInfo & instance(); + + void initialize(const Poco::Util::AbstractConfiguration & config); + + std::string getAvailabilityZone() const; + +private: + PlacementInfo() = default; + + LoggerPtr log = getLogger("CloudPlacementInfo"); + + bool initialized; + + bool use_imds; + std::string availability_zone; +}; + +} +} diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index 72e7c5552f8..35a95c0534d 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -125,7 +125,7 @@ namespace ErrorCodes namespace { -bool tryAddHttpOptionHeadersFromConfig(HTTPServerResponse & response, const Poco::Util::LayeredConfiguration & config) +bool tryAddHTTPOptionHeadersFromConfig(HTTPServerResponse & response, const Poco::Util::LayeredConfiguration & config) { if (config.has("http_options_response")) { @@ -153,7 +153,7 @@ bool tryAddHttpOptionHeadersFromConfig(HTTPServerResponse & response, const Poco void processOptionsRequest(HTTPServerResponse & response, const Poco::Util::LayeredConfiguration & config) { /// If can add some headers from config - if (tryAddHttpOptionHeadersFromConfig(response, config)) + if (tryAddHTTPOptionHeadersFromConfig(response, config)) { response.setKeepAlive(false); response.setStatusAndReason(HTTPResponse::HTTP_NO_CONTENT); @@ -496,7 +496,7 @@ bool HTTPHandler::authenticateUser( else if (request.getMethod() == HTTPServerRequest::HTTP_POST) http_method = ClientInfo::HTTPMethod::POST; - session->setHttpClientInfo(http_method, request.get("User-Agent", ""), request.get("Referer", "")); + session->setHTTPClientInfo(http_method, request.get("User-Agent", ""), request.get("Referer", "")); session->setForwardedFor(request.get("X-Forwarded-For", "")); session->setQuotaClientKey(quota_key); @@ -1065,7 +1065,7 @@ void HTTPHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse response.set("X-ClickHouse-Server-Display-Name", server_display_name); if (!request.get("Origin", "").empty()) - tryAddHttpOptionHeadersFromConfig(response, server.config()); + tryAddHTTPOptionHeadersFromConfig(response, server.config()); /// For keep-alive to work. if (request.getVersion() == HTTPServerRequest::HTTP_1_1) diff --git a/src/Server/HTTPHandlerFactory.cpp b/src/Server/HTTPHandlerFactory.cpp index 66b55f68217..06ca1182be5 100644 --- a/src/Server/HTTPHandlerFactory.cpp +++ b/src/Server/HTTPHandlerFactory.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -7,6 +8,7 @@ #include #include "HTTPHandler.h" +#include "Server/PrometheusMetricsWriter.h" #include "StaticRequestHandler.h" #include "ReplicasStatusHandler.h" #include "InterserverIOHTTPHandler.h" @@ -113,7 +115,10 @@ HTTPRequestHandlerFactoryPtr createHandlerFactory(IServer & server, const Poco:: else if (name == "InterserverIOHTTPHandler-factory" || name == "InterserverIOHTTPSHandler-factory") return createInterserverHTTPHandlerFactory(server, name); else if (name == "PrometheusHandler-factory") - return createPrometheusMainHandlerFactory(server, config, async_metrics, name); + { + auto metrics_writer = std::make_shared(config, "prometheus", async_metrics); + return createPrometheusMainHandlerFactory(server, config, metrics_writer, name); + } throw Exception(ErrorCodes::LOGICAL_ERROR, "LOGICAL ERROR: Unknown HTTP handler factory name."); } @@ -208,7 +213,7 @@ void addDefaultHandlersFactory( /// Otherwise it will be created separately, see createHandlerFactory(...). 
if (config.has("prometheus") && config.getInt("prometheus.port", 0) == 0) { - PrometheusMetricsWriter writer(config, "prometheus", async_metrics); + auto writer = std::make_shared(config, "prometheus", async_metrics); auto creator = [&server, writer] () -> std::unique_ptr { return std::make_unique(server, writer); diff --git a/src/Server/HTTPHandlerFactory.h b/src/Server/HTTPHandlerFactory.h index 94b02e52277..427d495f659 100644 --- a/src/Server/HTTPHandlerFactory.h +++ b/src/Server/HTTPHandlerFactory.h @@ -6,6 +6,7 @@ #include #include #include +#include #include @@ -130,10 +131,10 @@ createPrometheusHandlerFactory(IServer & server, AsynchronousMetrics & async_metrics, const std::string & config_prefix); -HTTPRequestHandlerFactoryPtr -createPrometheusMainHandlerFactory(IServer & server, +HTTPRequestHandlerFactoryPtr createPrometheusMainHandlerFactory( + IServer & server, const Poco::Util::AbstractConfiguration & config, - AsynchronousMetrics & async_metrics, + PrometheusMetricsWriterPtr metrics_writer, const std::string & name); /// @param server - used in handlers to check IServer::isCancelled() diff --git a/src/Server/PrometheusMetricsWriter.cpp b/src/Server/PrometheusMetricsWriter.cpp index 3d09c2165e5..d0fdcd61493 100644 --- a/src/Server/PrometheusMetricsWriter.cpp +++ b/src/Server/PrometheusMetricsWriter.cpp @@ -4,6 +4,8 @@ #include #include +#include "config.h" + namespace { @@ -38,8 +40,83 @@ void convertHelpToSingleLine(std::string & help) std::replace(help.begin(), help.end(), '\n', ' '); } +constexpr auto profile_events_prefix = "ClickHouseProfileEvents_"; +constexpr auto current_metrics_prefix = "ClickHouseMetrics_"; +constexpr auto asynchronous_metrics_prefix = "ClickHouseAsyncMetrics_"; +constexpr auto error_metrics_prefix = "ClickHouseErrorMetric_"; + +void writeEvent(DB::WriteBuffer & wb, ProfileEvents::Event event) +{ + const auto counter = ProfileEvents::global_counters[event].load(std::memory_order_relaxed); + + std::string metric_name{ProfileEvents::getName(static_cast(event))}; + std::string metric_doc{ProfileEvents::getDocumentation(static_cast(event))}; + + convertHelpToSingleLine(metric_doc); + + if (!replaceInvalidChars(metric_name)) + return; + + std::string key{profile_events_prefix + metric_name}; + + writeOutLine(wb, "# HELP", key, metric_doc); + writeOutLine(wb, "# TYPE", key, "counter"); + writeOutLine(wb, key, counter); } +void writeMetric(DB::WriteBuffer & wb, size_t metric) +{ + const auto value = CurrentMetrics::values[metric].load(std::memory_order_relaxed); + + std::string metric_name{CurrentMetrics::getName(static_cast(metric))}; + std::string metric_doc{CurrentMetrics::getDocumentation(static_cast(metric))}; + + convertHelpToSingleLine(metric_doc); + + if (!replaceInvalidChars(metric_name)) + return; + + std::string key{current_metrics_prefix + metric_name}; + + writeOutLine(wb, "# HELP", key, metric_doc); + writeOutLine(wb, "# TYPE", key, "gauge"); + writeOutLine(wb, key, value); +} + +void writeAsyncMetrics(DB::WriteBuffer & wb, const DB::AsynchronousMetricValues & values) +{ + for (const auto & name_value : values) + { + std::string key{asynchronous_metrics_prefix + name_value.first}; + + if (!replaceInvalidChars(key)) + continue; + + auto value = name_value.second; + + std::string metric_doc{value.documentation}; + convertHelpToSingleLine(metric_doc); + + writeOutLine(wb, "# HELP", key, metric_doc); + writeOutLine(wb, "# TYPE", key, "gauge"); + writeOutLine(wb, key, value.value); + } +} + +} + +#if USE_NURAFT +namespace ProfileEvents +{ + extern 
const std::vector keeper_profile_events; +} + +namespace CurrentMetrics +{ + extern const std::vector keeper_metrics; +} +#endif + namespace DB { @@ -60,65 +137,17 @@ void PrometheusMetricsWriter::write(WriteBuffer & wb) const if (send_events) { for (ProfileEvents::Event i = ProfileEvents::Event(0), end = ProfileEvents::end(); i < end; ++i) - { - const auto counter = ProfileEvents::global_counters[i].load(std::memory_order_relaxed); - - std::string metric_name{ProfileEvents::getName(static_cast(i))}; - std::string metric_doc{ProfileEvents::getDocumentation(static_cast(i))}; - - convertHelpToSingleLine(metric_doc); - - if (!replaceInvalidChars(metric_name)) - continue; - std::string key{profile_events_prefix + metric_name}; - - writeOutLine(wb, "# HELP", key, metric_doc); - writeOutLine(wb, "# TYPE", key, "counter"); - writeOutLine(wb, key, counter); - } + writeEvent(wb, i); } if (send_metrics) { for (size_t i = 0, end = CurrentMetrics::end(); i < end; ++i) - { - const auto value = CurrentMetrics::values[i].load(std::memory_order_relaxed); - - std::string metric_name{CurrentMetrics::getName(static_cast(i))}; - std::string metric_doc{CurrentMetrics::getDocumentation(static_cast(i))}; - - convertHelpToSingleLine(metric_doc); - - if (!replaceInvalidChars(metric_name)) - continue; - std::string key{current_metrics_prefix + metric_name}; - - writeOutLine(wb, "# HELP", key, metric_doc); - writeOutLine(wb, "# TYPE", key, "gauge"); - writeOutLine(wb, key, value); - } + writeMetric(wb, i); } if (send_asynchronous_metrics) - { - auto async_metrics_values = async_metrics.getValues(); - for (const auto & name_value : async_metrics_values) - { - std::string key{asynchronous_metrics_prefix + name_value.first}; - - if (!replaceInvalidChars(key)) - continue; - - auto value = name_value.second; - - std::string metric_doc{value.documentation}; - convertHelpToSingleLine(metric_doc); - - writeOutLine(wb, "# HELP", key, metric_doc); - writeOutLine(wb, "# TYPE", key, "gauge"); - writeOutLine(wb, key, value.value); - } - } + writeAsyncMetrics(wb, async_metrics.getValues()); if (send_errors) { @@ -152,4 +181,24 @@ void PrometheusMetricsWriter::write(WriteBuffer & wb) const } +void KeeperPrometheusMetricsWriter::write([[maybe_unused]] WriteBuffer & wb) const +{ +#if USE_NURAFT + if (send_events) + { + for (auto event : ProfileEvents::keeper_profile_events) + writeEvent(wb, event); + } + + if (send_metrics) + { + for (auto metric : CurrentMetrics::keeper_metrics) + writeMetric(wb, metric); + } + + if (send_asynchronous_metrics) + writeAsyncMetrics(wb, async_metrics.getValues()); +#endif +} + } diff --git a/src/Server/PrometheusMetricsWriter.h b/src/Server/PrometheusMetricsWriter.h index b909a0ddcf6..933ad909ee0 100644 --- a/src/Server/PrometheusMetricsWriter.h +++ b/src/Server/PrometheusMetricsWriter.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -19,20 +20,25 @@ public: const Poco::Util::AbstractConfiguration & config, const std::string & config_name, const AsynchronousMetrics & async_metrics_); - void write(WriteBuffer & wb) const; + virtual void write(WriteBuffer & wb) const; -private: + virtual ~PrometheusMetricsWriter() = default; + +protected: const AsynchronousMetrics & async_metrics; - const bool send_events; const bool send_metrics; const bool send_asynchronous_metrics; const bool send_errors; - - static inline constexpr auto profile_events_prefix = "ClickHouseProfileEvents_"; - static inline constexpr auto current_metrics_prefix = "ClickHouseMetrics_"; - static inline constexpr auto 
asynchronous_metrics_prefix = "ClickHouseAsyncMetrics_"; - static inline constexpr auto error_metrics_prefix = "ClickHouseErrorMetric_"; }; +class KeeperPrometheusMetricsWriter : public PrometheusMetricsWriter +{ + using PrometheusMetricsWriter::PrometheusMetricsWriter; + + void write(WriteBuffer & wb) const override; +}; + +using PrometheusMetricsWriterPtr = std::shared_ptr; + } diff --git a/src/Server/PrometheusRequestHandler.cpp b/src/Server/PrometheusRequestHandler.cpp index 8690ec9121e..dff960f7031 100644 --- a/src/Server/PrometheusRequestHandler.cpp +++ b/src/Server/PrometheusRequestHandler.cpp @@ -7,6 +7,7 @@ #include #include #include +#include "Server/PrometheusMetricsWriter.h" #include @@ -34,7 +35,7 @@ void PrometheusRequestHandler::handleRequest(HTTPServerRequest & request, HTTPSe WriteBufferFromHTTPServerResponse wb(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout, write_event); try { - metrics_writer.write(wb); + metrics_writer->write(wb); wb.finalize(); } catch (...) @@ -54,7 +55,7 @@ HTTPRequestHandlerFactoryPtr createPrometheusHandlerFactory( AsynchronousMetrics & async_metrics, const std::string & config_prefix) { - PrometheusMetricsWriter writer(config, config_prefix + ".handler", async_metrics); + auto writer = std::make_shared(config, config_prefix + ".handler", async_metrics); auto creator = [&server, writer]() -> std::unique_ptr { return std::make_unique(server, writer); @@ -66,13 +67,12 @@ HTTPRequestHandlerFactoryPtr createPrometheusHandlerFactory( } HTTPRequestHandlerFactoryPtr createPrometheusMainHandlerFactory( - IServer & server, const Poco::Util::AbstractConfiguration & config, AsynchronousMetrics & async_metrics, const std::string & name) + IServer & server, const Poco::Util::AbstractConfiguration & config, PrometheusMetricsWriterPtr metrics_writer, const std::string & name) { auto factory = std::make_shared(name); - PrometheusMetricsWriter writer(config, "prometheus", async_metrics); - auto creator = [&server, writer]() -> std::unique_ptr + auto creator = [&server, metrics_writer] { - return std::make_unique(server, writer); + return std::make_unique(server, metrics_writer); }; auto handler = std::make_shared>(std::move(creator)); diff --git a/src/Server/PrometheusRequestHandler.h b/src/Server/PrometheusRequestHandler.h index 9ec54cc2e4e..d120752c8c5 100644 --- a/src/Server/PrometheusRequestHandler.h +++ b/src/Server/PrometheusRequestHandler.h @@ -13,12 +13,12 @@ class PrometheusRequestHandler : public HTTPRequestHandler { private: IServer & server; - const PrometheusMetricsWriter & metrics_writer; + PrometheusMetricsWriterPtr metrics_writer; public: - PrometheusRequestHandler(IServer & server_, const PrometheusMetricsWriter & metrics_writer_) + PrometheusRequestHandler(IServer & server_, PrometheusMetricsWriterPtr metrics_writer_) : server(server_) - , metrics_writer(metrics_writer_) + , metrics_writer(std::move(metrics_writer_)) { } diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index e1086ac5833..833f8ecc818 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -933,7 +932,7 @@ void TCPHandler::processInsertQuery() if (auto table = DatabaseCatalog::instance().tryGetTable(insert_query.table_id, query_context)) async_insert_enabled |= table->areAsynchronousInsertsEnabled(); - if (insert_queue && async_insert_enabled && !insert_query.select && 
!settings.deduplicate_blocks_in_dependent_materialized_views) + if (insert_queue && async_insert_enabled && !insert_query.select) { auto result = processAsyncInsertQuery(*insert_queue); if (result.status == AsynchronousInsertQueue::PushResult::OK) diff --git a/src/Storages/DataLakes/IStorageDataLake.h b/src/Storages/DataLakes/IStorageDataLake.h index db3f835494f..582b55c505b 100644 --- a/src/Storages/DataLakes/IStorageDataLake.h +++ b/src/Storages/DataLakes/IStorageDataLake.h @@ -38,25 +38,25 @@ public: static ColumnsDescription getTableStructureFromData( Configuration & base_configuration, const std::optional & format_settings, - ContextPtr local_context) + const ContextPtr & local_context) { auto configuration = getConfigurationForDataRead(base_configuration, local_context); return Storage::getTableStructureFromData(configuration, format_settings, local_context); } - static Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context) + static Configuration getConfiguration(ASTs & engine_args, const ContextPtr & local_context) { return Storage::getConfiguration(engine_args, local_context, /* get_format_from_file */false); } - Configuration updateConfigurationAndGetCopy(ContextPtr local_context) override + Configuration updateConfigurationAndGetCopy(const ContextPtr & local_context) override { std::lock_guard lock(configuration_update_mutex); updateConfigurationImpl(local_context); return Storage::getConfiguration(); } - void updateConfiguration(ContextPtr local_context) override + void updateConfiguration(const ContextPtr & local_context) override { std::lock_guard lock(configuration_update_mutex); updateConfigurationImpl(local_context); @@ -64,7 +64,7 @@ public: private: static Configuration getConfigurationForDataRead( - const Configuration & base_configuration, ContextPtr local_context, const Strings & keys = {}, bool attach = false) + const Configuration & base_configuration, const ContextPtr & local_context, const Strings & keys = {}, bool attach = false) { auto configuration{base_configuration}; configuration.update(local_context); @@ -94,12 +94,12 @@ private: } } - static Strings getDataFiles(const Configuration & configuration, ContextPtr local_context) + static Strings getDataFiles(const Configuration & configuration, const ContextPtr & local_context) { return MetadataParser().getFiles(configuration, local_context); } - void updateConfigurationImpl(ContextPtr local_context) + void updateConfigurationImpl(const ContextPtr & local_context) { const bool updated = base_configuration.update(local_context); auto new_keys = getDataFiles(base_configuration, local_context); diff --git a/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp b/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp index 8a1a2cdbd8f..345f2553ccb 100644 --- a/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp +++ b/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp @@ -61,7 +61,7 @@ StorageIceberg::StorageIceberg( ColumnsDescription StorageIceberg::getTableStructureFromData( Configuration & base_configuration, const std::optional &, - ContextPtr local_context) + const ContextPtr & local_context) { auto configuration{base_configuration}; configuration.update(local_context); @@ -69,7 +69,7 @@ ColumnsDescription StorageIceberg::getTableStructureFromData( return ColumnsDescription(metadata->getTableSchema()); } -void StorageIceberg::updateConfigurationImpl(ContextPtr local_context) +void StorageIceberg::updateConfigurationImpl(const ContextPtr & local_context) { const bool updated = 
base_configuration.update(local_context); auto new_metadata = parseIcebergMetadata(base_configuration, local_context); diff --git a/src/Storages/DataLakes/Iceberg/StorageIceberg.h b/src/Storages/DataLakes/Iceberg/StorageIceberg.h index 4e63da5508a..7cae89442ff 100644 --- a/src/Storages/DataLakes/Iceberg/StorageIceberg.h +++ b/src/Storages/DataLakes/Iceberg/StorageIceberg.h @@ -52,28 +52,28 @@ public: static ColumnsDescription getTableStructureFromData( Configuration & base_configuration, const std::optional &, - ContextPtr local_context); + const ContextPtr & local_context); static Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context) { return StorageS3::getConfiguration(engine_args, local_context, /* get_format_from_file */false); } - Configuration updateConfigurationAndGetCopy(ContextPtr local_context) override + Configuration updateConfigurationAndGetCopy(const ContextPtr & local_context) override { std::lock_guard lock(configuration_update_mutex); updateConfigurationImpl(local_context); return StorageS3::getConfiguration(); } - void updateConfiguration(ContextPtr local_context) override + void updateConfiguration(const ContextPtr & local_context) override { std::lock_guard lock(configuration_update_mutex); updateConfigurationImpl(local_context); } private: - void updateConfigurationImpl(ContextPtr local_context); + void updateConfigurationImpl(const ContextPtr & local_context); std::unique_ptr current_metadata; Configuration base_configuration; diff --git a/src/Storages/Distributed/DistributedAsyncInsertDirectoryQueue.h b/src/Storages/Distributed/DistributedAsyncInsertDirectoryQueue.h index f7d7553851a..a1b436bb9c8 100644 --- a/src/Storages/Distributed/DistributedAsyncInsertDirectoryQueue.h +++ b/src/Storages/Distributed/DistributedAsyncInsertDirectoryQueue.h @@ -6,9 +6,7 @@ #include #include #include -#include #include -#include namespace CurrentMetrics { class Increment; } diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index ab21c4946e4..5e937d3d31d 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -67,6 +67,7 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern const int LOGICAL_ERROR; extern const int CANNOT_COMPILE_REGEXP; + extern const int CANNOT_DETECT_FORMAT; } namespace { @@ -194,7 +195,7 @@ StorageHDFS::StorageHDFS( const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, const String & comment, - ContextPtr context_, + const ContextPtr & context_, const String & compression_method_, const bool distributed_processing_, ASTPtr partition_by_) @@ -206,7 +207,8 @@ StorageHDFS::StorageHDFS( , distributed_processing(distributed_processing_) , partition_by(partition_by_) { - FormatFactory::instance().checkFormatName(format_name); + if (format_name != "auto") + FormatFactory::instance().checkFormatName(format_name); context_->getRemoteHostFilter().checkURL(Poco::URI(uri_)); checkHDFSURL(uri_); @@ -217,11 +219,19 @@ StorageHDFS::StorageHDFS( if (columns_.empty()) { - auto columns = getTableStructureFromData(format_name, uri_, compression_method, context_); + ColumnsDescription columns; + if (format_name == "auto") + std::tie(columns, format_name) = getTableStructureAndFormatFromData(uri_, compression_method_, context_); + else + columns = getTableStructureFromData(format_name, uri_, compression_method, context_); + storage_metadata.setColumns(columns); } else { + if (format_name == "auto") + format_name = getTableStructureAndFormatFromData(uri_, 
compression_method_, context_).second; + /// We don't allow special columns in HDFS storage. if (!columns_.hasOnlyOrdinary()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine HDFS doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); @@ -243,25 +253,25 @@ namespace ReadBufferIterator( const std::vector & paths_with_info_, const String & uri_without_path_, - const String & format_, + std::optional format_, const String & compression_method_, const ContextPtr & context_) : WithContext(context_) , paths_with_info(paths_with_info_) , uri_without_path(uri_without_path_) - , format(format_) + , format(std::move(format_)) , compression_method(compression_method_) { } - std::pair, std::optional> next() override + Data next() override { bool is_first = current_index == 0; /// For default mode check cached columns for all paths on first iteration. if (is_first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) { if (auto cached_columns = tryGetColumnsFromCache(paths_with_info)) - return {nullptr, cached_columns}; + return {nullptr, cached_columns, format}; } StorageHDFS::PathWithInfo path_with_info; @@ -271,10 +281,17 @@ namespace if (current_index == paths_with_info.size()) { if (is_first) - throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because all files are empty. " - "You must specify table structure manually", format); - return {nullptr, std::nullopt}; + { + if (format) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because all files are empty. " + "You can specify table structure manually", *format); + + throw Exception( + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files, because all files are empty. 
You can specify table structure manually"); + } + return {nullptr, std::nullopt, format}; } path_with_info = paths_with_info[current_index++]; @@ -285,7 +302,7 @@ namespace { std::vector paths = {path_with_info}; if (auto cached_columns = tryGetColumnsFromCache(paths)) - return {nullptr, cached_columns}; + return {nullptr, cached_columns, format}; } auto compression = chooseCompressionMethod(path_with_info.path, compression_method); @@ -293,7 +310,7 @@ namespace if (!getContext()->getSettingsRef().hdfs_skip_empty_files || !impl->eof()) { const Int64 zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; - return {wrapReadBufferWithCompressionMethod(std::move(impl), compression, static_cast(zstd_window_log_max)), std::nullopt}; + return {wrapReadBufferWithCompressionMethod(std::move(impl), compression, static_cast(zstd_window_log_max)), std::nullopt, format}; } } } @@ -304,7 +321,7 @@ namespace return; String source = uri_without_path + paths_with_info[current_index - 1].path; - auto key = getKeyForSchemaCache(source, format, std::nullopt, getContext()); + auto key = getKeyForSchemaCache(source, *format, std::nullopt, getContext()); StorageHDFS::getSchemaCache(getContext()).addNumRows(key, num_rows); } @@ -315,7 +332,7 @@ namespace return; String source = uri_without_path + paths_with_info[current_index - 1].path; - auto key = getKeyForSchemaCache(source, format, std::nullopt, getContext()); + auto key = getKeyForSchemaCache(source, *format, std::nullopt, getContext()); StorageHDFS::getSchemaCache(getContext()).addColumns(key, columns); } @@ -328,10 +345,15 @@ namespace Strings sources; sources.reserve(paths_with_info.size()); std::transform(paths_with_info.begin(), paths_with_info.end(), std::back_inserter(sources), [&](const StorageHDFS::PathWithInfo & path_with_info){ return uri_without_path + path_with_info.path; }); - auto cache_keys = getKeysForSchemaCache(sources, format, {}, getContext()); + auto cache_keys = getKeysForSchemaCache(sources, *format, {}, getContext()); StorageHDFS::getSchemaCache(getContext()).addManyColumns(cache_keys, columns); } + void setFormatName(const String & format_name) override + { + format = format_name; + } + String getLastFileName() const override { if (current_index != 0) @@ -340,13 +362,27 @@ namespace return ""; } + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override + { + chassert(current_index > 0 && current_index <= paths_with_info.size()); + auto path_with_info = paths_with_info[current_index - 1]; + auto compression = chooseCompressionMethod(path_with_info.path, compression_method); + auto impl = std::make_unique(uri_without_path, path_with_info.path, getContext()->getGlobalContext()->getConfigRef(), getContext()->getReadSettings()); + const Int64 zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; + return wrapReadBufferWithCompressionMethod(std::move(impl), compression, static_cast(zstd_window_log_max)); + } + private: std::optional tryGetColumnsFromCache(const std::vector & paths_with_info_) { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_hdfs) + auto context = getContext(); + + if (!context->getSettingsRef().schema_inference_use_cache_for_hdfs) return std::nullopt; - auto & schema_cache = StorageHDFS::getSchemaCache(getContext()); + auto & schema_cache = StorageHDFS::getSchemaCache(context); for (const auto & path_with_info : paths_with_info_) { auto get_last_mod_time = [&]() -> std::optional @@ -354,7 
+390,7 @@ namespace if (path_with_info.info) return path_with_info.info->last_mod_time; - auto builder = createHDFSBuilder(uri_without_path + "/", getContext()->getGlobalContext()->getConfigRef()); + auto builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef()); auto fs = createHDFSFS(builder.get()); HDFSFileInfoPtr hdfs_info(hdfsGetPathInfo(fs.get(), path_with_info.path.c_str())); if (hdfs_info) @@ -364,10 +400,28 @@ namespace }; String url = uri_without_path + path_with_info.path; - auto cache_key = getKeyForSchemaCache(url, format, {}, getContext()); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; + if (format) + { + auto cache_key = getKeyForSchemaCache(url, *format, {}, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. + /// If we have such entry for some format, we can use this format to read the file. + for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(url, format_name, {}, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + /// Now format is known. It should be the same for all files. + format = format_name; + return columns; + } + } + } } return std::nullopt; @@ -375,29 +429,49 @@ namespace const std::vector & paths_with_info; const String & uri_without_path; - const String & format; + std::optional format; const String & compression_method; size_t current_index = 0; }; } -ColumnsDescription StorageHDFS::getTableStructureFromData( - const String & format, +std::pair StorageHDFS::getTableStructureAndFormatFromDataImpl( + std::optional format, const String & uri, const String & compression_method, - ContextPtr ctx) + const ContextPtr & ctx) { const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(uri); auto paths_with_info = getPathsList(path_from_uri, uri, ctx); - if (paths_with_info.empty() && !FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format)) + if (paths_with_info.empty() && (!format || !FormatFactory::instance().checkIfFormatHasExternalSchemaReader(*format))) + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because there are no files in HDFS with provided path." + " You can specify table structure manually", *format); + throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because there are no files in HDFS with provided path." - " You must specify table structure manually", format); + "The data format cannot be detected by the contents of the files, because there are no files in HDFS with provided path." 
+ " You can specify the format manually"); + } ReadBufferIterator read_buffer_iterator(paths_with_info, uri_without_path, format, compression_method, ctx); - return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, paths_with_info.size() > 1, ctx); + if (format) + return {readSchemaFromFormat(*format, std::nullopt, read_buffer_iterator, ctx), *format}; + return detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, ctx); +} + +std::pair StorageHDFS::getTableStructureAndFormatFromData(const String & uri, const String & compression_method, const ContextPtr & ctx) +{ + return getTableStructureAndFormatFromDataImpl(std::nullopt, uri, compression_method, ctx); +} + +ColumnsDescription StorageHDFS::getTableStructureFromData(const String & format, const String & uri, const String & compression_method, const DB::ContextPtr & ctx) +{ + return getTableStructureAndFormatFromDataImpl(format, uri, compression_method, ctx).first; } class HDFSSource::DisclosedGlobIterator::Impl @@ -533,7 +607,7 @@ StorageHDFS::PathWithInfo HDFSSource::URISIterator::next() HDFSSource::HDFSSource( const ReadFromFormatInfo & info, StorageHDFSPtr storage_, - ContextPtr context_, + const ContextPtr & context_, UInt64 max_block_size_, std::shared_ptr file_iterator_, bool need_only_count_) @@ -712,7 +786,7 @@ public: HDFSSink(const String & uri, const String & format, const Block & sample_block, - ContextPtr context, + const ContextPtr & context, const CompressionMethod compression_method) : SinkToStorage(sample_block) { @@ -1073,7 +1147,7 @@ void registerStorageHDFS(StorageFactory & factory) } if (format_name == "auto") - format_name = FormatFactory::instance().getFormatFromFileName(url, true); + format_name = FormatFactory::instance().tryGetFormatFromFileName(url).value_or("auto"); String compression_method; if (engine_args.size() == 3) diff --git a/src/Storages/HDFS/StorageHDFS.h b/src/Storages/HDFS/StorageHDFS.h index 7170763c959..b36ff7ea37e 100644 --- a/src/Storages/HDFS/StorageHDFS.h +++ b/src/Storages/HDFS/StorageHDFS.h @@ -44,7 +44,7 @@ public: const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, const String & comment, - ContextPtr context_, + const ContextPtr & context_, const String & compression_method_ = "", bool distributed_processing_ = false, ASTPtr partition_by = nullptr); @@ -86,7 +86,12 @@ public: const String & format, const String & uri, const String & compression_method, - ContextPtr ctx); + const ContextPtr & ctx); + + static std::pair getTableStructureAndFormatFromData( + const String & uri, + const String & compression_method, + const ContextPtr & ctx); static SchemaCache & getSchemaCache(const ContextPtr & ctx); @@ -97,6 +102,12 @@ protected: friend class ReadFromHDFS; private: + static std::pair getTableStructureAndFormatFromDataImpl( + std::optional format, + const String & uri, + const String & compression_method, + const ContextPtr & ctx); + std::vector uris; String format_name; String compression_method; @@ -141,7 +152,7 @@ public: HDFSSource( const ReadFromFormatInfo & info, StorageHDFSPtr storage_, - ContextPtr context_, + const ContextPtr & context_, UInt64 max_block_size_, std::shared_ptr file_iterator_, bool need_only_count_); diff --git a/src/Storages/HDFS/StorageHDFSCluster.cpp b/src/Storages/HDFS/StorageHDFSCluster.cpp index fad29436102..714d6391543 100644 --- a/src/Storages/HDFS/StorageHDFSCluster.cpp +++ b/src/Storages/HDFS/StorageHDFSCluster.cpp @@ -43,12 +43,10 @@ StorageHDFSCluster::StorageHDFSCluster( const String & format_name_, 
const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - const String & compression_method_, - bool structure_argument_was_provided_) - : IStorageCluster(cluster_name_, table_id_, getLogger("StorageHDFSCluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) + const String & compression_method) + : IStorageCluster(cluster_name_, table_id_, getLogger("StorageHDFSCluster (" + table_id_.table_name + ")")) , uri(uri_) , format_name(format_name_) - , compression_method(compression_method_) { checkHDFSURL(uri_); context_->getRemoteHostFilter().checkURL(Poco::URI(uri_)); @@ -57,11 +55,20 @@ StorageHDFSCluster::StorageHDFSCluster( if (columns_.empty()) { - auto columns = StorageHDFS::getTableStructureFromData(format_name, uri_, compression_method, context_); + ColumnsDescription columns; + if (format_name == "auto") + std::tie(columns, format_name) = StorageHDFS::getTableStructureAndFormatFromData(uri_, compression_method, context_); + else + columns = StorageHDFS::getTableStructureFromData(format_name, uri_, compression_method, context_); storage_metadata.setColumns(columns); } else + { + if (format_name == "auto") + format_name = StorageHDFS::getTableStructureAndFormatFromData(uri_, compression_method, context_).second; + storage_metadata.setColumns(columns_); + } storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); @@ -69,13 +76,14 @@ StorageHDFSCluster::StorageHDFSCluster( virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); } -void StorageHDFSCluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) +void StorageHDFSCluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const DB::StorageSnapshotPtr & storage_snapshot, const DB::ContextPtr & context) { ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); if (!expression_list) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function hdfsCluster, got '{}'", queryToString(query)); - TableFunctionHDFSCluster::addColumnsStructureToArguments(expression_list->children, structure, context); + TableFunctionHDFSCluster::updateStructureAndFormatArgumentsIfNeeded( + expression_list->children, storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), format_name, context); } diff --git a/src/Storages/HDFS/StorageHDFSCluster.h b/src/Storages/HDFS/StorageHDFSCluster.h index 7c4c41a573a..40884f98984 100644 --- a/src/Storages/HDFS/StorageHDFSCluster.h +++ b/src/Storages/HDFS/StorageHDFSCluster.h @@ -28,8 +28,7 @@ public: const String & format_name_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - const String & compression_method_, - bool structure_argument_was_provided_); + const String & compression_method); std::string getName() const override { return "HDFSCluster"; } @@ -42,11 +41,10 @@ public: bool supportsTrivialCountOptimization() const override { return true; } private: - void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) override; + void updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) override; String uri; String format_name; - String compression_method; NamesAndTypesList virtual_columns; }; diff --git a/src/Storages/IStorageCluster.cpp b/src/Storages/IStorageCluster.cpp index 812b213cf33..3129da30f54 100644 
--- a/src/Storages/IStorageCluster.cpp +++ b/src/Storages/IStorageCluster.cpp @@ -32,12 +32,10 @@ namespace DB IStorageCluster::IStorageCluster( const String & cluster_name_, const StorageID & table_id_, - LoggerPtr log_, - bool structure_argument_was_provided_) + LoggerPtr log_) : IStorage(table_id_) , log(log_) , cluster_name(cluster_name_) - , structure_argument_was_provided(structure_argument_was_provided_) { } @@ -130,8 +128,7 @@ void IStorageCluster::read( query_to_send = interpreter.getQueryInfo().query->clone(); } - if (!structure_argument_was_provided) - addColumnsStructureToQuery(query_to_send, storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), context); + updateQueryToSendIfNeeded(query_to_send, storage_snapshot, context); RestoreQualifiedNamesVisitor::Data data; data.distributed_table = DatabaseAndTableWithAlias(*getTableExpression(query_info.query->as(), 0)); diff --git a/src/Storages/IStorageCluster.h b/src/Storages/IStorageCluster.h index 8d93e94be9a..f3283247672 100644 --- a/src/Storages/IStorageCluster.h +++ b/src/Storages/IStorageCluster.h @@ -19,8 +19,7 @@ public: IStorageCluster( const String & cluster_name_, const StorageID & table_id_, - LoggerPtr log_, - bool structure_argument_was_provided_); + LoggerPtr log_); void read( QueryPlan & query_plan, @@ -42,13 +41,11 @@ public: protected: virtual void updateBeforeRead(const ContextPtr &) {} - - virtual void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) = 0; + virtual void updateQueryToSendIfNeeded(ASTPtr & /*query*/, const StorageSnapshotPtr & /*storage_snapshot*/, const ContextPtr & /*context*/) {} private: LoggerPtr log; String cluster_name; - bool structure_argument_was_provided; }; diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp index 000d36752cb..e31d991ef09 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp @@ -701,7 +701,9 @@ void DataPartStorageOnDiskBase::remove( if (file_name.starts_with(proj_dir_name)) files_not_to_remove_for_projection.emplace(fs::path(file_name).filename()); - LOG_DEBUG(log, "Will not remove files [{}] for projection {}", fmt::join(files_not_to_remove_for_projection, ", "), projection.name); + if (!files_not_to_remove_for_projection.empty()) + LOG_DEBUG( + log, "Will not remove files [{}] for projection {}", fmt::join(files_not_to_remove_for_projection, ", "), projection.name); CanRemoveDescription proj_description { diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 0f82e00edff..11ede661f78 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -313,13 +313,13 @@ IMergeTreeDataPart::IMergeTreeDataPart( const IMergeTreeDataPart * parent_part_) : DataPartStorageHolder(data_part_storage_) , storage(storage_) - , mutable_name(name_) , name(mutable_name) , info(info_) , index_granularity_info(storage_, part_type_) , part_type(part_type_) , parent_part(parent_part_) , parent_part_name(parent_part ? 
parent_part->name : "") + , mutable_name(name_) { if (parent_part) { @@ -342,6 +342,27 @@ IMergeTreeDataPart::~IMergeTreeDataPart() decrementTypeMetric(part_type); } + +const IMergeTreeDataPart::Index & IMergeTreeDataPart::getIndex() const +{ + std::scoped_lock lock(index_mutex); + if (!index_loaded) + loadIndex(); + index_loaded = true; + return index; +} + + +void IMergeTreeDataPart::setIndex(Columns index_) +{ + std::scoped_lock lock(index_mutex); + if (!index.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "The index of data part can be set only once"); + index = std::move(index_); + index_loaded = true; +} + + void IMergeTreeDataPart::setName(const String & new_name) { mutable_name = new_name; @@ -548,6 +569,7 @@ void IMergeTreeDataPart::removeIfNeeded() UInt64 IMergeTreeDataPart::getIndexSizeInBytes() const { + std::scoped_lock lock(index_mutex); UInt64 res = 0; for (const ColumnPtr & column : index) res += column->byteSize(); @@ -556,6 +578,7 @@ UInt64 IMergeTreeDataPart::getIndexSizeInBytes() const UInt64 IMergeTreeDataPart::getIndexSizeInAllocatedBytes() const { + std::scoped_lock lock(index_mutex); UInt64 res = 0; for (const ColumnPtr & column : index) res += column->allocatedBytes(); @@ -669,8 +692,11 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks loadColumns(require_columns_checksums); loadChecksums(require_columns_checksums); loadIndexGranularity(); + + if (!storage.getSettings()->primary_key_lazy_load) + getIndex(); + calculateColumnsAndSecondaryIndicesSizesOnDisk(); - loadIndex(); /// Must be called after loadIndexGranularity as it uses the value of `index_granularity` loadRowsCount(); /// Must be called after loadIndexGranularity() as it uses the value of `index_granularity`. loadPartitionAndMinMaxIndex(); bool has_broken_projections = false; @@ -804,8 +830,11 @@ void IMergeTreeDataPart::appendFilesOfIndexGranularity(Strings & /* files */) co { } -void IMergeTreeDataPart::loadIndex() +void IMergeTreeDataPart::loadIndex() const { + /// Memory for index must not be accounted as memory usage for query, because it belongs to a table. 
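+ /// The blocker below is scoped to this function, so memory tracking for the current thread resumes once loadIndex() returns.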
+ MemoryTrackerBlockerInThread temporarily_disable_memory_tracker; + /// It can be empty in case of mutations if (!index_granularity.isInitialized()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Index granularity is not loaded before index loading"); @@ -842,6 +871,7 @@ void IMergeTreeDataPart::loadIndex() for (size_t i = 0; i < key_size; ++i) { + loaded_index[i]->shrinkToFit(); loaded_index[i]->protect(); if (loaded_index[i]->size() != marks_count) throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Cannot read all data from index file {}(expected size: " diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index fcf9d5bd17d..0d7acfab891 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -75,6 +76,7 @@ public: using ColumnSizeByName = std::unordered_map; using NameToNumber = std::unordered_map; + using Index = Columns; using IndexSizeByName = std::unordered_map; using Type = MergeTreeDataPartType; @@ -212,10 +214,6 @@ public: const MergeTreeData & storage; -private: - String mutable_name; - mutable MergeTreeDataPartState state{MergeTreeDataPartState::Temporary}; - public: const String & name; // const ref to private mutable_name MergeTreePartInfo info; @@ -309,12 +307,6 @@ public: /// Throws an exception if state of the part is not in affordable_states void assertState(const std::initializer_list & affordable_states) const; - /// Primary key (correspond to primary.idx file). - /// Always loaded in RAM. Contains each index_granularity-th value of primary key tuple. - /// Note that marks (also correspond to primary key) is not always in RAM, but cached. See MarkCache.h. - using Index = Columns; - Index index; - MergeTreePartition partition; /// Amount of rows between marks @@ -369,6 +361,9 @@ public: /// Version of part metadata (columns, pk and so on). Managed properly only for replicated merge tree. int32_t metadata_version; + const Index & getIndex() const; + void setIndex(Columns index_); + /// For data in RAM ('index') UInt64 getIndexSizeInBytes() const; UInt64 getIndexSizeInAllocatedBytes() const; @@ -567,6 +562,12 @@ public: mutable std::atomic last_removal_attempt_time = 0; protected: + /// Primary key (correspond to primary.idx file). + /// Lazily loaded in RAM. Contains each index_granularity-th value of primary key tuple. + /// Note that marks (also correspond to primary key) are not always in RAM, but cached. See MarkCache.h. + mutable std::mutex index_mutex; + mutable Index index TSA_GUARDED_BY(index_mutex); + mutable bool index_loaded TSA_GUARDED_BY(index_mutex) = false; /// Total size of all columns, calculated once in calcuateColumnSizesOnDisk ColumnSize total_columns_size; @@ -623,6 +624,9 @@ protected: void initializeIndexGranularityInfo(); private: + String mutable_name; + mutable MergeTreeDataPartState state{MergeTreeDataPartState::Temporary}; + /// In compact parts order of columns is necessary NameToNumber column_name_to_position; @@ -660,8 +664,8 @@ private: virtual void appendFilesOfIndexGranularity(Strings & files) const; - /// Loads index file. - void loadIndex(); + /// Loads the index file. 
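+ /// Called lazily from getIndex() under index_mutex.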
+ void loadIndex() const TSA_REQUIRES(index_mutex); void appendFilesOfIndex(Strings & files) const; diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index 9cbcdbaaaaa..e6ae63da7e3 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -436,7 +436,7 @@ MergeTask::StageRuntimeContextPtr MergeTask::VerticalMergeStage::getContextForNe bool MergeTask::ExecuteAndFinalizeHorizontalPart::execute() { assert(subtasks_iterator != subtasks.end()); - if ((*subtasks_iterator)()) + if ((this->**subtasks_iterator)()) return true; /// Move to the next subtask in an array of subtasks @@ -827,7 +827,7 @@ bool MergeTask::MergeProjectionsStage::finalizeProjectionsAndWholeMerge() const bool MergeTask::VerticalMergeStage::execute() { assert(subtasks_iterator != subtasks.end()); - if ((*subtasks_iterator)()) + if ((this->**subtasks_iterator)()) return true; /// Move to the next subtask in an array of subtasks @@ -838,7 +838,7 @@ bool MergeTask::VerticalMergeStage::execute() bool MergeTask::MergeProjectionsStage::execute() { assert(subtasks_iterator != subtasks.end()); - if ((*subtasks_iterator)()) + if ((this->**subtasks_iterator)()) return true; /// Move to the next subtask in an array of subtasks diff --git a/src/Storages/MergeTree/MergeTask.h b/src/Storages/MergeTree/MergeTask.h index 6f5336baaad..7fb4797e482 100644 --- a/src/Storages/MergeTree/MergeTask.h +++ b/src/Storages/MergeTree/MergeTask.h @@ -246,15 +246,16 @@ private: bool prepare(); bool executeImpl(); - using ExecuteAndFinalizeHorizontalPartSubtasks = std::array, 2>; + /// NOTE: Using pointer-to-member instead of std::function and lambda makes stacktraces much more concise and readable + using ExecuteAndFinalizeHorizontalPartSubtasks = std::array; - ExecuteAndFinalizeHorizontalPartSubtasks subtasks + const ExecuteAndFinalizeHorizontalPartSubtasks subtasks { - [this] () { return prepare(); }, - [this] () { return executeImpl(); } + &ExecuteAndFinalizeHorizontalPart::prepare, + &ExecuteAndFinalizeHorizontalPart::executeImpl }; - ExecuteAndFinalizeHorizontalPartSubtasks::iterator subtasks_iterator = subtasks.begin(); + ExecuteAndFinalizeHorizontalPartSubtasks::const_iterator subtasks_iterator = subtasks.begin(); MergeAlgorithm chooseMergeAlgorithm() const; @@ -323,16 +324,17 @@ private: bool executeVerticalMergeForAllColumns() const; bool finalizeVerticalMergeForAllColumns() const; - using VerticalMergeStageSubtasks = std::array, 3>; + /// NOTE: Using pointer-to-member instead of std::function and lambda makes stacktraces much more concise and readable + using VerticalMergeStageSubtasks = std::array; - VerticalMergeStageSubtasks subtasks + const VerticalMergeStageSubtasks subtasks { - [this] () { return prepareVerticalMergeForAllColumns(); }, - [this] () { return executeVerticalMergeForAllColumns(); }, - [this] () { return finalizeVerticalMergeForAllColumns(); } + &VerticalMergeStage::prepareVerticalMergeForAllColumns, + &VerticalMergeStage::executeVerticalMergeForAllColumns, + &VerticalMergeStage::finalizeVerticalMergeForAllColumns }; - VerticalMergeStageSubtasks::iterator subtasks_iterator = subtasks.begin(); + VerticalMergeStageSubtasks::const_iterator subtasks_iterator = subtasks.begin(); void prepareVerticalMergeForOneColumn() const; bool executeVerticalMergeForOneColumn() const; @@ -373,16 +375,17 @@ private: bool executeProjections() const; bool finalizeProjectionsAndWholeMerge() const; - using MergeProjectionsStageSubtasks = std::array, 3>; + /// NOTE: Using 
pointer-to-member instead of std::function and lambda makes stacktraces much more concise and readable + using MergeProjectionsStageSubtasks = std::array; - MergeProjectionsStageSubtasks subtasks + const MergeProjectionsStageSubtasks subtasks { - [this] () { return mergeMinMaxIndexAndPrepareProjections(); }, - [this] () { return executeProjections(); }, - [this] () { return finalizeProjectionsAndWholeMerge(); } + &MergeProjectionsStage::mergeMinMaxIndexAndPrepareProjections, + &MergeProjectionsStage::executeProjections, + &MergeProjectionsStage::finalizeProjectionsAndWholeMerge }; - MergeProjectionsStageSubtasks::iterator subtasks_iterator = subtasks.begin(); + MergeProjectionsStageSubtasks::const_iterator subtasks_iterator = subtasks.begin(); MergeProjectionsRuntimeContextPtr ctx; GlobalRuntimeContextPtr global_ctx; @@ -392,14 +395,14 @@ private: using Stages = std::array; - Stages stages + const Stages stages { std::make_shared(), std::make_shared(), std::make_shared() }; - Stages::iterator stages_iterator = stages.begin(); + Stages::const_iterator stages_iterator = stages.begin(); /// Check for persisting block number column static bool supportsBlockNumberColumn(GlobalRuntimeContextPtr global_ctx) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index c8262914702..e14a358745e 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -6824,7 +6824,7 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( { for (const auto & part : real_parts) { - const auto & primary_key_column = *part->index[0]; + const auto & primary_key_column = *part->getIndex()[0]; auto & min_column = assert_cast(*partition_minmax_count_columns[pos]); insert(min_column, primary_key_column[0]); } @@ -6835,7 +6835,7 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( { for (const auto & part : real_parts) { - const auto & primary_key_column = *part->index[0]; + const auto & primary_key_column = *part->getIndex()[0]; auto & max_column = assert_cast(*partition_minmax_count_columns[pos]); insert(max_column, primary_key_column[primary_key_column.size() - 1]); } diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 4ad440dae00..1de79ed17ca 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index a76d370d057..1ba28713680 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -1087,7 +1087,7 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange( MarkRanges res; size_t marks_count = part->index_granularity.getMarksCount(); - const auto & index = part->index; + const auto & index = part->getIndex(); if (marks_count == 0) return res; diff --git a/src/Storages/MergeTree/MergeTreeMarksLoader.cpp b/src/Storages/MergeTree/MergeTreeMarksLoader.cpp index 7531c03a011..8250050412f 100644 --- a/src/Storages/MergeTree/MergeTreeMarksLoader.cpp +++ b/src/Storages/MergeTree/MergeTreeMarksLoader.cpp @@ -1,12 +1,11 @@ #include #include -#include +#include #include #include #include #include #include -#include #include #include diff --git a/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp b/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp index 
47c2fe07bb4..8d8b0f1cc79 100644 --- a/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 96cab9c0293..b64632b6139 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -201,6 +201,7 @@ struct Settings; M(String, primary_key_compression_codec, "ZSTD(3)", "Compression encoding used by primary, primary key is small enough and cached, so the default compression is ZSTD(3).", 0) \ M(UInt64, marks_compress_block_size, 65536, "Mark compress block size, the actual size of the block to compress.", 0) \ M(UInt64, primary_key_compress_block_size, 65536, "Primary compress block size, the actual size of the block to compress.", 0) \ + M(Bool, primary_key_lazy_load, true, "Load primary key in memory on first use instead of on table initialization. This can save memory in the presence of a large number of tables.", 0) \ \ /** Projection settings. */ \ M(UInt64, max_projections, 25, "The maximum number of merge tree projections.", 0) \ diff --git a/src/Storages/MergeTree/MergeTreeSource.cpp b/src/Storages/MergeTree/MergeTreeSource.cpp index a450505f7a8..e1d1d0951e4 100644 --- a/src/Storages/MergeTree/MergeTreeSource.cpp +++ b/src/Storages/MergeTree/MergeTreeSource.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp index 1d10a1433ef..f2fe2e0f255 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp @@ -181,7 +181,7 @@ MergedBlockOutputStream::Finalizer MergedBlockOutputStream::finalizePartAsync( new_part->rows_count = rows_count; new_part->modification_time = time(nullptr); - new_part->index = writer->releaseIndexColumns(); + new_part->setIndex(writer->releaseIndexColumns()); new_part->checksums = checksums; new_part->setBytesOnDisk(checksums.getTotalSizeOnDisk()); new_part->setBytesUncompressedOnDisk(checksums.getTotalSizeUncompressedOnDisk()); diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 9959688d889..6bacce9e2c5 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -73,6 +73,7 @@ static void splitAndModifyMutationCommands( LoggerPtr log) { auto part_columns = part->getColumnsDescription(); + const auto & table_columns = metadata_snapshot->getColumns(); if (!isWidePart(part) || !isFullPartStorage(part->getDataPartStorage())) { @@ -81,9 +82,19 @@ static void splitAndModifyMutationCommands( for (const auto & command : commands) { + if (command.type == MutationCommand::Type::MATERIALIZE_COLUMN) + { + /// For ordinary column with default or materialized expression, MATERIALIZE COLUMN should not override past values + /// So we only mutate column if `command.column_name` is a default/materialized column or if the part does not have physical column file + auto column_ordinary = table_columns.getOrdinary().tryGetByName(command.column_name); + if (!column_ordinary || !part->tryGetColumn(command.column_name) || !part->hasColumnFiles(*column_ordinary)) + { + for_interpreter.push_back(command); + mutated_columns.emplace(command.column_name); + } + } if (command.type == MutationCommand::Type::MATERIALIZE_INDEX || 
command.type == MutationCommand::Type::MATERIALIZE_STATISTIC - || command.type == MutationCommand::Type::MATERIALIZE_COLUMN || command.type == MutationCommand::Type::MATERIALIZE_PROJECTION || command.type == MutationCommand::Type::MATERIALIZE_TTL || command.type == MutationCommand::Type::DELETE @@ -93,9 +104,6 @@ static void splitAndModifyMutationCommands( for_interpreter.push_back(command); for (const auto & [column_name, expr] : command.column_to_update_expression) mutated_columns.emplace(column_name); - - if (command.type == MutationCommand::Type::MATERIALIZE_COLUMN) - mutated_columns.emplace(command.column_name); } else if (command.type == MutationCommand::Type::DROP_INDEX || command.type == MutationCommand::Type::DROP_PROJECTION @@ -205,8 +213,15 @@ static void splitAndModifyMutationCommands( { for (const auto & command : commands) { - if (command.type == MutationCommand::Type::MATERIALIZE_INDEX - || command.type == MutationCommand::Type::MATERIALIZE_COLUMN + if (command.type == MutationCommand::Type::MATERIALIZE_COLUMN) + { + /// For ordinary column with default or materialized expression, MATERIALIZE COLUMN should not override past values + /// So we only mutate column if `command.column_name` is a default/materialized column or if the part does not have physical column file + auto column_ordinary = table_columns.getOrdinary().tryGetByName(command.column_name); + if (!column_ordinary || !part->tryGetColumn(command.column_name) || !part->hasColumnFiles(*column_ordinary)) + for_interpreter.push_back(command); + } + else if (command.type == MutationCommand::Type::MATERIALIZE_INDEX || command.type == MutationCommand::Type::MATERIALIZE_STATISTIC || command.type == MutationCommand::Type::MATERIALIZE_PROJECTION || command.type == MutationCommand::Type::MATERIALIZE_TTL @@ -899,7 +914,7 @@ void finalizeMutatedPart( new_data_part->rows_count = source_part->rows_count; new_data_part->index_granularity = source_part->index_granularity; - new_data_part->index = source_part->index; + new_data_part->setIndex(source_part->getIndex()); new_data_part->minmax_idx = source_part->minmax_idx; new_data_part->modification_time = time(nullptr); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index 8d921bdcb1c..e26a36202dd 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -860,6 +860,9 @@ ActiveDataPartSet getPartNamesToMutate( int32_t ReplicatedMergeTreeQueue::updateMutations(zkutil::ZooKeeperPtr zookeeper, Coordination::WatchCallbackPtr watch_callback) { + if (pull_log_blocker.isCancelled()) + throw Exception(ErrorCodes::ABORTED, "Log pulling is cancelled"); + std::lock_guard lock(update_mutations_mutex); Coordination::Stat mutations_stat; diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index 0a723e7adb4..e2f89067b34 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -315,7 +315,8 @@ static StoragePtr create(const StorageFactory::Arguments & args) DatabaseCatalog::instance().getDatabase(args.table_id.database_name)->getEngineName() == "Replicated"; /// Allow implicit {uuid} macros only for zookeeper_path in ON CLUSTER queries - bool allow_uuid_macro = is_on_cluster || is_replicated_database || args.query.attach; + /// and if UUID was explicitly passed in CREATE TABLE (like for ATTACH) + bool allow_uuid_macro = 
is_on_cluster || is_replicated_database || args.query.attach || args.query.has_uuid; auto expand_macro = [&] (ASTLiteral * ast_zk_path, ASTLiteral * ast_replica_name) { diff --git a/src/Storages/RabbitMQ/RabbitMQConsumer.cpp b/src/Storages/RabbitMQ/RabbitMQConsumer.cpp index 1843bebe3c7..28dc239ae37 100644 --- a/src/Storages/RabbitMQ/RabbitMQConsumer.cpp +++ b/src/Storages/RabbitMQ/RabbitMQConsumer.cpp @@ -128,6 +128,32 @@ bool RabbitMQConsumer::ackMessages(const CommitInfo & commit_info) return false; } +bool RabbitMQConsumer::nackMessages(const CommitInfo & commit_info) +{ + if (state != State::OK) + return false; + + /// Nothing to nack. + if (!commit_info.delivery_tag || commit_info.delivery_tag <= last_commited_delivery_tag) + return false; + + if (consumer_channel->reject(commit_info.delivery_tag, AMQP::multiple)) + { + LOG_TRACE( + log, "Consumer rejected messages with deliveryTags from {} to {} on channel {}", + last_commited_delivery_tag, commit_info.delivery_tag, channel_id); + + return true; + } + + LOG_ERROR( + log, + "Failed to reject messages for {}:{}, (current commit point {}:{})", + commit_info.channel_id, commit_info.delivery_tag, + channel_id, last_commited_delivery_tag); + + return false; +} void RabbitMQConsumer::updateChannel(RabbitMQConnection & connection) { @@ -161,7 +187,7 @@ void RabbitMQConsumer::updateChannel(RabbitMQConnection & connection) consumer_channel->onError([&](const char * message) { - LOG_ERROR(log, "Channel {} in an error state: {}", channel_id, message); + LOG_ERROR(log, "Channel {} is in an error state: {}", channel_id, message); state = State::ERROR; }); } diff --git a/src/Storages/RabbitMQ/RabbitMQConsumer.h b/src/Storages/RabbitMQ/RabbitMQConsumer.h index c78b33bfc7c..9dad175dda3 100644 --- a/src/Storages/RabbitMQ/RabbitMQConsumer.h +++ b/src/Storages/RabbitMQ/RabbitMQConsumer.h @@ -50,7 +50,9 @@ public: UInt64 delivery_tag = 0; String channel_id; }; + const MessageData & currentMessage() { return current; } + const String & getChannelID() const { return channel_id; } /// Return read buffer containing next available message /// or nullptr if there are no messages to process. @@ -63,6 +65,7 @@ public: bool isConsumerStopped() const { return stopped.load(); } bool ackMessages(const CommitInfo & commit_info); + bool nackMessages(const CommitInfo & commit_info); bool hasPendingMessages() { return !received.empty(); } diff --git a/src/Storages/RabbitMQ/RabbitMQSource.cpp b/src/Storages/RabbitMQ/RabbitMQSource.cpp index 3cec448fc11..72196e7dd3c 100644 --- a/src/Storages/RabbitMQ/RabbitMQSource.cpp +++ b/src/Storages/RabbitMQ/RabbitMQSource.cpp @@ -123,7 +123,11 @@ Chunk RabbitMQSource::generateImpl() } if (is_finished || !consumer || consumer->isConsumerStopped()) + { + LOG_TRACE(log, "RabbitMQSource is stopped (is_finished: {}, consumer_stopped: {})", + is_finished, consumer ? toString(consumer->isConsumerStopped()) : "No consumer"); return {}; + } /// Currently it is one time usage source: to make sure data is flushed /// strictly by timeout or by block size. 
@@ -254,13 +258,12 @@ Chunk RabbitMQSource::generateImpl() bool RabbitMQSource::sendAck() { - if (!consumer) - return false; + return consumer && consumer->ackMessages(commit_info); +} - if (!consumer->ackMessages(commit_info)) - return false; - - return true; +bool RabbitMQSource::sendNack() +{ + return consumer && consumer->nackMessages(commit_info); } } diff --git a/src/Storages/RabbitMQ/RabbitMQSource.h b/src/Storages/RabbitMQ/RabbitMQSource.h index 21d059bfae2..0d6fad97054 100644 --- a/src/Storages/RabbitMQ/RabbitMQSource.h +++ b/src/Storages/RabbitMQ/RabbitMQSource.h @@ -33,6 +33,7 @@ public: bool needChannelUpdate(); void updateChannel(); bool sendAck(); + bool sendNack(); private: StorageRabbitMQ & storage; diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp index 868f48d0b7d..ec2048cca70 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp @@ -1061,7 +1061,8 @@ bool StorageRabbitMQ::tryStreamToViews() for (size_t i = 0; i < num_created_consumers; ++i) { auto source = std::make_shared( - *this, storage_snapshot, rabbitmq_context, column_names, block_size, max_execution_time_ms, rabbitmq_settings->rabbitmq_handle_error_mode, false); + *this, storage_snapshot, rabbitmq_context, column_names, block_size, + max_execution_time_ms, rabbitmq_settings->rabbitmq_handle_error_mode, false); sources.emplace_back(source); pipes.emplace_back(source); @@ -1069,13 +1070,25 @@ bool StorageRabbitMQ::tryStreamToViews() block_io.pipeline.complete(Pipe::unitePipes(std::move(pipes))); + std::atomic_size_t rows = 0; + block_io.pipeline.setProgressCallback([&](const Progress & progress) { rows += progress.read_rows.load(); }); + if (!connection->getHandler().loopRunning()) startLoop(); + bool write_failed = false; + try { CompletedPipelineExecutor executor(block_io.pipeline); executor.execute(); } + catch (...) + { + LOG_ERROR(log, "Failed to push to views. Error: {}", getCurrentExceptionMessage(true)); + write_failed = true; + } + + LOG_TRACE(log, "Processed {} rows", rows); /* Note: sending ack() with loop running in another thread will lead to a lot of data races inside the library, but only in case * error occurs or connection is lost while ack is being sent @@ -1083,13 +1096,6 @@ bool StorageRabbitMQ::tryStreamToViews() deactivateTask(looping_task, false, true); size_t queue_empty = 0; - if (!hasDependencies(getStorageID())) - { - /// Do not commit to rabbitmq if the dependency was removed. - LOG_TRACE(log, "No dependencies, reschedule"); - return false; - } - if (!connection->isConnected()) { if (shutdown_called) @@ -1130,7 +1136,7 @@ bool StorageRabbitMQ::tryStreamToViews() * the same channel will also commit all previously not-committed messages. Anyway I do not think that for ack frame this * will ever happen. */ - if (!source->sendAck()) + if (write_failed ? source->sendNack() : source->sendAck()) { /// Iterate loop to activate error callbacks if they happened connection->getHandler().iterateLoop(); @@ -1142,6 +1148,19 @@ bool StorageRabbitMQ::tryStreamToViews() } } + if (write_failed) + { + LOG_TRACE(log, "Write failed, reschedule"); + return false; + } + + if (!hasDependencies(getStorageID())) + { + /// Do not commit to rabbitmq if the dependency was removed. 
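+ /// Returning false makes the caller reschedule this streaming attempt.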
+ LOG_TRACE(log, "No dependencies, reschedule"); + return false; + } + if ((queue_empty == num_created_consumers) && (++read_attempts == MAX_FAILED_READ_ATTEMPTS)) { connection->heartbeat(); diff --git a/src/Storages/S3Queue/S3QueueFilesMetadata.cpp b/src/Storages/S3Queue/S3QueueFilesMetadata.cpp index ac80ded5792..8583de27e00 100644 --- a/src/Storages/S3Queue/S3QueueFilesMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueFilesMetadata.cpp @@ -214,7 +214,7 @@ size_t S3QueueFilesMetadata::registerNewShard() } const auto zk_client = getZooKeeper(); - zk_client->createAncestors(zookeeper_shards_path / ""); + zk_client->createIfNotExists(zookeeper_shards_path, ""); std::string shard_node_path; size_t shard_id = 0; @@ -287,7 +287,10 @@ void S3QueueFilesMetadata::unregisterShard(size_t shard_id) const auto zk_client = getZooKeeper(); const auto node_path = getZooKeeperPathForShard(shard_id); - zk_client->remove(node_path); + auto error_code = zk_client->tryRemove(node_path); + if (error_code != Coordination::Error::ZOK + && error_code != Coordination::Error::ZNONODE) + throw zkutil::KeeperException::fromPath(error_code, node_path); } size_t S3QueueFilesMetadata::getProcessingIdsNum() const diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index b4f5f957f76..933238d8614 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -352,7 +352,11 @@ void StorageS3QueueSource::applyActionAfterProcessing(const String & path) } } -void StorageS3QueueSource::appendLogElement(const std::string & filename, S3QueueFilesMetadata::FileStatus & file_status_, size_t processed_rows, bool processed) +void StorageS3QueueSource::appendLogElement( + const std::string & filename, + S3QueueFilesMetadata::FileStatus & file_status_, + size_t processed_rows, + bool processed) { if (!s3_queue_log) return; @@ -363,6 +367,9 @@ void StorageS3QueueSource::appendLogElement(const std::string & filename, S3Queu elem = S3QueueLogElement { .event_time = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()), + .database = storage_id.database_name, + .table = storage_id.table_name, + .uuid = toString(storage_id.uuid), .file_name = filename, .rows_processed = processed_rows, .status = processed ? 
S3QueueLogElement::S3QueueStatus::Processed : S3QueueLogElement::S3QueueStatus::Failed, diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index 0723205b544..4f73f4ee205 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -138,11 +138,17 @@ StorageS3Queue::StorageS3Queue( StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - auto columns = StorageS3::getTableStructureFromDataImpl(configuration, format_settings, context_); + ColumnsDescription columns; + if (configuration.format == "auto") + std::tie(columns, configuration.format) = StorageS3::getTableStructureAndFormatFromData(configuration, format_settings, context_); + else + columns = StorageS3::getTableStructureFromData(configuration, format_settings, context_); storage_metadata.setColumns(columns); } else { + if (configuration.format == "auto") + configuration.format = StorageS3::getTableStructureAndFormatFromData(configuration, format_settings, context_).second; storage_metadata.setColumns(columns_); } diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index c09db0bfb7b..f5fcf01c59e 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -65,6 +65,7 @@ namespace ErrorCodes extern const int DATABASE_ACCESS_DENIED; extern const int CANNOT_COMPILE_REGEXP; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int CANNOT_DETECT_FORMAT; extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; @@ -127,7 +128,7 @@ void StorageAzureBlob::processNamedCollectionResult(StorageAzureBlob::Configurat } -StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine_args, ContextPtr local_context) +StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine_args, const ContextPtr & local_context) { StorageAzureBlob::Configuration configuration; @@ -143,7 +144,7 @@ StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine configuration.blobs_paths = {configuration.blob_path}; if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.blob_path).value_or("auto"); return configuration; } @@ -236,13 +237,13 @@ StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine configuration.blobs_paths = {configuration.blob_path}; if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.blob_path).value_or("auto"); return configuration; } -AzureObjectStorage::SettingsPtr StorageAzureBlob::createSettings(ContextPtr local_context) +AzureObjectStorage::SettingsPtr StorageAzureBlob::createSettings(const ContextPtr & local_context) { const auto & context_settings = local_context->getSettingsRef(); auto settings_ptr = std::make_unique(); @@ -447,7 +448,7 @@ Poco::URI StorageAzureBlob::Configuration::getConnectionURL() const StorageAzureBlob::StorageAzureBlob( const Configuration & configuration_, std::unique_ptr && object_storage_, - ContextPtr context, + const ContextPtr & context, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, @@ -463,17 +464,25 @@ StorageAzureBlob::StorageAzureBlob( , 
format_settings(format_settings_) , partition_by(partition_by_) { - FormatFactory::instance().checkFormatName(configuration.format); + if (configuration.format != "auto") + FormatFactory::instance().checkFormatName(configuration.format); context->getGlobalContext()->getRemoteHostFilter().checkURL(configuration.getConnectionURL()); StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - auto columns = getTableStructureFromData(object_storage.get(), configuration, format_settings, context, distributed_processing); + ColumnsDescription columns; + if (configuration.format == "auto") + std::tie(columns, configuration.format) = getTableStructureAndFormatFromData(object_storage.get(), configuration, format_settings, context); + else + columns = getTableStructureFromData(object_storage.get(), configuration, format_settings, context); storage_metadata.setColumns(columns); } else { + if (configuration.format == "auto") + configuration.format = getTableStructureAndFormatFromData(object_storage.get(), configuration, format_settings, context).second; + /// We don't allow special columns in File storage. if (!columns_.hasOnlyOrdinary()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine AzureBlobStorage doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); @@ -517,7 +526,7 @@ public: StorageAzureBlobSink( const String & format, const Block & sample_block_, - ContextPtr context, + const ContextPtr & context, std::optional format_settings_, const CompressionMethod compression_method, AzureObjectStorage * object_storage, @@ -607,22 +616,21 @@ private: std::mutex cancel_mutex; }; -class PartitionedStorageAzureBlobSink : public PartitionedSink +class PartitionedStorageAzureBlobSink : public PartitionedSink, WithContext { public: PartitionedStorageAzureBlobSink( const ASTPtr & partition_by, const String & format_, const Block & sample_block_, - ContextPtr context_, + const ContextPtr & context_, std::optional format_settings_, const CompressionMethod compression_method_, AzureObjectStorage * object_storage_, const String & blob_) - : PartitionedSink(partition_by, context_, sample_block_) + : PartitionedSink(partition_by, context_, sample_block_), WithContext(context_) , format(format_) , sample_block(sample_block_) - , context(context_) , compression_method(compression_method_) , object_storage(object_storage_) , blob(blob_) @@ -638,7 +646,7 @@ public: return std::make_shared( format, sample_block, - context, + getContext(), format_settings, compression_method, object_storage, @@ -649,7 +657,6 @@ public: private: const String format; const Block sample_block; - const ContextPtr context; const CompressionMethod compression_method; AzureObjectStorage * object_storage; const String blob; @@ -913,7 +920,7 @@ StorageAzureBlobSource::GlobIterator::GlobIterator( String blob_path_with_globs_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, - ContextPtr context_, + const ContextPtr & context_, RelativePathsWithMetadata * outer_blobs_, std::function file_progress_callback_) : IIterator(context_) @@ -1028,7 +1035,7 @@ StorageAzureBlobSource::KeysIterator::KeysIterator( const Strings & keys_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, - ContextPtr context_, + const ContextPtr & context_, RelativePathsWithMetadata * outer_blobs, std::function file_progress_callback) : IIterator(context_) @@ -1147,7 +1154,7 @@ StorageAzureBlobSource::StorageAzureBlobSource( const ReadFromFormatInfo & info, const String & format_, 
String name_, - ContextPtr context_, + const ContextPtr & context_, std::optional format_settings_, UInt64 max_block_size_, String compression_hint_, @@ -1290,6 +1297,7 @@ namespace ReadBufferIterator( const std::shared_ptr & file_iterator_, AzureObjectStorage * object_storage_, + std::optional format_, const StorageAzureBlob::Configuration & configuration_, const std::optional & format_settings_, const RelativePathsWithMetadata & read_keys_, @@ -1298,19 +1306,38 @@ namespace , file_iterator(file_iterator_) , object_storage(object_storage_) , configuration(configuration_) + , format(std::move(format_)) , format_settings(format_settings_) , read_keys(read_keys_) , prev_read_keys_size(read_keys_.size()) { } - std::pair, std::optional> next() override + Data next() override { /// For default mode check cached columns for currently read keys on first iteration. - if (first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + if (first) { - if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) - return {nullptr, cached_columns}; + /// If format is unknown we iterate through all currently read keys on first iteration and + /// try to determine format by file name. + if (!format) + { + for (const auto & key : read_keys) + { + if (auto format_from_path = FormatFactory::instance().tryGetFormatFromFileName(key.relative_path)) + { + format = format_from_path; + break; + } + } + } + + /// For default mode check cached columns for currently read keys on first iteration. + if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) + return {nullptr, cached_columns, format}; + } } current_path_with_metadata = file_iterator->next(); @@ -1318,29 +1345,55 @@ namespace if (current_path_with_metadata.relative_path.empty()) { if (first) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because there are no files with provided path " - "in AzureBlobStorage. You must specify table structure manually", configuration.format); + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because there are no files with provided path " + "in AzureBlobStorage. You can specify table structure manually", *format); - return {nullptr, std::nullopt}; + throw Exception( + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files, because there are no files with provided path " + "in AzureBlobStorage. You can specify table structure manually"); + } + + return {nullptr, std::nullopt, format}; } first = false; - /// AzureBlobStorage file iterator could get new keys after new iteration, check them in schema cache if schema inference mode is default. - if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT && read_keys.size() > prev_read_keys_size) + /// AzureBlobStorage file iterator could get new keys after new iteration. + if (read_keys.size() > prev_read_keys_size) { - auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); + /// If format is unknown we can try to determine it by new file names. 
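+ /// Only the keys added since the previous iteration are inspected here.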
+ if (!format) + { + for (auto it = read_keys.begin() + prev_read_keys_size; it != read_keys.end(); ++it) + { + if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName((*it).relative_path)) + { + format = format_from_file_name; + break; + } + } + } + /// Check new files in schema cache if schema inference mode is default. + if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); + if (columns_from_cache) + return {nullptr, columns_from_cache, format}; + } + prev_read_keys_size = read_keys.size(); - if (columns_from_cache) - return {nullptr, columns_from_cache}; } - else if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) + + if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) { RelativePathsWithMetadata paths = {current_path_with_metadata}; if (auto columns_from_cache = tryGetColumnsFromCache(paths.begin(), paths.end())) - return {nullptr, columns_from_cache}; + return {nullptr, columns_from_cache, format}; } first = false; @@ -1348,7 +1401,7 @@ namespace return {wrapReadBufferWithCompressionMethod( object_storage->readObject(StoredObject(current_path_with_metadata.relative_path), getContext()->getReadSettings(), {}, current_path_with_metadata.metadata.size_bytes), chooseCompressionMethod(current_path_with_metadata.relative_path, configuration.compression_method), - zstd_window_log_max), std::nullopt}; + zstd_window_log_max), std::nullopt, format}; } void setNumRowsToLastFile(size_t num_rows) override @@ -1357,7 +1410,7 @@ namespace return; String source = fs::path(configuration.connection_url) / configuration.container / current_path_with_metadata.relative_path; - auto key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); + auto key = getKeyForSchemaCache(source, *format, format_settings, getContext()); StorageAzureBlob::getSchemaCache(getContext()).addNumRows(key, num_rows); } @@ -1368,7 +1421,7 @@ namespace return; String source = fs::path(configuration.connection_url) / configuration.container / current_path_with_metadata.relative_path; - auto key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); + auto key = getKeyForSchemaCache(source, *format, format_settings, getContext()); StorageAzureBlob::getSchemaCache(getContext()).addColumns(key, columns); } @@ -1382,16 +1435,36 @@ namespace Strings sources; sources.reserve(read_keys.size()); std::transform(read_keys.begin(), read_keys.end(), std::back_inserter(sources), [&](const auto & elem){ return host_and_bucket + '/' + elem.relative_path; }); - auto cache_keys = getKeysForSchemaCache(sources, configuration.format, format_settings, getContext()); + auto cache_keys = getKeysForSchemaCache(sources, *format, format_settings, getContext()); StorageAzureBlob::getSchemaCache(getContext()).addManyColumns(cache_keys, columns); } + void setFormatName(const String & format_name) override + { + format = format_name; + } + String getLastFileName() const override { return current_path_with_metadata.relative_path; } + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override + { + int zstd_window_log_max = static_cast(getContext()->getSettingsRef().zstd_window_log_max); + return wrapReadBufferWithCompressionMethod( + 
object_storage->readObject(StoredObject(current_path_with_metadata.relative_path), getContext()->getReadSettings(), {}, current_path_with_metadata.metadata.size_bytes), + chooseCompressionMethod(current_path_with_metadata.relative_path, configuration.compression_method), + zstd_window_log_max); + } + private: std::optional tryGetColumnsFromCache(const RelativePathsWithMetadata::const_iterator & begin, const RelativePathsWithMetadata::const_iterator & end) { - auto & schema_cache = StorageAzureBlob::getSchemaCache(getContext()); + auto context = getContext(); + if (!context->getSettingsRef().schema_inference_use_cache_for_azure) + return std::nullopt; + + auto & schema_cache = StorageAzureBlob::getSchemaCache(context); for (auto it = begin; it < end; ++it) { auto get_last_mod_time = [&] -> std::optional @@ -1403,10 +1476,28 @@ namespace auto host_and_bucket = configuration.connection_url + '/' + configuration.container; String source = host_and_bucket + '/' + it->relative_path; - auto cache_key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; + if (format) + { + auto cache_key = getKeyForSchemaCache(source, *format, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. + /// If we have such entry for some format, we can use this format to read the file. + for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(source, format_name, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + /// Now format is known. It should be the same for all files. 
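+ /// Remember it so later schema cache lookups try only this format.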
+ format = format_name; + return columns; + } + } + } } return std::nullopt; @@ -1415,6 +1506,7 @@ namespace std::shared_ptr file_iterator; AzureObjectStorage * object_storage; const StorageAzureBlob::Configuration & configuration; + std::optional format; const std::optional & format_settings; const RelativePathsWithMetadata & read_keys; size_t prev_read_keys_size; @@ -1423,21 +1515,16 @@ namespace }; } -ColumnsDescription StorageAzureBlob::getTableStructureFromData( +std::pair StorageAzureBlob::getTableStructureAndFormatFromDataImpl( + std::optional format, AzureObjectStorage * object_storage, const Configuration & configuration, const std::optional & format_settings, - ContextPtr ctx, - bool distributed_processing) + const ContextPtr & ctx) { RelativePathsWithMetadata read_keys; std::shared_ptr file_iterator; - if (distributed_processing) - { - file_iterator = std::make_shared(ctx, - ctx->getReadTaskCallback()); - } - else if (configuration.withGlobs()) + if (configuration.withGlobs()) { file_iterator = std::make_shared( object_storage, configuration.container, configuration.blob_path, nullptr, NamesAndTypesList{}, ctx, &read_keys); @@ -1448,8 +1535,28 @@ ColumnsDescription StorageAzureBlob::getTableStructureFromData( object_storage, configuration.container, configuration.blobs_paths, nullptr, NamesAndTypesList{}, ctx, &read_keys); } - ReadBufferIterator read_buffer_iterator(file_iterator, object_storage, configuration, format_settings, read_keys, ctx); - return readSchemaFromFormat(configuration.format, format_settings, read_buffer_iterator, configuration.withGlobs(), ctx); + ReadBufferIterator read_buffer_iterator(file_iterator, object_storage, format, configuration, format_settings, read_keys, ctx); + if (format) + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, ctx), *format}; + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, ctx); +} + +std::pair StorageAzureBlob::getTableStructureAndFormatFromData( + DB::AzureObjectStorage * object_storage, + const DB::StorageAzureBlob::Configuration & configuration, + const std::optional & format_settings, + const DB::ContextPtr & ctx) +{ + return getTableStructureAndFormatFromDataImpl(std::nullopt, object_storage, configuration, format_settings, ctx); +} + +ColumnsDescription StorageAzureBlob::getTableStructureFromData( + DB::AzureObjectStorage * object_storage, + const DB::StorageAzureBlob::Configuration & configuration, + const std::optional & format_settings, + const DB::ContextPtr & ctx) +{ + return getTableStructureAndFormatFromDataImpl(configuration.format, object_storage, configuration, format_settings, ctx).first; } SchemaCache & StorageAzureBlob::getSchemaCache(const ContextPtr & ctx) diff --git a/src/Storages/StorageAzureBlob.h b/src/Storages/StorageAzureBlob.h index 6fc3c5ce592..2ab96c84e49 100644 --- a/src/Storages/StorageAzureBlob.h +++ b/src/Storages/StorageAzureBlob.h @@ -31,9 +31,9 @@ public: String getPath() const { return blob_path; } - bool update(ContextPtr context); + bool update(const ContextPtr & context); - void connect(ContextPtr context); + void connect(const ContextPtr & context); bool withGlobs() const { return blob_path.find_first_of("*?{") != std::string::npos; } @@ -59,7 +59,7 @@ public: StorageAzureBlob( const Configuration & configuration_, std::unique_ptr && object_storage_, - ContextPtr context_, + const ContextPtr & context_, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, @@ -68,10 +68,10 @@ 
public: bool distributed_processing_, ASTPtr partition_by_); - static StorageAzureBlob::Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context); + static StorageAzureBlob::Configuration getConfiguration(ASTs & engine_args, const ContextPtr & local_context); static AzureClientPtr createClient(StorageAzureBlob::Configuration configuration, bool is_read_only); - static AzureObjectStorage::SettingsPtr createSettings(ContextPtr local_context); + static AzureObjectStorage::SettingsPtr createSettings(const ContextPtr & local_context); static void processNamedCollectionResult(StorageAzureBlob::Configuration & configuration, const NamedCollection & collection); @@ -115,10 +115,22 @@ public: AzureObjectStorage * object_storage, const Configuration & configuration, const std::optional & format_settings, - ContextPtr ctx, - bool distributed_processing = false); + const ContextPtr & ctx); + + static std::pair getTableStructureAndFormatFromData( + AzureObjectStorage * object_storage, + const Configuration & configuration, + const std::optional & format_settings, + const ContextPtr & ctx); private: + static std::pair getTableStructureAndFormatFromDataImpl( + std::optional format, + AzureObjectStorage * object_storage, + const Configuration & configuration, + const std::optional & format_settings, + const ContextPtr & ctx); + friend class ReadFromAzureBlob; std::string name; @@ -137,7 +149,7 @@ public: class IIterator : public WithContext { public: - IIterator(ContextPtr context_):WithContext(context_) {} + IIterator(const ContextPtr & context_):WithContext(context_) {} virtual ~IIterator() = default; virtual RelativePathWithMetadata next() = 0; @@ -153,7 +165,7 @@ public: String blob_path_with_globs_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, - ContextPtr context_, + const ContextPtr & context_, RelativePathsWithMetadata * outer_blobs_, std::function file_progress_callback_ = {}); @@ -186,7 +198,7 @@ public: class ReadIterator : public IIterator { public: - explicit ReadIterator(ContextPtr context_, + explicit ReadIterator(const ContextPtr & context_, const ReadTaskCallback & callback_) : IIterator(context_), callback(callback_) { } RelativePathWithMetadata next() override @@ -207,7 +219,7 @@ public: const Strings & keys_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, - ContextPtr context_, + const ContextPtr & context_, RelativePathsWithMetadata * outer_blobs, std::function file_progress_callback = {}); @@ -229,7 +241,7 @@ public: const ReadFromFormatInfo & info, const String & format_, String name_, - ContextPtr context_, + const ContextPtr & context_, std::optional format_settings_, UInt64 max_block_size_, String compression_hint_, diff --git a/src/Storages/StorageAzureBlobCluster.cpp b/src/Storages/StorageAzureBlobCluster.cpp index 1d587512f38..32445556611 100644 --- a/src/Storages/StorageAzureBlobCluster.cpp +++ b/src/Storages/StorageAzureBlobCluster.cpp @@ -36,23 +36,30 @@ StorageAzureBlobCluster::StorageAzureBlobCluster( const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, - bool structure_argument_was_provided_) - : IStorageCluster(cluster_name_, table_id_, getLogger("StorageAzureBlobCluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) + const ContextPtr & context) + : IStorageCluster(cluster_name_, table_id_, getLogger("StorageAzureBlobCluster (" + table_id_.table_name + ")")) , 
configuration{configuration_} , object_storage(std::move(object_storage_)) { - context_->getGlobalContext()->getRemoteHostFilter().checkURL(configuration_.getConnectionURL()); + context->getGlobalContext()->getRemoteHostFilter().checkURL(configuration_.getConnectionURL()); StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { + ColumnsDescription columns; /// `format_settings` is set to std::nullopt, because StorageAzureBlobCluster is used only as table function - auto columns = StorageAzureBlob::getTableStructureFromData(object_storage.get(), configuration, /*format_settings=*/std::nullopt, context_, false); + if (configuration.format == "auto") + std::tie(columns, configuration.format) = StorageAzureBlob::getTableStructureAndFormatFromData(object_storage.get(), configuration, /*format_settings=*/std::nullopt, context); + else + columns = StorageAzureBlob::getTableStructureFromData(object_storage.get(), configuration, /*format_settings=*/std::nullopt, context); storage_metadata.setColumns(columns); } else + { + if (configuration.format == "auto") + configuration.format = StorageAzureBlob::getTableStructureAndFormatFromData(object_storage.get(), configuration, /*format_settings=*/std::nullopt, context).second; storage_metadata.setColumns(columns_); + } storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); @@ -60,13 +67,14 @@ StorageAzureBlobCluster::StorageAzureBlobCluster( virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); } -void StorageAzureBlobCluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) +void StorageAzureBlobCluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const DB::StorageSnapshotPtr & storage_snapshot, const DB::ContextPtr & context) { ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); if (!expression_list) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function s3Cluster, got '{}'", queryToString(query)); - TableFunctionAzureBlobStorageCluster::addColumnsStructureToArguments(expression_list->children, structure, context); + TableFunctionAzureBlobStorageCluster::updateStructureAndFormatArgumentsIfNeeded( + expression_list->children, storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), configuration.format, context); } RemoteQueryExecutor::Extension StorageAzureBlobCluster::getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context) const diff --git a/src/Storages/StorageAzureBlobCluster.h b/src/Storages/StorageAzureBlobCluster.h index 2831b94f825..476f21c6742 100644 --- a/src/Storages/StorageAzureBlobCluster.h +++ b/src/Storages/StorageAzureBlobCluster.h @@ -27,8 +27,7 @@ public: const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, - bool structure_argument_was_provided_); + const ContextPtr & context); std::string getName() const override { return "AzureBlobStorageCluster"; } @@ -43,7 +42,7 @@ public: private: void updateBeforeRead(const ContextPtr & /*context*/) override {} - void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) override; + void updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) override; StorageAzureBlob::Configuration configuration; NamesAndTypesList 
virtual_columns; diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index d5c135bb81d..2925038ec8e 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -1,40 +1,41 @@ -#include -#include #include +#include +#include #include #include -#include #include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include #include +#include +#include +#include #include #include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include namespace ProfileEvents @@ -56,6 +57,9 @@ namespace CurrentMetrics { extern const Metric StorageBufferRows; extern const Metric StorageBufferBytes; + extern const Metric StorageBufferFlushThreads; + extern const Metric StorageBufferFlushThreadsActive; + extern const Metric StorageBufferFlushThreadsScheduled; } @@ -153,6 +157,12 @@ StorageBuffer::StorageBuffer( storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); + if (num_shards > 1) + { + flush_pool = std::make_unique( + CurrentMetrics::StorageBufferFlushThreads, CurrentMetrics::StorageBufferFlushThreadsActive, CurrentMetrics::StorageBufferFlushThreadsScheduled, + num_shards, 0, num_shards); + } flush_handle = bg_pool.createTask(log->name() + "/Bg", [this]{ backgroundFlush(); }); } @@ -802,7 +812,22 @@ bool StorageBuffer::checkThresholdsImpl(bool direct, size_t rows, size_t bytes, void StorageBuffer::flushAllBuffers(bool check_thresholds) { for (auto & buf : buffers) - flushBuffer(buf, check_thresholds, false); + { + if (flush_pool) + { + scheduleFromThreadPool([&] () + { + flushBuffer(buf, check_thresholds, false); + }, *flush_pool, "BufferFlush"); + } + else + { + flushBuffer(buf, check_thresholds, false); + } + } + + if (flush_pool) + flush_pool->wait(); } diff --git a/src/Storages/StorageBuffer.h b/src/Storages/StorageBuffer.h index 47f6239b173..6c15c7e0238 100644 --- a/src/Storages/StorageBuffer.h +++ b/src/Storages/StorageBuffer.h @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -149,6 +150,7 @@ private: /// There are `num_shards` of independent buffers. const size_t num_shards; + std::unique_ptr flush_pool; std::vector buffers; const Thresholds min_thresholds; diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 9efa0819d3b..86ed1d03b94 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -976,8 +976,10 @@ std::optional StorageDistributed::distributedWriteBetweenDistribu new_query->select = select_with_union_query; } - const Cluster::AddressesWithFailover & src_addresses = src_distributed.getCluster()->getShardsAddresses(); - const Cluster::AddressesWithFailover & dst_addresses = getCluster()->getShardsAddresses(); + const auto src_cluster = src_distributed.getCluster(); + const auto dst_cluster = getCluster(); + const Cluster::AddressesWithFailover & src_addresses = src_cluster->getShardsAddresses(); + const Cluster::AddressesWithFailover & dst_addresses = dst_cluster->getShardsAddresses(); /// Compare addresses instead of cluster name, to handle remote()/cluster(). 
/// (since for remote()/cluster() the getClusterName() is empty string) if (src_addresses != dst_addresses) @@ -1006,8 +1008,7 @@ std::optional StorageDistributed::distributedWriteBetweenDistribu new_query->table_function.reset(); } - const auto & cluster = getCluster(); - const auto & shards_info = cluster->getShardsInfo(); + const auto & shards_info = dst_cluster->getShardsInfo(); String new_query_str; { @@ -1138,7 +1139,8 @@ std::optional StorageDistributed::distributedWriteFromClusterStor auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(current_settings); /// Here we take addresses from destination cluster and assume source table exists on these nodes - for (const auto & replicas : getCluster()->getShardsInfo()) + const auto cluster = getCluster(); + for (const auto & replicas : cluster->getShardsInfo()) { /// Skip unavailable hosts if necessary auto try_results = replicas.pool->getMany(timeouts, current_settings, PoolMode::GET_MANY, /*async_callback*/ {}, /*skip_unavailable_endpoints*/ true); diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 0d9e79d1d54..595573b566d 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -90,6 +90,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; extern const int CANNOT_APPEND_TO_FILE; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int CANNOT_DETECT_FORMAT; extern const int CANNOT_COMPILE_REGEXP; } @@ -328,7 +329,7 @@ std::unique_ptr createReadBuffer( } -Strings StorageFile::getPathsList(const String & table_path, const String & user_files_path, ContextPtr context, size_t & total_bytes_to_read) +Strings StorageFile::getPathsList(const String & table_path, const String & user_files_path, const ContextPtr & context, size_t & total_bytes_to_read) { fs::path user_files_absolute_path = fs::weakly_canonical(user_files_path); fs::path fs_table_path(table_path); @@ -375,27 +376,44 @@ namespace public: ReadBufferFromFileIterator( const std::vector & paths_, - const String & format_, + std::optional format_, const String & compression_method_, const std::optional & format_settings_, - ContextPtr context_) + const ContextPtr & context_) : WithContext(context_) , paths(paths_) - , format(format_) + , format(std::move(format_)) , compression_method(compression_method_) , format_settings(format_settings_) { } - std::pair, std::optional> next() override + Data next() override { bool is_first = current_index == 0; - /// For default mode check cached columns for all paths on first iteration. - /// If we have cached columns, next() won't be called again. - if (is_first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + if (is_first) { - if (auto cached_columns = tryGetColumnsFromCache(paths)) - return {nullptr, cached_columns}; + /// If format is unknown we iterate through all paths on first iteration and + /// try to determine format by file name. + if (!format) + { + for (const auto & path : paths) + { + if (auto format_from_path = FormatFactory::instance().tryGetFormatFromFileName(path)) + { + format = format_from_path; + break; + } + } + } + + /// For default mode check cached columns for all paths on first iteration. + /// If we have cached columns, next() won't be called again. 
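/// Aside (not part of the patch): a minimal, self-contained sketch of the file-name based
/// detection used above: walk the candidate paths and take the first extension that maps to a
/// known input format, otherwise fall back to content-based detection. The extension table is
/// illustrative only; the real lookup is FormatFactory::tryGetFormatFromFileName.
#include <map>
#include <optional>
#include <string>
#include <vector>

std::optional<std::string> guessFormatFromFileNames(const std::vector<std::string> & paths)
{
    static const std::map<std::string, std::string> by_extension
        = {{".csv", "CSV"}, {".tsv", "TSV"}, {".parquet", "Parquet"}, {".ndjson", "JSONEachRow"}};

    for (const auto & path : paths)
    {
        const auto dot = path.find_last_of('.');
        if (dot == std::string::npos)
            continue;
        if (auto it = by_extension.find(path.substr(dot)); it != by_extension.end())
            return it->second;        /// first recognizable extension wins
    }
    return std::nullopt;              /// unknown: detect the format from file contents later
}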
+ if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + if (auto cached_columns = tryGetColumnsFromCache(paths)) + return {nullptr, cached_columns, format}; + } } String path; @@ -406,11 +424,18 @@ namespace if (current_index == paths.size()) { if (is_first) + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because all files are empty. You can specify table structure manually", + *format); + throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because all files are empty. You must specify table structure manually", - format); - return {nullptr, std::nullopt}; + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files, because all files are empty. You can specify the format manually"); + } + return {nullptr, std::nullopt, std::nullopt}; } path = paths[current_index++]; @@ -421,10 +446,10 @@ namespace if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) { if (auto cached_columns = tryGetColumnsFromCache({path})) - return {nullptr, cached_columns}; + return {nullptr, cached_columns, format}; } - return {createReadBuffer(path, file_stat, false, -1, compression_method, getContext()), std::nullopt}; + return {createReadBuffer(path, file_stat, false, -1, compression_method, getContext()), std::nullopt, format}; } void setNumRowsToLastFile(size_t num_rows) override @@ -432,7 +457,7 @@ namespace if (!getContext()->getSettingsRef().use_cache_for_count_from_files) return; - auto key = getKeyForSchemaCache(paths[current_index - 1], format, format_settings, getContext()); + auto key = getKeyForSchemaCache(paths[current_index - 1], *format, format_settings, getContext()); StorageFile::getSchemaCache(getContext()).addNumRows(key, num_rows); } @@ -444,7 +469,7 @@ namespace /// For union mode, schema can be different for different files, so we need to /// cache last inferred schema only for last processed file. - auto cache_key = getKeyForSchemaCache(paths[current_index - 1], format, format_settings, getContext()); + auto cache_key = getKeyForSchemaCache(paths[current_index - 1], *format, format_settings, getContext()); StorageFile::getSchemaCache(getContext()).addColumns(cache_key, columns); } @@ -455,7 +480,7 @@ namespace return; /// For default mode we cache resulting schema for all paths.
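/// Aside (not part of the patch): a minimal, self-contained sketch contrasting the two caching
/// strategies mentioned in the comments above. In DEFAULT mode the single resulting schema is
/// stored under every path; in UNION mode a schema is stored only for the file it was inferred
/// from. Plain std:: containers stand in for the real schema cache; names are illustrative.
#include <map>
#include <string>
#include <vector>

void cacheInferredSchema(
    std::map<std::string, std::vector<std::string>> & cache,   /// path -> column definitions
    const std::vector<std::string> & all_paths,
    const std::string & last_processed_path,
    const std::vector<std::string> & schema,
    bool union_mode)
{
    if (union_mode)
        cache[last_processed_path] = schema;   /// per-file schemas, merged by the caller later
    else
        for (const auto & path : all_paths)
            cache[path] = schema;              /// one shared schema for the whole file set
}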
- auto cache_keys = getKeysForSchemaCache(paths, format, format_settings, getContext()); + auto cache_keys = getKeysForSchemaCache(paths, *format, format_settings, getContext()); StorageFile::getSchemaCache(getContext()).addManyColumns(cache_keys, columns); } @@ -466,14 +491,30 @@ namespace return ""; } + void setFormatName(const String & format_name) override + { + format = format_name; + } + + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override + { + chassert(current_index > 0 && current_index <= paths.size()); + auto path = paths[current_index - 1]; + auto file_stat = getFileStat(path, false, -1, "File"); + return createReadBuffer(path, file_stat, false, -1, compression_method, getContext()); + } + private: std::optional tryGetColumnsFromCache(const Strings & paths_) { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_file) + auto context = getContext(); + if (!context->getSettingsRef().schema_inference_use_cache_for_file) return std::nullopt; /// Check if the cache contains one of the paths. - auto & schema_cache = StorageFile::getSchemaCache(getContext()); + auto & schema_cache = StorageFile::getSchemaCache(context); struct stat file_stat{}; for (const auto & path : paths_) { @@ -485,10 +526,28 @@ namespace return file_stat.st_mtime; }; - auto cache_key = getKeyForSchemaCache(path, format, format_settings, getContext()); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; + if (format) + { + auto cache_key = getKeyForSchemaCache(path, *format, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. + /// If we have such entry for some format, we can use this format to read the file. + for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(path, format_name, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + /// Now format is known. It should be the same for all files. + format = format_name; + return columns; + } + } + } } return std::nullopt; @@ -497,7 +556,7 @@ namespace const std::vector & paths; size_t current_index = 0; - String format; + std::optional format; String compression_method; const std::optional & format_settings; }; @@ -507,17 +566,17 @@ namespace public: ReadBufferFromArchiveIterator( const StorageFile::ArchiveInfo & archive_info_, - const String & format_, + std::optional format_, const std::optional & format_settings_, - ContextPtr context_) + const ContextPtr & context_) : WithContext(context_) , archive_info(archive_info_) - , format(format_) + , format(std::move(format_)) , format_settings(format_settings_) { } - std::pair, std::optional> next() override + Data next() override { /// For default mode check cached columns for all initial archive paths (maybe with globs) on first iteration. /// If we have cached columns, next() won't be called again. 
@@ -525,8 +584,8 @@ namespace { for (const auto & archive : archive_info.paths_to_archives) { - if (auto cached_columns = tryGetColumnsFromSchemaCache(archive, archive_info.path_in_archive)) - return {nullptr, cached_columns}; + if (auto cached_schema = tryGetSchemaFromCache(archive, fmt::format("{}::{}", archive, archive_info.path_in_archive))) + return {nullptr, cached_schema, format}; } } @@ -536,12 +595,19 @@ namespace if (current_archive_index == archive_info.paths_to_archives.size()) { if (is_first) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because all files are empty. You must specify table structure manually", - format); + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because all files are empty. You can specify table structure manually", + *format); - return {nullptr, std::nullopt}; + throw Exception( + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files, because all files are empty. You can specify the format manually"); + } + + return {nullptr, std::nullopt, format}; } const auto & archive = archive_info.paths_to_archives[current_archive_index]; @@ -555,11 +621,18 @@ namespace continue; } + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because the archive {} is empty. " + "You can specify table structure manually", + *format, + archive); + throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because the archive {} is empty. " - "You must specify table structure manually", - format, + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files, because the archive {} is empty. " + "You can specify the format manually", archive); } @@ -575,8 +648,8 @@ namespace last_read_file_path = paths_for_schema_cache.emplace_back(fmt::format("{}::{}", archive_reader->getPath(), archive_info.path_in_archive)); is_first = false; - if (auto cached_columns = tryGetColumnsFromSchemaCache(archive, last_read_file_path)) - return {nullptr, cached_columns}; + if (auto cached_schema = tryGetSchemaFromCache(archive, last_read_file_path)) + return {nullptr, cached_schema, format}; } else { @@ -612,13 +685,20 @@ namespace last_read_file_path = paths_for_schema_cache.emplace_back(fmt::format("{}::{}", archive_reader->getPath(), *filename)); is_first = false; - if (auto cached_columns = tryGetColumnsFromSchemaCache(archive, last_read_file_path)) + /// If format is unknown we can try to determine it by the file name. + if (!format) + { + if (auto format_from_file = FormatFactory::instance().tryGetFormatFromFileName(*filename)) + format = format_from_file; + } + + if (auto cached_schema = tryGetSchemaFromCache(archive, last_read_file_path)) { /// For union mode next() will be called again even if we found cached columns, /// so we need to remember last_read_buffer to continue iterating through files in archive.
if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) last_read_buffer = archive_reader->readFile(std::move(file_enumerator)); - return {nullptr, cached_columns}; + return {nullptr, cached_schema, format}; } read_buf = archive_reader->readFile(std::move(file_enumerator)); @@ -627,7 +707,7 @@ namespace break; } - return {std::move(read_buf), std::nullopt}; + return {std::move(read_buf), std::nullopt, format}; } void setPreviousReadBuffer(std::unique_ptr buffer) override @@ -641,7 +721,7 @@ namespace if (!getContext()->getSettingsRef().use_cache_for_count_from_files) return; - auto key = getKeyForSchemaCache(last_read_file_path, format, format_settings, getContext()); + auto key = getKeyForSchemaCache(last_read_file_path, *format, format_settings, getContext()); StorageFile::getSchemaCache(getContext()).addNumRows(key, num_rows); } @@ -654,7 +734,7 @@ namespace /// For union mode, schema can be different for different files in archive, so we need to /// cache last inferred schema only for last processed file. auto & schema_cache = StorageFile::getSchemaCache(getContext()); - auto cache_key = getKeyForSchemaCache(last_read_file_path, format, format_settings, getContext()); + auto cache_key = getKeyForSchemaCache(last_read_file_path, *format, format_settings, getContext()); schema_cache.addColumns(cache_key, columns); } @@ -670,17 +750,42 @@ namespace for (const auto & archive : archive_info.paths_to_archives) paths_for_schema_cache.emplace_back(fmt::format("{}::{}", archive, archive_info.path_in_archive)); auto & schema_cache = StorageFile::getSchemaCache(getContext()); - auto cache_keys = getKeysForSchemaCache(paths_for_schema_cache, format, format_settings, getContext()); + auto cache_keys = getKeysForSchemaCache(paths_for_schema_cache, *format, format_settings, getContext()); schema_cache.addManyColumns(cache_keys, columns); } + void setFormatName(const String & format_name) override + { + format = format_name; + } + String getLastFileName() const override { return last_read_file_path; } + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override + { + if (archive_info.isSingleFileRead()) + { + chassert(current_archive_index > 0 && current_archive_index <= archive_info.paths_to_archives.size()); + const auto & archive = archive_info.paths_to_archives[current_archive_index - 1]; + auto archive_reader = createArchiveReader(archive); + return archive_reader->readFile(archive_info.path_in_archive, false); + } + + chassert(current_archive_index >= 0 && current_archive_index < archive_info.paths_to_archives.size()); + const auto & archive = archive_info.paths_to_archives[current_archive_index]; + auto archive_reader = createArchiveReader(archive); + chassert(last_read_buffer); + file_enumerator = archive_reader->currentFile(std::move(last_read_buffer)); + return archive_reader->readFile(std::move(file_enumerator)); + } + private: - std::optional tryGetColumnsFromSchemaCache(const std::string & archive_path, const std::string & full_path) + std::optional tryGetSchemaFromCache(const std::string & archive_path, const std::string & full_path) { auto context = getContext(); if (!context->getSettingsRef().schema_inference_use_cache_for_file) @@ -696,11 +801,28 @@ namespace return file_stat.st_mtime; }; - auto cache_key = getKeyForSchemaCache(full_path, format, format_settings, context); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - - if (columns) - return columns; + if 
(format) + { + auto cache_key = getKeyForSchemaCache(full_path, *format, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. + /// If we have such entry for some format, we can use this format to read the file. + for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(full_path, format_name, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + /// Now format is known. It should be the same for all files. + format = format_name; + return columns; + } + } + } return std::nullopt; } @@ -716,13 +838,13 @@ namespace std::unique_ptr file_enumerator; std::unique_ptr last_read_buffer; - String format; + std::optional format; const std::optional & format_settings; std::vector paths_for_schema_cache; }; } -ColumnsDescription StorageFile::getTableStructureFromFileDescriptor(ContextPtr context) +std::pair StorageFile::getTableStructureAndFormatFromFileDescriptor(std::optional format, const ContextPtr & context) { /// If we want to read schema from file descriptor we should create /// a read buffer from fd, create a checkpoint, read some data required @@ -739,22 +861,29 @@ ColumnsDescription StorageFile::getTableStructureFromFileDescriptor(ContextPtr c read_buf->setCheckpoint(); auto read_buffer_iterator = SingleReadBufferIterator(std::move(read_buf)); - auto columns = readSchemaFromFormat(format_name, format_settings, read_buffer_iterator, false, context, peekable_read_buffer_from_fd); + ColumnsDescription columns; + if (format) + columns = readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context); + else + std::tie(columns, format) = detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); + + peekable_read_buffer_from_fd = read_buffer_iterator.releaseBuffer(); if (peekable_read_buffer_from_fd) { /// If we have created read buffer in readSchemaFromFormat we should rollback to checkpoint. 
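/// Aside (not part of the patch): a minimal, self-contained sketch of the checkpoint/rollback
/// trick described above for non-seekable inputs such as a file descriptor: every byte consumed
/// while inferring the schema is remembered so the real reader can replay it afterwards.
/// std::istream stands in for the real PeekableReadBuffer; names are illustrative only.
#include <istream>
#include <string>

std::string readPrefixWithCheckpoint(std::istream & in, size_t max_bytes, std::string & replay_buffer)
{
    std::string prefix(max_bytes, '\0');
    in.read(prefix.data(), static_cast<std::streamsize>(max_bytes));
    prefix.resize(static_cast<size_t>(in.gcount()));
    replay_buffer += prefix;    /// "checkpoint": keep everything that was consumed
    return prefix;              /// the caller infers the schema from this prefix
}
/// "Rollback": the actual read first drains replay_buffer, then continues from the stream.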
assert_cast(peekable_read_buffer_from_fd.get())->rollbackToCheckpoint(); has_peekable_read_buffer_from_fd = true; } - return columns; + + return {columns, *format}; } -ColumnsDescription StorageFile::getTableStructureFromFile( - const String & format, +std::pair StorageFile::getTableStructureAndFormatFromFileImpl( + std::optional format, const std::vector & paths, const String & compression_method, const std::optional & format_settings, - ContextPtr context, + const ContextPtr & context, const std::optional & archive_info) { if (format == "Distributed") @@ -762,29 +891,60 @@ ColumnsDescription StorageFile::getTableStructureFromFile( if (paths.empty()) throw Exception(ErrorCodes::INCORRECT_FILE_NAME, "Cannot get table structure from file, because no files match specified name"); - return ColumnsDescription(DistributedAsyncInsertSource(paths[0]).getOutputs().front().getHeader().getNamesAndTypesList()); + return {ColumnsDescription(DistributedAsyncInsertSource(paths[0]).getOutputs().front().getHeader().getNamesAndTypesList()), *format}; } if (((archive_info && archive_info->paths_to_archives.empty()) || (!archive_info && paths.empty())) - && !FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format)) + && (!format || !FormatFactory::instance().checkIfFormatHasExternalSchemaReader(*format))) + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because there are no files with provided path. " + "You can specify table structure manually", *format); + throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because there are no files with provided path. " - "You must specify table structure manually", format); + "The data format cannot be detected by the contents of the files, because there are no files with provided path. 
" + "You can specify the format manually"); + + } if (archive_info) { ReadBufferFromArchiveIterator read_buffer_iterator(*archive_info, format, format_settings, context); - return readSchemaFromFormat( - format, - format_settings, - read_buffer_iterator, - /*retry=*/archive_info->paths_to_archives.size() > 1 || !archive_info->isSingleFileRead(), - context); + if (format) + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format}; + + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); } ReadBufferFromFileIterator read_buffer_iterator(paths, format, compression_method, format_settings, context); - return readSchemaFromFormat(format, format_settings, read_buffer_iterator, paths.size() > 1, context); + if (format) + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format}; + + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); +} + +ColumnsDescription StorageFile::getTableStructureFromFile( + const DB::String & format, + const std::vector & paths, + const DB::String & compression_method, + const std::optional & format_settings, + const ContextPtr & context, + const std::optional & archive_info) +{ + return getTableStructureAndFormatFromFileImpl(format, paths, compression_method, format_settings, context, archive_info).first; +} + +std::pair StorageFile::getTableStructureAndFormatFromFile( + const std::vector & paths, + const DB::String & compression_method, + const std::optional & format_settings, + const ContextPtr & context, + const std::optional & archive_info) +{ + return getTableStructureAndFormatFromFileImpl(std::nullopt, paths, compression_method, format_settings, context, archive_info); } bool StorageFile::supportsSubsetOfColumns(const ContextPtr & context) const @@ -875,7 +1035,7 @@ StorageFile::StorageFile(CommonArguments args) , compression_method(args.compression_method) , base_path(args.getContext()->getPath()) { - if (format_name != "Distributed") + if (format_name != "Distributed" && format_name != "auto") FormatFactory::instance().checkFormatName(format_name); } @@ -887,16 +1047,19 @@ void StorageFile::setStorageMetadata(CommonArguments args) { ColumnsDescription columns; if (use_table_fd) - columns = getTableStructureFromFileDescriptor(args.getContext()); + { + if (format_name == "auto") + std::tie(columns, format_name) = getTableStructureAndFormatFromFileDescriptor(std::nullopt, args.getContext()); + else + columns = getTableStructureAndFormatFromFileDescriptor(format_name, args.getContext()).first; + } else { - columns = getTableStructureFromFile( - format_name, - paths, - compression_method, - format_settings, - args.getContext(), - archive_info); + if (format_name == "auto") + std::tie(columns, format_name) = getTableStructureAndFormatFromFile(paths, compression_method, format_settings, args.getContext(), archive_info); + else + columns = getTableStructureFromFile(format_name, paths, compression_method, format_settings, args.getContext(), archive_info); + if (!args.columns.empty() && args.columns != columns) throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, "Table structure and file structure are different"); } @@ -904,6 +1067,8 @@ void StorageFile::setStorageMetadata(CommonArguments args) } else { + if (format_name == "auto") + format_name = getTableStructureAndFormatFromFile(paths, compression_method, format_settings, args.getContext(), archive_info).second; /// We don't allow special columns in File storage. 
if (!args.columns.hasOnlyOrdinary()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine File doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); @@ -918,7 +1083,7 @@ void StorageFile::setStorageMetadata(CommonArguments args) } -static std::chrono::seconds getLockTimeout(ContextPtr context) +static std::chrono::seconds getLockTimeout(const ContextPtr & context) { const Settings & settings = context->getSettingsRef(); Int64 lock_timeout = settings.lock_acquire_timeout.totalSeconds(); @@ -934,9 +1099,9 @@ StorageFileSource::FilesIterator::FilesIterator( std::optional archive_info_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, - ContextPtr context_, + const ContextPtr & context_, bool distributed_processing_) - : files(files_), archive_info(std::move(archive_info_)), distributed_processing(distributed_processing_), context(context_) + : WithContext(context_), files(files_), archive_info(std::move(archive_info_)), distributed_processing(distributed_processing_) { ActionsDAGPtr filter_dag; if (!distributed_processing && !archive_info && !files.empty()) @@ -949,7 +1114,7 @@ StorageFileSource::FilesIterator::FilesIterator( String StorageFileSource::FilesIterator::next() { if (distributed_processing) - return context->getReadTaskCallback()(); + return getContext()->getReadTaskCallback()(); else { const auto & fs = isReadFromArchive() ? archive_info->paths_to_archives : files; @@ -973,12 +1138,12 @@ const String & StorageFileSource::FilesIterator::getFileNameInArchive() StorageFileSource::StorageFileSource( const ReadFromFormatInfo & info, std::shared_ptr storage_, - ContextPtr context_, + const ContextPtr & context_, UInt64 max_block_size_, FilesIteratorPtr files_iterator_, std::unique_ptr read_buf_, bool need_only_count_) - : SourceWithKeyCondition(info.source_header, false) + : SourceWithKeyCondition(info.source_header, false), WithContext(context_) , storage(std::move(storage_)) , files_iterator(std::move(files_iterator_)) , read_buf(std::move(read_buf_)) @@ -986,13 +1151,12 @@ StorageFileSource::StorageFileSource( , requested_columns(info.requested_columns) , requested_virtual_columns(info.requested_virtual_columns) , block_for_format(info.format_header) - , context(context_) , max_block_size(max_block_size_) , need_only_count(need_only_count_) { if (!storage->use_table_fd) { - shared_lock = std::shared_lock(storage->rwlock, getLockTimeout(context)); + shared_lock = std::shared_lock(storage->rwlock, getLockTimeout(getContext())); if (!shared_lock) throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Lock timeout exceeded"); storage->readers_counter.fetch_add(1, std::memory_order_release); @@ -1009,7 +1173,7 @@ void StorageFileSource::beforeDestroy() if (std::uncaught_exceptions() == 0 && cnt == 1 && !storage->was_renamed) { shared_lock.unlock(); - auto exclusive_lock = std::unique_lock{storage->rwlock, getLockTimeout(context)}; + auto exclusive_lock = std::unique_lock{storage->rwlock, getLockTimeout(getContext())}; if (!exclusive_lock) return; @@ -1028,7 +1192,7 @@ void StorageFileSource::beforeDestroy() file_path = file_path.lexically_normal(); // Checking access rights - checkCreationIsAllowed(context, context->getUserFilesPath(), file_path, true); + checkCreationIsAllowed(getContext(), getContext()->getUserFilesPath(), file_path, true); // Checking an existing of new file if (fs::exists(file_path)) @@ -1061,7 +1225,7 @@ void StorageFileSource::setKeyCondition(const ActionsDAG::NodeRawConstPtrs & nod bool 
StorageFileSource::tryGetCountFromCache(const struct stat & file_stat) { - if (!context->getSettingsRef().use_cache_for_count_from_files) + if (!getContext()->getSettingsRef().use_cache_for_count_from_files) return false; auto num_rows_from_cache = tryGetNumRowsFromCache(current_path, file_stat.st_mtime); @@ -1103,7 +1267,7 @@ Chunk StorageFileSource::generate() return {}; auto file_stat = getFileStat(archive, storage->use_table_fd, storage->table_fd, storage->getName()); - if (context->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) + if (getContext()->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) continue; archive_reader = createArchiveReader(archive); @@ -1117,7 +1281,7 @@ Chunk StorageFileSource::generate() if (!read_buf) continue; - if (auto progress_callback = context->getFileProgressCallback()) + if (auto progress_callback = getContext()->getFileProgressCallback()) progress_callback(FileProgress(0, tryGetFileSizeFromReadBuffer(*read_buf).value_or(0))); } else @@ -1131,7 +1295,7 @@ Chunk StorageFileSource::generate() return {}; current_archive_stat = getFileStat(archive, storage->use_table_fd, storage->table_fd, storage->getName()); - if (context->getSettingsRef().engine_file_skip_empty_files && current_archive_stat.st_size == 0) + if (getContext()->getSettingsRef().engine_file_skip_empty_files && current_archive_stat.st_size == 0) continue; archive_reader = createArchiveReader(archive); @@ -1165,7 +1329,7 @@ Chunk StorageFileSource::generate() continue; read_buf = archive_reader->readFile(std::move(file_enumerator)); - if (auto progress_callback = context->getFileProgressCallback()) + if (auto progress_callback = getContext()->getFileProgressCallback()) progress_callback(FileProgress(0, tryGetFileSizeFromReadBuffer(*read_buf).value_or(0))); } } @@ -1191,16 +1355,16 @@ Chunk StorageFileSource::generate() file_stat = getFileStat(current_path, storage->use_table_fd, storage->table_fd, storage->getName()); current_file_size = file_stat.st_size; - if (context->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) + if (getContext()->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) continue; if (need_only_count && tryGetCountFromCache(file_stat)) continue; - read_buf = createReadBuffer(current_path, file_stat, storage->use_table_fd, storage->table_fd, storage->compression_method, context); + read_buf = createReadBuffer(current_path, file_stat, storage->use_table_fd, storage->table_fd, storage->compression_method, getContext()); } - const Settings & settings = context->getSettingsRef(); + const Settings & settings = getContext()->getSettingsRef(); size_t file_num = 0; if (storage->archive_info) @@ -1212,7 +1376,7 @@ Chunk StorageFileSource::generate() const auto max_parsing_threads = std::max(settings.max_threads / file_num, 1UL); input_format = FormatFactory::instance().getInput( - storage->format_name, *read_buf, block_for_format, context, max_block_size, storage->format_settings, + storage->format_name, *read_buf, block_for_format, getContext(), max_block_size, storage->format_settings, max_parsing_threads, std::nullopt, /*is_remote_fs*/ false, CompressionMethod::None, need_only_count); if (key_condition) @@ -1228,7 +1392,7 @@ Chunk StorageFileSource::generate() { builder.addSimpleTransform([&](const Block & header) { - return std::make_shared(header, columns_description, *input_format, context); + return std::make_shared(header, columns_description, *input_format, getContext()); }); } @@ 
-1265,7 +1429,7 @@ Chunk StorageFileSource::generate() if (storage->use_table_fd) finished_generate = true; - if (input_format && storage->format_name != "Distributed" && context->getSettingsRef().use_cache_for_count_from_files) + if (input_format && storage->format_name != "Distributed" && getContext()->getSettingsRef().use_cache_for_count_from_files) addNumRowsToCache(current_path, total_rows_in_file); total_rows_in_file = 0; @@ -1296,14 +1460,14 @@ Chunk StorageFileSource::generate() void StorageFileSource::addNumRowsToCache(const String & path, size_t num_rows) const { - auto key = getKeyForSchemaCache(path, storage->format_name, storage->format_settings, context); - StorageFile::getSchemaCache(context).addNumRows(key, num_rows); + auto key = getKeyForSchemaCache(path, storage->format_name, storage->format_settings, getContext()); + StorageFile::getSchemaCache(getContext()).addNumRows(key, num_rows); } std::optional StorageFileSource::tryGetNumRowsFromCache(const String & path, time_t last_mod_time) const { - auto & schema_cache = StorageFile::getSchemaCache(context); - auto key = getKeyForSchemaCache(path, storage->format_name, storage->format_settings, context); + auto & schema_cache = StorageFile::getSchemaCache(getContext()); + auto key = getKeyForSchemaCache(path, storage->format_name, storage->format_settings, getContext()); auto get_last_mod_time = [&]() -> std::optional { return last_mod_time; @@ -1312,7 +1476,7 @@ std::optional StorageFileSource::tryGetNumRowsFromCache(const String & p return schema_cache.tryGetNumRows(key, get_last_mod_time); } -class ReadFromFile : public SourceStepWithFilter +class ReadFromFile : public SourceStepWithFilter, WithContext { public: std::string getName() const override { return "ReadFromFile"; } @@ -1324,14 +1488,13 @@ public: std::shared_ptr storage_, ReadFromFormatInfo info_, const bool need_only_count_, - ContextPtr context_, + const ContextPtr & context_, size_t max_block_size_, size_t num_streams_) - : SourceStepWithFilter(DataStream{.header = std::move(sample_block)}) + : SourceStepWithFilter(DataStream{.header = std::move(sample_block)}), WithContext(context_) , storage(std::move(storage_)) , info(std::move(info_)) , need_only_count(need_only_count_) - , context(std::move(context_)) , max_block_size(max_block_size_) , max_num_streams(num_streams_) { @@ -1342,7 +1505,6 @@ private: ReadFromFormatInfo info; const bool need_only_count; - ContextPtr context; size_t max_block_size; const size_t max_num_streams; @@ -1423,7 +1585,7 @@ void ReadFromFile::createIterator(const ActionsDAG::Node * predicate) storage->archive_info, predicate, storage->virtual_columns, - context, + getContext(), storage->distributed_processing); } @@ -1445,8 +1607,10 @@ void ReadFromFile::initializePipeline(QueryPipelineBuilder & pipeline, const Bui Pipes pipes; pipes.reserve(num_streams); + auto ctx = getContext(); + /// Set total number of bytes to process. For progress bar. 
- auto progress_callback = context->getFileProgressCallback(); + auto progress_callback = ctx->getFileProgressCallback(); if (progress_callback && !storage->archive_info) progress_callback(FileProgress(0, storage->total_bytes_to_read)); @@ -1464,20 +1628,20 @@ void ReadFromFile::initializePipeline(QueryPipelineBuilder & pipeline, const Bui auto source = std::make_shared( info, storage, - context, + ctx, max_block_size, files_iterator, std::move(read_buffer), need_only_count); - source->setKeyCondition(filter_nodes.nodes, context); + source->setKeyCondition(filter_nodes.nodes, ctx); pipes.emplace_back(std::move(source)); } auto pipe = Pipe::unitePipes(std::move(pipes)); size_t output_ports = pipe.numOutputPorts(); - const bool parallelize_output = context->getSettingsRef().parallelize_output_from_storages; - if (parallelize_output && storage->parallelizeOutputAfterReading(context) && output_ports > 0 && output_ports < max_num_streams) + const bool parallelize_output = ctx->getSettingsRef().parallelize_output_from_storages; + if (parallelize_output && storage->parallelizeOutputAfterReading(ctx) && output_ports > 0 && output_ports < max_num_streams) pipe.resize(max_num_streams); if (pipe.empty()) @@ -1490,7 +1654,7 @@ void ReadFromFile::initializePipeline(QueryPipelineBuilder & pipeline, const Bui } -class StorageFileSink final : public SinkToStorage +class StorageFileSink final : public SinkToStorage, WithContext { public: StorageFileSink( @@ -1503,9 +1667,9 @@ public: const CompressionMethod compression_method_, const std::optional & format_settings_, const String format_name_, - ContextPtr context_, + const ContextPtr & context_, int flags_) - : SinkToStorage(metadata_snapshot_->getSampleBlock()) + : SinkToStorage(metadata_snapshot_->getSampleBlock()), WithContext(context_) , metadata_snapshot(metadata_snapshot_) , table_name_for_log(table_name_for_log_) , table_fd(table_fd_) @@ -1515,7 +1679,6 @@ public: , compression_method(compression_method_) , format_name(format_name_) , format_settings(format_settings_) - , context(context_) , flags(flags_) { initialize(); @@ -1532,9 +1695,9 @@ public: const CompressionMethod compression_method_, const std::optional & format_settings_, const String format_name_, - ContextPtr context_, + const ContextPtr & context_, int flags_) - : SinkToStorage(metadata_snapshot_->getSampleBlock()) + : SinkToStorage(metadata_snapshot_->getSampleBlock()), WithContext(context_) , metadata_snapshot(metadata_snapshot_) , table_name_for_log(table_name_for_log_) , table_fd(table_fd_) @@ -1544,7 +1707,6 @@ public: , compression_method(compression_method_) , format_name(format_name_) , format_settings(format_settings_) - , context(context_) , flags(flags_) , lock(std::move(lock_)) { @@ -1568,7 +1730,7 @@ public: /// In case of formats with prefixes if file is not empty we have already written prefix. 
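/// Aside (not part of the patch): a minimal, self-contained sketch of the append rule above:
/// a format prefix (for example a CSV header line) is written only when the target file is
/// still empty, so appending to an existing file does not duplicate it. Names are illustrative.
#include <fstream>
#include <string>

void appendRowWithOptionalHeader(const std::string & path, const std::string & header, const std::string & row)
{
    std::ifstream probe(path, std::ios::binary | std::ios::ate);
    const bool file_is_empty = !probe.is_open() || probe.tellg() <= 0;

    std::ofstream out(path, std::ios::binary | std::ios::app);
    if (file_is_empty)
        out << header << '\n';    /// write the prefix only once
    out << row << '\n';
}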
bool do_not_write_prefix = naked_buffer->size(); - const auto & settings = context->getSettingsRef(); + const auto & settings = getContext()->getSettingsRef(); write_buf = wrapWriteBufferWithCompressionMethod( std::move(naked_buffer), compression_method, @@ -1576,7 +1738,7 @@ public: static_cast(settings.output_format_compression_zstd_window_log)); writer = FormatFactory::instance().getOutputFormatParallelIfPossible(format_name, - *write_buf, metadata_snapshot->getSampleBlock(), context, format_settings); + *write_buf, metadata_snapshot->getSampleBlock(), getContext(), format_settings); if (do_not_write_prefix) writer->doNotWritePrefix(); @@ -1659,7 +1821,6 @@ private: std::string format_name; std::optional format_settings; - ContextPtr context; int flags; std::unique_lock lock; @@ -2044,7 +2205,7 @@ StorageFile::ArchiveInfo StorageFile::getArchiveInfo( const std::string & path_to_archive, const std::string & file_in_archive, const std::string & user_files_path, - ContextPtr context, + const ContextPtr & context, size_t & total_bytes_to_read ) { diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index 2955eb0f1aa..5c7a8da9f53 100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -84,7 +84,7 @@ public: static Names getVirtualColumnNames(); - static Strings getPathsList(const String & table_path, const String & user_files_path, ContextPtr context, size_t & total_bytes_to_read); + static Strings getPathsList(const String & table_path, const String & user_files_path, const ContextPtr & context, size_t & total_bytes_to_read); /// Check if the format supports reading only some subset of columns. /// Is is useful because such formats could effectively skip unknown columns @@ -112,14 +112,19 @@ public: } }; - ColumnsDescription getTableStructureFromFileDescriptor(ContextPtr context); - static ColumnsDescription getTableStructureFromFile( const String & format, const std::vector & paths, const String & compression_method, const std::optional & format_settings, - ContextPtr context, + const ContextPtr & context, + const std::optional & archive_info = std::nullopt); + + static std::pair getTableStructureAndFormatFromFile( + const std::vector & paths, + const String & compression_method, + const std::optional & format_settings, + const ContextPtr & context, const std::optional & archive_info = std::nullopt); static SchemaCache & getSchemaCache(const ContextPtr & context); @@ -130,7 +135,7 @@ public: const std::string & path_to_archive, const std::string & file_in_archive, const std::string & user_files_path, - ContextPtr context, + const ContextPtr & context, size_t & total_bytes_to_read); bool supportsTrivialCountOptimization() const override { return true; } @@ -141,6 +146,16 @@ protected: friend class ReadFromFile; private: + std::pair getTableStructureAndFormatFromFileDescriptor(std::optional format, const ContextPtr & context); + + static std::pair getTableStructureAndFormatFromFileImpl( + std::optional format, + const std::vector & paths, + const String & compression_method, + const std::optional & format_settings, + const ContextPtr & context, + const std::optional & archive_info = std::nullopt); + void setStorageMetadata(CommonArguments args); std::string format_name; @@ -187,10 +202,10 @@ private: bool distributed_processing = false; }; -class StorageFileSource : public SourceWithKeyCondition +class StorageFileSource : public SourceWithKeyCondition, WithContext { public: - class FilesIterator + class FilesIterator : WithContext { public: explicit 
FilesIterator( @@ -198,7 +213,7 @@ public: std::optional archive_info_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, - ContextPtr context_, + const ContextPtr & context_, bool distributed_processing_ = false); String next(); @@ -227,8 +242,6 @@ private: std::atomic index = 0; bool distributed_processing; - - ContextPtr context; }; using FilesIteratorPtr = std::shared_ptr; @@ -236,7 +249,7 @@ private: StorageFileSource( const ReadFromFormatInfo & info, std::shared_ptr storage_, - ContextPtr context_, + const ContextPtr & context_, UInt64 max_block_size_, FilesIteratorPtr files_iterator_, std::unique_ptr read_buf_, @@ -286,7 +299,6 @@ private: NamesAndTypesList requested_virtual_columns; Block block_for_format; - ContextPtr context; /// TODO Untangle potential issues with context lifetime. UInt64 max_block_size; bool finished_generate = false; diff --git a/src/Storages/StorageFileCluster.cpp b/src/Storages/StorageFileCluster.cpp index 0cc961bb464..0cc18abef5f 100644 --- a/src/Storages/StorageFileCluster.cpp +++ b/src/Storages/StorageFileCluster.cpp @@ -25,36 +25,39 @@ extern const int LOGICAL_ERROR; } StorageFileCluster::StorageFileCluster( - ContextPtr context_, + const ContextPtr & context, const String & cluster_name_, const String & filename_, const String & format_name_, - const String & compression_method_, + const String & compression_method, const StorageID & table_id_, const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - bool structure_argument_was_provided_) - : IStorageCluster(cluster_name_, table_id_, getLogger("StorageFileCluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) + const ConstraintsDescription & constraints_) + : IStorageCluster(cluster_name_, table_id_, getLogger("StorageFileCluster (" + table_id_.table_name + ")")) , filename(filename_) , format_name(format_name_) - , compression_method(compression_method_) { StorageInMemoryMetadata storage_metadata; size_t total_bytes_to_read; // its value isn't used as we are not reading files (just listing them). 
But it is required by getPathsList - paths = StorageFile::getPathsList(filename_, context_->getUserFilesPath(), context_, total_bytes_to_read); + paths = StorageFile::getPathsList(filename_, context->getUserFilesPath(), context, total_bytes_to_read); if (columns_.empty()) { - auto columns = StorageFile::getTableStructureFromFile(format_name, - paths, - compression_method, - std::nullopt, - context_); + ColumnsDescription columns; + if (format_name == "auto") + std::tie(columns, format_name) = StorageFile::getTableStructureAndFormatFromFile(paths, compression_method, std::nullopt, context); + else + columns = StorageFile::getTableStructureFromFile(format_name, paths, compression_method, std::nullopt, context); + storage_metadata.setColumns(columns); } else + { + if (format_name == "auto") + format_name = StorageFile::getTableStructureAndFormatFromFile(paths, compression_method, std::nullopt, context).second; storage_metadata.setColumns(columns_); + } storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); @@ -62,13 +65,14 @@ StorageFileCluster::StorageFileCluster( virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); } -void StorageFileCluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) +void StorageFileCluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const DB::ContextPtr & context) { ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); if (!expression_list) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function fileCluster, got '{}'", queryToString(query)); - TableFunctionFileCluster::addColumnsStructureToArguments(expression_list->children, structure, context); + TableFunctionFileCluster::updateStructureAndFormatArgumentsIfNeeded( + expression_list->children, storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), format_name, context); } RemoteQueryExecutor::Extension StorageFileCluster::getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context) const diff --git a/src/Storages/StorageFileCluster.h b/src/Storages/StorageFileCluster.h index a6e57c3bb4f..2803c8b6e5b 100644 --- a/src/Storages/StorageFileCluster.h +++ b/src/Storages/StorageFileCluster.h @@ -17,15 +17,14 @@ class StorageFileCluster : public IStorageCluster { public: StorageFileCluster( - ContextPtr context_, + const ContextPtr & context_, const String & cluster_name_, const String & filename_, const String & format_name_, const String & compression_method_, const StorageID & table_id_, const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - bool structure_argument_was_provided_); + const ConstraintsDescription & constraints_); std::string getName() const override { return "FileCluster"; } @@ -38,12 +37,11 @@ public: bool supportsTrivialCountOptimization() const override { return true; } private: - void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) override; + void updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) override; Strings paths; String filename; String format_name; - String compression_method; NamesAndTypesList virtual_columns; }; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 
6119541ff52..8e1598a1eef 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -513,8 +513,15 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( if (same_structure) { Coordination::Stat metadata_stat; - current_zookeeper->get(zookeeper_path + "/metadata", &metadata_stat); + current_zookeeper->get(fs::path(zookeeper_path) / "metadata", &metadata_stat); + + /** We change metadata_snapshot so that `createReplica` method will create `metadata_version` node in ZooKeeper + * with version of table '/metadata' node in Zookeeper. + * + * Otherwise `metadata_version` for not first replica will be initialized with 0 by default. + */ setInMemoryMetadata(metadata_snapshot->withMetadataVersion(metadata_stat.version)); + metadata_snapshot = getInMemoryMetadataPtr(); } } catch (Coordination::Exception & e) @@ -5817,6 +5824,7 @@ bool StorageReplicatedMergeTree::executeMetadataAlter(const StorageReplicatedMer Coordination::Requests requests; requests.emplace_back(zkutil::makeSetRequest(fs::path(replica_path) / "columns", entry.columns_str, -1)); requests.emplace_back(zkutil::makeSetRequest(fs::path(replica_path) / "metadata", entry.metadata_str, -1)); + requests.emplace_back(zkutil::makeSetRequest(fs::path(replica_path) / "metadata_version", std::to_string(entry.alter_version), -1)); auto table_id = getStorageID(); auto alter_context = getContext(); @@ -5863,10 +5871,6 @@ bool StorageReplicatedMergeTree::executeMetadataAlter(const StorageReplicatedMer resetObjectColumnsFromActiveParts(parts_lock); } - /// This transaction may not happen, but it's OK, because on the next retry we will eventually create/update this node - /// TODO Maybe do in in one transaction for Replicated database? - zookeeper->createOrUpdate(fs::path(replica_path) / "metadata_version", std::to_string(current_metadata->getMetadataVersion()), zkutil::CreateMode::Persistent); - return true; } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 4fde6fd04f3..2d8ef3df1c8 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -129,6 +129,7 @@ namespace ErrorCodes extern const int UNEXPECTED_EXPRESSION; extern const int DATABASE_ACCESS_DENIED; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int CANNOT_DETECT_FORMAT; extern const int NOT_IMPLEMENTED; extern const int CANNOT_COMPILE_REGEXP; extern const int FILE_DOESNT_EXIST; @@ -428,7 +429,7 @@ StorageS3Source::DisclosedGlobIterator::DisclosedGlobIterator( const S3::URI & globbed_uri_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, - ContextPtr context, + const ContextPtr & context, KeysWithInfo * read_keys_, const S3Settings::RequestSettings & request_settings_, std::function file_progress_callback_) @@ -563,7 +564,7 @@ StorageS3Source::StorageS3Source( const ReadFromFormatInfo & info, const String & format_, String name_, - ContextPtr context_, + const ContextPtr & context_, std::optional format_settings_, UInt64 max_block_size_, const S3Settings::RequestSettings & request_settings_, @@ -841,7 +842,7 @@ public: StorageS3Sink( const String & format, const Block & sample_block_, - ContextPtr context, + const ContextPtr & context, std::optional format_settings_, const CompressionMethod compression_method, const StorageS3::Configuration & configuration_, @@ -949,23 +950,22 @@ private: }; -class PartitionedStorageS3Sink : public PartitionedSink +class PartitionedStorageS3Sink : public PartitionedSink, WithContext { public: 
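Throughout this patch, sinks and iterators stop storing their own ContextPtr member and instead take the context as const ContextPtr &, mixing in WithContext where a handle must outlive the constructor, as the PartitionedStorageS3Sink declared here does. A minimal sketch of that pattern; PartitionedSink, WithContext, ContextPtr and getContext() come from the patch itself, while the class and method names are hypothetical:

    class MySink : public PartitionedSink, WithContext
    {
    public:
        MySink(const ASTPtr & partition_by, const Block & sample_block, const ContextPtr & context)
            : PartitionedSink(partition_by, context, sample_block)
            , WithContext(context)   /// the mixin keeps the context handle; no dedicated member is needed
        {
        }

        void doWork()
        {
            auto context = getContext();   /// replaces reads of a removed `ContextPtr context;` member
            /// ... use `context` ...
        }
    };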
PartitionedStorageS3Sink( const ASTPtr & partition_by, const String & format_, const Block & sample_block_, - ContextPtr context_, + const ContextPtr & context_, std::optional format_settings_, const CompressionMethod compression_method_, const StorageS3::Configuration & configuration_, const String & bucket_, const String & key_) - : PartitionedSink(partition_by, context_, sample_block_) + : PartitionedSink(partition_by, context_, sample_block_), WithContext(context_) , format(format_) , sample_block(sample_block_) - , context(context_) , compression_method(compression_method_) , configuration(configuration_) , bucket(bucket_) @@ -985,7 +985,7 @@ public: return std::make_shared( format, sample_block, - context, + getContext(), format_settings, compression_method, configuration, @@ -997,7 +997,6 @@ public: private: const String format; const Block sample_block; - const ContextPtr context; const CompressionMethod compression_method; const StorageS3::Configuration configuration; const String bucket; @@ -1033,7 +1032,7 @@ private: StorageS3::StorageS3( const Configuration & configuration_, - ContextPtr context_, + const ContextPtr & context_, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, @@ -1050,18 +1049,27 @@ StorageS3::StorageS3( { updateConfiguration(context_); // NOLINT(clang-analyzer-optin.cplusplus.VirtualCall) - FormatFactory::instance().checkFormatName(configuration.format); + if (configuration.format != "auto") + FormatFactory::instance().checkFormatName(configuration.format); context_->getGlobalContext()->getRemoteHostFilter().checkURL(configuration.url.uri); context_->getGlobalContext()->getHTTPHeaderFilter().checkHeaders(configuration.headers_from_ast); StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - auto columns = getTableStructureFromDataImpl(configuration, format_settings, context_); + ColumnsDescription columns; + if (configuration.format == "auto") + std::tie(columns, configuration.format) = getTableStructureAndFormatFromData(configuration, format_settings, context_); + else + columns = getTableStructureFromData(configuration, format_settings, context_); + storage_metadata.setColumns(columns); } else { + if (configuration.format == "auto") + configuration.format = getTableStructureAndFormatFromData(configuration, format_settings, context_).second; + /// We don't allow special columns in S3 storage. 
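The format == "auto" handling in the constructor above is one instance of a pattern this patch repeats for StorageFileCluster, StorageS3Cluster, StorageURL and StorageURLCluster: with no explicit columns, both the schema and the format are resolved from the data; with explicit columns, only an "auto" format still needs resolving. A condensed sketch, reusing the StorageS3 names from the hunk above:

    StorageInMemoryMetadata storage_metadata;
    if (columns_.empty())
    {
        ColumnsDescription columns;
        if (configuration.format == "auto")
            std::tie(columns, configuration.format) = getTableStructureAndFormatFromData(configuration, format_settings, context_);
        else
            columns = getTableStructureFromData(configuration, format_settings, context_);
        storage_metadata.setColumns(columns);
    }
    else
    {
        /// Columns are given, but an "auto" format must still be resolved exactly once here.
        if (configuration.format == "auto")
            configuration.format = getTableStructureAndFormatFromData(configuration, format_settings, context_).second;
        storage_metadata.setColumns(columns_);
    }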
if (!columns_.hasOnlyOrdinary()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine S3 doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); @@ -1350,14 +1358,14 @@ void StorageS3::truncate(const ASTPtr & /* query */, const StorageMetadataPtr &, LOG_WARNING(getLogger("StorageS3"), "Failed to delete {}, error: {}", error.GetKey(), error.GetMessage()); } -StorageS3::Configuration StorageS3::updateConfigurationAndGetCopy(ContextPtr local_context) +StorageS3::Configuration StorageS3::updateConfigurationAndGetCopy(const ContextPtr & local_context) { std::lock_guard lock(configuration_update_mutex); configuration.update(local_context); return configuration; } -void StorageS3::updateConfiguration(ContextPtr local_context) +void StorageS3::updateConfiguration(const ContextPtr & local_context) { std::lock_guard lock(configuration_update_mutex); configuration.update(local_context); @@ -1375,9 +1383,9 @@ const StorageS3::Configuration & StorageS3::getConfiguration() return configuration; } -bool StorageS3::Configuration::update(ContextPtr context) +bool StorageS3::Configuration::update(const ContextPtr & context) { - auto s3_settings = context->getStorageS3Settings().getSettings(url.uri.toString()); + auto s3_settings = context->getStorageS3Settings().getSettings(url.uri.toString(), context->getUserName()); request_settings = s3_settings.request_settings; request_settings.updateFromSettings(context->getSettings()); @@ -1390,7 +1398,7 @@ bool StorageS3::Configuration::update(ContextPtr context) return true; } -void StorageS3::Configuration::connect(ContextPtr context) +void StorageS3::Configuration::connect(const ContextPtr & context) { const Settings & global_settings = context->getGlobalContext()->getSettingsRef(); const Settings & local_settings = context->getSettingsRef(); @@ -1462,7 +1470,7 @@ void StorageS3::processNamedCollectionResult(StorageS3::Configuration & configur configuration.request_settings = S3Settings::RequestSettings(collection); } -StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file) +StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, const ContextPtr & local_context, bool get_format_from_file) { StorageS3::Configuration configuration; @@ -1601,7 +1609,7 @@ StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, Context configuration.keys = {configuration.url.key}; if (configuration.format == "auto" && get_format_from_file) - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.url.key, true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.url.key).value_or("auto"); return configuration; } @@ -1609,9 +1617,17 @@ StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, Context ColumnsDescription StorageS3::getTableStructureFromData( const StorageS3::Configuration & configuration, const std::optional & format_settings, - ContextPtr ctx) + const ContextPtr & ctx) { - return getTableStructureFromDataImpl(configuration, format_settings, ctx); + return getTableStructureAndFormatFromDataImpl(configuration.format, configuration, format_settings, ctx).first; +} + +std::pair StorageS3::getTableStructureAndFormatFromData( + const StorageS3::Configuration & configuration, + const std::optional & format_settings, + const ContextPtr & ctx) +{ + return getTableStructureAndFormatFromDataImpl(std::nullopt, configuration, format_settings, ctx); } namespace @@ -1623,24 
+1639,43 @@ namespace std::shared_ptr file_iterator_, const StorageS3Source::KeysWithInfo & read_keys_, const StorageS3::Configuration & configuration_, + std::optional format_, const std::optional & format_settings_, const ContextPtr & context_) : WithContext(context_) , file_iterator(file_iterator_) , read_keys(read_keys_) , configuration(configuration_) + , format(std::move(format_)) , format_settings(format_settings_) , prev_read_keys_size(read_keys_.size()) { } - std::pair, std::optional> next() override + Data next() override { - /// For default mode check cached columns for currently read keys on first iteration. - if (first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + if (first) { - if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) - return {nullptr, cached_columns}; + /// If format is unknown we iterate through all currently read keys on first iteration and + /// try to determine format by file name. + if (!format) + { + for (const auto & key_with_info : read_keys) + { + if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(key_with_info->key)) + { + format = format_from_file_name; + break; + } + } + } + + /// For default mode check cached columns for currently read keys on first iteration. + if (first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) + return {nullptr, cached_columns, format}; + } } while (true) @@ -1650,22 +1685,48 @@ namespace if (!current_key_with_info || current_key_with_info->key.empty()) { if (first) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because there are no files with provided path " - "in S3 or all files are empty. You must specify table structure manually", - configuration.format); + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because there are no files with provided path " + "in S3 or all files are empty. You can specify table structure manually", + *format); - return {nullptr, std::nullopt}; + throw Exception( + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files, because there are no files with provided path " + "in S3 or all files are empty. You can specify the format manually"); + } + + return {nullptr, std::nullopt, format}; } - /// S3 file iterator could get new keys after new iteration, check them in schema cache if schema inference mode is default. - if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT && read_keys.size() > prev_read_keys_size) + /// S3 file iterator could get new keys after new iteration + if (read_keys.size() > prev_read_keys_size) { - auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); + /// If format is unknown we can try to determine it by new file names. + if (!format) + { + for (auto it = read_keys.begin() + prev_read_keys_size; it != read_keys.end(); ++it) + { + if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName((*it)->key)) + { + format = format_from_file_name; + break; + } + } + } + + /// Check new files in schema cache if schema inference mode is default. 
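When the format is still unknown at this point, the tryGetColumnsFromCache helpers later in these hunks probe the schema cache once per registered input format and adopt whichever format already has a cached schema for the file. A minimal sketch of that probe; the helper name is hypothetical, `source`, `format_settings`, `get_last_mod_time` and `context` stand for the surrounding locals, and `format` is the iterator's std::optional<String> member:

    std::optional<ColumnsDescription> probeSchemaCacheForAnyFormat()
    {
        auto & schema_cache = StorageS3::getSchemaCache(context);
        for (const auto & format_name : FormatFactory::instance().getAllInputFormats())
        {
            auto cache_key = getKeyForSchemaCache(source, format_name, format_settings, context);
            if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time))
            {
                /// The cache hit also fixes the format; it is then reused for all remaining files.
                format = format_name;
                return columns;
            }
        }
        return std::nullopt;
    }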
+ if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); + if (columns_from_cache) + return {nullptr, columns_from_cache, format}; + } + prev_read_keys_size = read_keys.size(); - if (columns_from_cache) - return {nullptr, columns_from_cache}; } if (getContext()->getSettingsRef().s3_skip_empty_files && current_key_with_info->info && current_key_with_info->info->size == 0) @@ -1678,7 +1739,7 @@ namespace if (auto columns_from_cache = tryGetColumnsFromCache(keys.begin(), keys.end())) { first = false; - return {nullptr, columns_from_cache}; + return {nullptr, columns_from_cache, format}; } } @@ -1687,7 +1748,7 @@ namespace if (!getContext()->getSettingsRef().s3_skip_empty_files || !impl->eof()) { first = false; - return {wrapReadBufferWithCompressionMethod(std::move(impl), chooseCompressionMethod(current_key_with_info->key, configuration.compression_method), zstd_window_log_max), std::nullopt}; + return {wrapReadBufferWithCompressionMethod(std::move(impl), chooseCompressionMethod(current_key_with_info->key, configuration.compression_method), zstd_window_log_max), std::nullopt, format}; } } } @@ -1698,7 +1759,7 @@ namespace return; String source = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / configuration.url.bucket / current_key_with_info->key; - auto key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); + auto key = getKeyForSchemaCache(source, *format, format_settings, getContext()); StorageS3::getSchemaCache(getContext()).addNumRows(key, num_rows); } @@ -1709,7 +1770,7 @@ namespace return; String source = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / configuration.url.bucket / current_key_with_info->key; - auto cache_key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); + auto cache_key = getKeyForSchemaCache(source, *format, format_settings, getContext()); StorageS3::getSchemaCache(getContext()).addColumns(cache_key, columns); } @@ -1723,10 +1784,15 @@ namespace Strings sources; sources.reserve(read_keys.size()); std::transform(read_keys.begin(), read_keys.end(), std::back_inserter(sources), [&](const auto & elem){ return host_and_bucket / elem->key; }); - auto cache_keys = getKeysForSchemaCache(sources, configuration.format, format_settings, getContext()); + auto cache_keys = getKeysForSchemaCache(sources, *format, format_settings, getContext()); StorageS3::getSchemaCache(getContext()).addManyColumns(cache_keys, columns); } + void setFormatName(const String & format_name) override + { + format = format_name; + } + String getLastFileName() const override { if (current_key_with_info) @@ -1734,15 +1800,26 @@ namespace return ""; } + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override + { + chassert(current_key_with_info); + int zstd_window_log_max = static_cast(getContext()->getSettingsRef().zstd_window_log_max); + auto impl = std::make_unique(configuration.client, configuration.url.bucket, current_key_with_info->key, configuration.url.version_id, configuration.request_settings, getContext()->getReadSettings()); + return wrapReadBufferWithCompressionMethod(std::move(impl), chooseCompressionMethod(current_key_with_info->key, configuration.compression_method), zstd_window_log_max); + } + private: std::optional 
tryGetColumnsFromCache( const StorageS3::KeysWithInfo::const_iterator & begin, const StorageS3::KeysWithInfo::const_iterator & end) { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_s3) + auto context = getContext(); + if (!context->getSettingsRef().schema_inference_use_cache_for_s3) return std::nullopt; - auto & schema_cache = StorageS3::getSchemaCache(getContext()); + auto & schema_cache = StorageS3::getSchemaCache(context); for (auto it = begin; it < end; ++it) { auto get_last_mod_time = [&] @@ -1773,10 +1850,29 @@ namespace String path = fs::path(configuration.url.bucket) / (*it)->key; String source = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / path; - auto cache_key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; + + if (format) + { + auto cache_key = getKeyForSchemaCache(source, *format, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. + /// If we have such entry for some format, we can use this format to read the file. + for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(source, format_name, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + /// Now format is known. It should be the same for all files. + format = format_name; + return columns; + } + } + } } return std::nullopt; @@ -1785,6 +1881,7 @@ namespace std::shared_ptr file_iterator; const StorageS3Source::KeysWithInfo & read_keys; const StorageS3::Configuration & configuration; + std::optional format; const std::optional & format_settings; StorageS3Source::KeyWithInfoPtr current_key_with_info; size_t prev_read_keys_size; @@ -1793,17 +1890,20 @@ namespace } -ColumnsDescription StorageS3::getTableStructureFromDataImpl( +std::pair StorageS3::getTableStructureAndFormatFromDataImpl( + std::optional format, const Configuration & configuration, const std::optional & format_settings, - ContextPtr ctx) + const ContextPtr & ctx) { KeysWithInfo read_keys; auto file_iterator = createFileIterator(configuration, false, ctx, {}, {}, &read_keys); - ReadBufferIterator read_buffer_iterator(file_iterator, read_keys, configuration, format_settings, ctx); - return readSchemaFromFormat(configuration.format, format_settings, read_buffer_iterator, configuration.withGlobs(), ctx); + ReadBufferIterator read_buffer_iterator(file_iterator, read_keys, configuration, format, format_settings, ctx); + if (format) + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, ctx), *format}; + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, ctx); } void registerStorageS3Impl(const String & name, StorageFactory & factory) diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 81a03cc5ad5..587145cd1a7 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -80,7 +80,7 @@ public: const S3::URI & globbed_uri_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, - ContextPtr context, + 
const ContextPtr & context, KeysWithInfo * read_keys_ = nullptr, const S3Settings::RequestSettings & request_settings_ = {}, std::function progress_callback_ = {}); @@ -134,7 +134,7 @@ public: const ReadFromFormatInfo & info, const String & format, String name_, - ContextPtr context_, + const ContextPtr & context_, std::optional format_settings_, UInt64 max_block_size_, const S3Settings::RequestSettings & request_settings_, @@ -280,9 +280,9 @@ public: String getPath() const { return url.key; } - bool update(ContextPtr context); + bool update(const ContextPtr & context); - void connect(ContextPtr context); + void connect(const ContextPtr & context); bool withGlobs() const { return url.key.find_first_of("*?{") != std::string::npos; } @@ -308,7 +308,7 @@ public: StorageS3( const Configuration & configuration_, - ContextPtr context_, + const ContextPtr & context_, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, @@ -345,21 +345,26 @@ public: static SchemaCache & getSchemaCache(const ContextPtr & ctx); - static StorageS3::Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file = true); + static StorageS3::Configuration getConfiguration(ASTs & engine_args, const ContextPtr & local_context, bool get_format_from_file = true); static ColumnsDescription getTableStructureFromData( const StorageS3::Configuration & configuration, const std::optional & format_settings, - ContextPtr ctx); + const ContextPtr & ctx); + + static std::pair getTableStructureAndFormatFromData( + const StorageS3::Configuration & configuration, + const std::optional & format_settings, + const ContextPtr & ctx); using KeysWithInfo = StorageS3Source::KeysWithInfo; bool supportsTrivialCountOptimization() const override { return true; } protected: - virtual Configuration updateConfigurationAndGetCopy(ContextPtr local_context); + virtual Configuration updateConfigurationAndGetCopy(const ContextPtr & local_context); - virtual void updateConfiguration(ContextPtr local_context); + virtual void updateConfiguration(const ContextPtr & local_context); void useConfiguration(const Configuration & new_configuration); @@ -380,10 +385,11 @@ private: std::optional format_settings; ASTPtr partition_by; - static ColumnsDescription getTableStructureFromDataImpl( + static std::pair getTableStructureAndFormatFromDataImpl( + std::optional format, const Configuration & configuration, const std::optional & format_settings, - ContextPtr ctx); + const ContextPtr & ctx); bool supportsSubcolumns() const override { return true; } diff --git a/src/Storages/StorageS3Cluster.cpp b/src/Storages/StorageS3Cluster.cpp index 25c2b42b766..0ea224f6ee9 100644 --- a/src/Storages/StorageS3Cluster.cpp +++ b/src/Storages/StorageS3Cluster.cpp @@ -38,25 +38,34 @@ StorageS3Cluster::StorageS3Cluster( const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, - bool structure_argument_was_provided_) - : IStorageCluster(cluster_name_, table_id_, getLogger("StorageS3Cluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) + const ContextPtr & context) + : IStorageCluster(cluster_name_, table_id_, getLogger("StorageS3Cluster (" + table_id_.table_name + ")")) , s3_configuration{configuration_} { - context_->getGlobalContext()->getRemoteHostFilter().checkURL(configuration_.url.uri); - context_->getGlobalContext()->getHTTPHeaderFilter().checkHeaders(configuration_.headers_from_ast); + 
context->getGlobalContext()->getRemoteHostFilter().checkURL(configuration_.url.uri); + context->getGlobalContext()->getHTTPHeaderFilter().checkHeaders(configuration_.headers_from_ast); StorageInMemoryMetadata storage_metadata; - updateConfigurationIfChanged(context_); + updateConfigurationIfChanged(context); if (columns_.empty()) { + ColumnsDescription columns; /// `format_settings` is set to std::nullopt, because StorageS3Cluster is used only as table function - auto columns = StorageS3::getTableStructureFromDataImpl(s3_configuration, /*format_settings=*/std::nullopt, context_); + if (s3_configuration.format == "auto") + std::tie(columns, s3_configuration.format) = StorageS3::getTableStructureAndFormatFromData(s3_configuration, /*format_settings=*/std::nullopt, context); + else + columns = StorageS3::getTableStructureFromData(s3_configuration, /*format_settings=*/std::nullopt, context); + storage_metadata.setColumns(columns); } else + { + if (s3_configuration.format == "auto") + s3_configuration.format = StorageS3::getTableStructureAndFormatFromData(s3_configuration, /*format_settings=*/std::nullopt, context).second; + storage_metadata.setColumns(columns_); + } storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); @@ -64,13 +73,17 @@ StorageS3Cluster::StorageS3Cluster( virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); } -void StorageS3Cluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) +void StorageS3Cluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const DB::StorageSnapshotPtr & storage_snapshot, const DB::ContextPtr & context) { ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); if (!expression_list) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function s3Cluster, got '{}'", queryToString(query)); - TableFunctionS3Cluster::addColumnsStructureToArguments(expression_list->children, structure, context); + TableFunctionS3Cluster::updateStructureAndFormatArgumentsIfNeeded( + expression_list->children, + storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), + s3_configuration.format, + context); } void StorageS3Cluster::updateConfigurationIfChanged(ContextPtr local_context) diff --git a/src/Storages/StorageS3Cluster.h b/src/Storages/StorageS3Cluster.h index c526f14834a..ac25c506337 100644 --- a/src/Storages/StorageS3Cluster.h +++ b/src/Storages/StorageS3Cluster.h @@ -27,8 +27,7 @@ public: const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, - bool structure_argument_was_provided_); + const ContextPtr & context_); std::string getName() const override { return "S3Cluster"; } @@ -46,7 +45,7 @@ protected: private: void updateBeforeRead(const ContextPtr & context) override { updateConfigurationIfChanged(context); } - void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) override; + void updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) override; StorageS3::Configuration s3_configuration; NamesAndTypesList virtual_columns; diff --git a/src/Storages/StorageS3Settings.cpp b/src/Storages/StorageS3Settings.cpp index b0c1160429a..2a0d15a2bab 100644 --- a/src/Storages/StorageS3Settings.cpp +++ b/src/Storages/StorageS3Settings.cpp @@ -293,7 +293,7 
@@ void StorageS3Settings::loadFromConfig(const String & config_elem, const Poco::U } } -S3Settings StorageS3Settings::getSettings(const String & endpoint) const +S3Settings StorageS3Settings::getSettings(const String & endpoint, const String & user) const { std::lock_guard lock(mutex); auto next_prefix_setting = s3_settings.upper_bound(endpoint); @@ -302,7 +302,8 @@ S3Settings StorageS3Settings::getSettings(const String & endpoint) const for (auto possible_prefix_setting = next_prefix_setting; possible_prefix_setting != s3_settings.begin();) { std::advance(possible_prefix_setting, -1); - if (boost::algorithm::starts_with(endpoint, possible_prefix_setting->first)) + const auto & [endpoint_prefix, settings] = *possible_prefix_setting; + if (boost::algorithm::starts_with(endpoint, endpoint_prefix) && settings.auth_settings.canBeUsedByUser(user)) return possible_prefix_setting->second; } diff --git a/src/Storages/StorageS3Settings.h b/src/Storages/StorageS3Settings.h index 0e152bb2d31..21b6264717e 100644 --- a/src/Storages/StorageS3Settings.h +++ b/src/Storages/StorageS3Settings.h @@ -112,7 +112,7 @@ class StorageS3Settings public: void loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config, const Settings & settings); - S3Settings getSettings(const String & endpoint) const; + S3Settings getSettings(const String & endpoint, const String & user) const; private: mutable std::mutex mutex; diff --git a/src/Storages/StorageSQLite.cpp b/src/Storages/StorageSQLite.cpp index 85c5e16a1bf..30cca409dc8 100644 --- a/src/Storages/StorageSQLite.cpp +++ b/src/Storages/StorageSQLite.cpp @@ -19,6 +19,20 @@ #include #include +namespace +{ + +using namespace DB; + +ContextPtr makeSQLiteWriteContext(ContextPtr context) +{ + auto write_context = Context::createCopy(context); + write_context->setSetting("output_format_values_escape_quote_with_quote", Field(true)); + return write_context; +} + +} + namespace DB { @@ -43,6 +57,7 @@ StorageSQLite::StorageSQLite( , database_path(database_path_) , sqlite_db(sqlite_db_) , log(getLogger("StorageSQLite (" + table_id_.table_name + ")")) + , write_context(makeSQLiteWriteContext(getContext())) { StorageInMemoryMetadata storage_metadata; @@ -144,7 +159,7 @@ public: sqlbuf << ") VALUES "; - auto writer = FormatFactory::instance().getOutputFormat("Values", sqlbuf, metadata_snapshot->getSampleBlock(), storage.getContext()); + auto writer = FormatFactory::instance().getOutputFormat("Values", sqlbuf, metadata_snapshot->getSampleBlock(), storage.write_context); writer->write(block); sqlbuf << ";"; diff --git a/src/Storages/StorageSQLite.h b/src/Storages/StorageSQLite.h index baacdfb4899..ed673123fe0 100644 --- a/src/Storages/StorageSQLite.h +++ b/src/Storages/StorageSQLite.h @@ -47,10 +47,13 @@ public: const String & table); private: + friend class SQLiteSink; /// for write_context + String remote_table_name; String database_path; SQLitePtr sqlite_db; LoggerPtr log; + ContextPtr write_context; }; } diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index ce9b0cd366b..608e44c3cd0 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include #include @@ -101,7 +101,7 @@ static ConnectionTimeouts getHTTPTimeouts(ContextPtr context) IStorageURLBase::IStorageURLBase( const String & uri_, - ContextPtr context_, + const ContextPtr & context_, const StorageID & table_id_, const String & format_name_, const std::optional & format_settings_, @@ -123,16 
+123,26 @@ IStorageURLBase::IStorageURLBase( , partition_by(partition_by_) , distributed_processing(distributed_processing_) { - FormatFactory::instance().checkFormatName(format_name); + if (format_name != "auto") + FormatFactory::instance().checkFormatName(format_name); + StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - auto columns = getTableStructureFromData(format_name, uri, compression_method, headers, format_settings, context_); + ColumnsDescription columns; + if (format_name == "auto") + std::tie(columns, format_name) = getTableStructureAndFormatFromData(uri, compression_method, headers, format_settings, context_); + else + columns = getTableStructureFromData(format_name, uri, compression_method, headers, format_settings, context_); + storage_metadata.setColumns(columns); } else { + if (format_name == "auto") + format_name = getTableStructureAndFormatFromData(uri, compression_method, headers, format_settings, context_).second; + /// We don't allow special columns in URL storage. if (!columns_.hasOnlyOrdinary()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine URL doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); @@ -257,7 +267,7 @@ StorageURLSource::StorageURLSource( const String & format_, const std::optional & format_settings_, String name_, - ContextPtr context_, + const ContextPtr & context_, UInt64 max_block_size, const ConnectionTimeouts & timeouts, CompressionMethod compression_method, @@ -525,7 +535,7 @@ StorageURLSink::StorageURLSink( const String & format, const std::optional & format_settings, const Block & sample_block, - ContextPtr context, + const ContextPtr & context, const ConnectionTimeouts & timeouts, const CompressionMethod compression_method, const HTTPHeaderEntries & headers, @@ -668,7 +678,7 @@ std::vector> IStorageURLBase::getReadURIPara const Names & /*column_names*/, const StorageSnapshotPtr & /*storage_snapshot*/, const SelectQueryInfo & /*query_info*/, - ContextPtr /*context*/, + const ContextPtr & /*context*/, QueryProcessingStage::Enum & /*processed_stage*/, size_t /*max_block_size*/) const { @@ -679,7 +689,7 @@ std::function IStorageURLBase::getReadPOSTDataCallback( const Names & /*column_names*/, const ColumnsDescription & /* columns_description */, const SelectQueryInfo & /*query_info*/, - ContextPtr /*context*/, + const ContextPtr & /*context*/, QueryProcessingStage::Enum & /*processed_stage*/, size_t /*max_block_size*/) const { @@ -693,28 +703,48 @@ namespace public: ReadBufferIterator( const std::vector & urls_to_check_, - const String & format_, + std::optional format_, const CompressionMethod & compression_method_, const HTTPHeaderEntries & headers_, const std::optional & format_settings_, const ContextPtr & context_) - : WithContext(context_), format(format_), compression_method(compression_method_), headers(headers_), format_settings(format_settings_) + : WithContext(context_), format(std::move(format_)), compression_method(compression_method_), headers(headers_), format_settings(format_settings_) { url_options_to_check.reserve(urls_to_check_.size()); for (const auto & url : urls_to_check_) url_options_to_check.push_back(getFailoverOptions(url, getContext()->getSettingsRef().glob_expansion_max_elements)); } - std::pair, std::optional> next() override + Data next() override { bool is_first = (current_index == 0); - /// For default mode check cached columns for all urls on first iteration. 
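Here, as in the getConfiguration() changes for S3 and URL elsewhere in the patch, file-name based detection goes through FormatFactory::instance().tryGetFormatFromFileName(), which returns std::optional<String>; an unrecognised extension therefore leaves the format as "auto" and defers to the content-based detection added by this patch instead of failing early. A small usage sketch (the path is a made-up example):

    String format = "auto";
    if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName("logs/events.csv"))
        format = *format_from_file_name;

    /// Or, equivalently, as the configuration parsers in this patch do:
    format = FormatFactory::instance().tryGetFormatFromFileName("logs/events.csv").value_or("auto");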
- if (is_first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + if (is_first) { - for (const auto & options : url_options_to_check) + /// If format is unknown we iterate through all url options on first iteration and + /// try to determine format by file name. + if (!format) { - if (auto cached_columns = tryGetColumnsFromCache(options)) - return {nullptr, cached_columns}; + for (const auto & options : url_options_to_check) + { + for (const auto & url : options) + { + if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(url)) + { + format = format_from_file_name; + break; + } + } + } + } + + /// For default mode check cached columns for all urls on first iteration. + if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + for (const auto & options : url_options_to_check) + { + if (auto cached_columns = tryGetColumnsFromCache(options)) + return {nullptr, cached_columns, format}; + } } } @@ -724,20 +754,30 @@ namespace if (current_index == url_options_to_check.size()) { if (is_first) + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because all files are empty. " + "You can specify table structure manually", + *format); + throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because all files are empty. " - "You must specify table structure manually", - format); - return {nullptr, std::nullopt}; + "The data format cannot be detected by the contents of the files, because there are no files with provided path " + "You can specify the format manually"); + + } + + return {nullptr, std::nullopt, format}; } if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) { - if (auto cached_columns = tryGetColumnsFromCache(url_options_to_check[current_index])) + if (auto cached_schema = tryGetColumnsFromCache(url_options_to_check[current_index])) { ++current_index; - return {nullptr, cached_columns}; + return {nullptr, cached_schema, format}; } } @@ -762,7 +802,7 @@ namespace return {wrapReadBufferWithCompressionMethod( std::move(uri_and_buf.second), compression_method, - static_cast(getContext()->getSettingsRef().zstd_window_log_max)), std::nullopt}; + static_cast(getContext()->getSettingsRef().zstd_window_log_max)), std::nullopt, format}; } void setNumRowsToLastFile(size_t num_rows) override @@ -770,7 +810,7 @@ namespace if (!getContext()->getSettingsRef().schema_inference_use_cache_for_url) return; - auto key = getKeyForSchemaCache(current_url_option, format, format_settings, getContext()); + auto key = getKeyForSchemaCache(current_url_option, *format, format_settings, getContext()); StorageURL::getSchemaCache(getContext()).addNumRows(key, num_rows); } @@ -780,7 +820,7 @@ namespace || getContext()->getSettingsRef().schema_inference_mode != SchemaInferenceMode::UNION) return; - auto key = getKeyForSchemaCache(current_url_option, format, format_settings, getContext()); + auto key = getKeyForSchemaCache(current_url_option, *format, format_settings, getContext()); StorageURL::getSchemaCache(getContext()).addColumns(key, columns); } @@ -792,17 +832,45 @@ namespace for (const auto & options : url_options_to_check) { - auto keys = getKeysForSchemaCache(options, format, format_settings, getContext()); + auto keys = getKeysForSchemaCache(options, *format, format_settings, getContext()); 
StorageURL::getSchemaCache(getContext()).addManyColumns(keys, columns); } } + void setFormatName(const String & format_name) override + { + format = format_name; + } + String getLastFileName() const override { return current_url_option; } + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override + { + chassert(current_index > 0 && current_index <= url_options_to_check.size()); + auto first_option = url_options_to_check[current_index - 1].cbegin(); + auto uri_and_buf = StorageURLSource::getFirstAvailableURIAndReadBuffer( + first_option, + url_options_to_check[current_index - 1].cend(), + getContext(), + {}, + Poco::Net::HTTPRequest::HTTP_GET, + {}, + getHTTPTimeouts(getContext()), + credentials, + headers, + false, + false); + + return wrapReadBufferWithCompressionMethod(std::move(uri_and_buf.second), compression_method, static_cast(getContext()->getSettingsRef().zstd_window_log_max)); + } + private: std::optional tryGetColumnsFromCache(const Strings & urls) { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_url) + auto context = getContext(); + if (!context->getSettingsRef().schema_inference_use_cache_for_url) return std::nullopt; auto & schema_cache = StorageURL::getSchemaCache(getContext()); @@ -810,7 +878,7 @@ namespace { auto get_last_mod_time = [&]() -> std::optional { - auto last_mod_time = StorageURL::tryGetLastModificationTime(url, headers, credentials, getContext()); + auto last_mod_time = StorageURL::tryGetLastModificationTime(url, headers, credentials, context); /// Some URLs could not have Last-Modified header, in this case we cannot be sure that /// data wasn't changed after adding it's schema to cache. Use schema from cache only if /// special setting for this case is enabled. @@ -819,10 +887,27 @@ namespace return last_mod_time; }; - auto cache_key = getKeyForSchemaCache(url, format, format_settings, getContext()); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; + if (format) + { + auto cache_key = getKeyForSchemaCache(url, *format, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. + /// If we have such entry for some format, we can use this format to read the file. 
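A few lines above, the URL iterator (like the S3 one earlier) gains supportsLastReadBufferRecreation() and recreateLastReadBuffer(), presumably so that format detection can re-read the last file from the beginning when it has to try more than one candidate format. A sketch of what such an override could look like for a plain local file; ReadBufferFromFile and the member names current_path, compression_method and zstd_window_log_max are assumptions, not taken from this patch:

    bool supportsLastReadBufferRecreation() const override { return true; }

    std::unique_ptr<ReadBuffer> recreateLastReadBuffer() override
    {
        chassert(!current_path.empty());   /// a file must have been read before it can be re-read
        auto impl = std::make_unique<ReadBufferFromFile>(current_path);
        return wrapReadBufferWithCompressionMethod(
            std::move(impl), chooseCompressionMethod(current_path, compression_method), zstd_window_log_max);
    }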
+ for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(url, format_name, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + format = format_name; + return columns; + } + } + } } return std::nullopt; @@ -831,7 +916,7 @@ namespace std::vector> url_options_to_check; size_t current_index = 0; String current_url_option; - const String & format; + std::optional format; const CompressionMethod & compression_method; const HTTPHeaderEntries & headers; Poco::Net::HTTPBasicCredentials credentials; @@ -839,13 +924,13 @@ namespace }; } -ColumnsDescription IStorageURLBase::getTableStructureFromData( - const String & format, +std::pair IStorageURLBase::getTableStructureAndFormatFromDataImpl( + std::optional format, const String & uri, CompressionMethod compression_method, const HTTPHeaderEntries & headers, const std::optional & format_settings, - ContextPtr context) + const ContextPtr & context) { context->getRemoteHostFilter().checkURL(Poco::URI(uri)); @@ -858,7 +943,30 @@ ColumnsDescription IStorageURLBase::getTableStructureFromData( urls_to_check = {uri}; ReadBufferIterator read_buffer_iterator(urls_to_check, format, compression_method, headers, format_settings, context); - return readSchemaFromFormat(format, format_settings, read_buffer_iterator, urls_to_check.size() > 1, context); + if (format) + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format}; + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); +} + +ColumnsDescription IStorageURLBase::getTableStructureFromData( + const String & format, + const String & uri, + CompressionMethod compression_method, + const HTTPHeaderEntries & headers, + const std::optional & format_settings, + const ContextPtr & context) +{ + return getTableStructureAndFormatFromDataImpl(format, uri, compression_method, headers, format_settings, context).first; +} + +std::pair IStorageURLBase::getTableStructureAndFormatFromData( + const String & uri, + CompressionMethod compression_method, + const HTTPHeaderEntries & headers, + const std::optional & format_settings, + const ContextPtr & context) +{ + return getTableStructureAndFormatFromDataImpl(std::nullopt, uri, compression_method, headers, format_settings, context); } bool IStorageURLBase::supportsSubsetOfColumns(const ContextPtr & context) const @@ -1245,7 +1353,7 @@ StorageURL::StorageURL( const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, const String & comment, - ContextPtr context_, + const ContextPtr & context_, const String & compression_method_, const HTTPHeaderEntries & headers_, const String & http_method_, @@ -1278,7 +1386,7 @@ StorageURLWithFailover::StorageURLWithFailover( const std::optional & format_settings_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, + const ContextPtr & context_, const String & compression_method_) : StorageURL("", table_id_, format_name_, format_settings_, columns_, constraints_, String{}, context_, compression_method_) { @@ -1327,7 +1435,7 @@ FormatSettings StorageURL::getFormatSettingsFromArgs(const StorageFactory::Argum } size_t StorageURL::evalArgsAndCollectHeaders( - ASTs & url_function_args, HTTPHeaderEntries & header_entries, ContextPtr context) + ASTs & url_function_args, HTTPHeaderEntries & header_entries, const ContextPtr & context) { ASTs::iterator headers_it = 
url_function_args.end(); @@ -1403,7 +1511,7 @@ void StorageURL::processNamedCollectionResult(Configuration & configuration, con && configuration.http_method != Poco::Net::HTTPRequest::HTTP_PUT) throw Exception( ErrorCodes::BAD_ARGUMENTS, - "Http method can be POST or PUT (current: {}). For insert default is POST, for select GET", + "HTTP method can be POST or PUT (current: {}). For insert default is POST, for select GET", configuration.http_method); configuration.format = collection.getOrDefault("format", "auto"); @@ -1411,7 +1519,7 @@ void StorageURL::processNamedCollectionResult(Configuration & configuration, con configuration.structure = collection.getOrDefault("structure", "auto"); } -StorageURL::Configuration StorageURL::getConfiguration(ASTs & args, ContextPtr local_context) +StorageURL::Configuration StorageURL::getConfiguration(ASTs & args, const ContextPtr & local_context) { StorageURL::Configuration configuration; @@ -1435,7 +1543,7 @@ StorageURL::Configuration StorageURL::getConfiguration(ASTs & args, ContextPtr l } if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(Poco::URI(configuration.url).getPath(), true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(Poco::URI(configuration.url).getPath()).value_or("auto"); for (const auto & [header, value] : configuration.headers) { diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index c8b8d0942f4..18a90c7bb82 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -57,7 +57,15 @@ public: CompressionMethod compression_method, const HTTPHeaderEntries & headers, const std::optional & format_settings, - ContextPtr context); + const ContextPtr & context); + + static std::pair getTableStructureAndFormatFromData( + const String & uri, + CompressionMethod compression_method, + const HTTPHeaderEntries & headers, + const std::optional & format_settings, + const ContextPtr & context); + static SchemaCache & getSchemaCache(const ContextPtr & context); @@ -72,7 +80,7 @@ protected: IStorageURLBase( const String & uri_, - ContextPtr context_, + const ContextPtr & context_, const StorageID & id_, const String & format_name_, const std::optional & format_settings_, @@ -106,7 +114,7 @@ protected: const Names & column_names, const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, - ContextPtr context, + const ContextPtr & context, QueryProcessingStage::Enum & processed_stage, size_t max_block_size) const; @@ -114,7 +122,7 @@ protected: const Names & column_names, const ColumnsDescription & columns_description, const SelectQueryInfo & query_info, - ContextPtr context, + const ContextPtr & context, QueryProcessingStage::Enum & processed_stage, size_t max_block_size) const; @@ -127,6 +135,14 @@ protected: bool supportsTrivialCountOptimization() const override { return true; } private: + static std::pair getTableStructureAndFormatFromDataImpl( + std::optional format, + const String & uri, + CompressionMethod compression_method, + const HTTPHeaderEntries & headers, + const std::optional & format_settings, + const ContextPtr & context); + virtual Block getHeaderBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const = 0; }; @@ -160,7 +176,7 @@ public: const String & format, const std::optional & format_settings, String name_, - ContextPtr context, + const ContextPtr & context, UInt64 max_block_size, const ConnectionTimeouts & timeouts, CompressionMethod compression_method, @@ -231,7 
+247,7 @@ public: const String & format, const std::optional & format_settings, const Block & sample_block, - ContextPtr context, + const ContextPtr & context, const ConnectionTimeouts & timeouts, CompressionMethod compression_method, const HTTPHeaderEntries & headers = {}, @@ -263,7 +279,7 @@ public: const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, const String & comment, - ContextPtr context_, + const ContextPtr & context_, const String & compression_method_, const HTTPHeaderEntries & headers_ = {}, const String & method_ = "", @@ -292,12 +308,12 @@ public: std::string addresses_expr; }; - static Configuration getConfiguration(ASTs & args, ContextPtr context); + static Configuration getConfiguration(ASTs & args, const ContextPtr & context); /// Does evaluateConstantExpressionOrIdentifierAsLiteral() on all arguments. /// If `headers(...)` argument is present, parses it and moves it to the end of the array. /// Returns number of arguments excluding `headers(...)`. - static size_t evalArgsAndCollectHeaders(ASTs & url_function_args, HTTPHeaderEntries & header_entries, ContextPtr context); + static size_t evalArgsAndCollectHeaders(ASTs & url_function_args, HTTPHeaderEntries & header_entries, const ContextPtr & context); static void processNamedCollectionResult(Configuration & configuration, const NamedCollection & collection); }; @@ -314,7 +330,7 @@ public: const std::optional & format_settings_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, + const ContextPtr & context_, const String & compression_method_); void read( diff --git a/src/Storages/StorageURLCluster.cpp b/src/Storages/StorageURLCluster.cpp index 2365887983d..d0df74d7521 100644 --- a/src/Storages/StorageURLCluster.cpp +++ b/src/Storages/StorageURLCluster.cpp @@ -35,36 +35,43 @@ namespace ErrorCodes } StorageURLCluster::StorageURLCluster( - ContextPtr context_, + const ContextPtr & context, const String & cluster_name_, const String & uri_, const String & format_, - const String & compression_method_, + const String & compression_method, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - const StorageURL::Configuration & configuration_, - bool structure_argument_was_provided_) - : IStorageCluster(cluster_name_, table_id_, getLogger("StorageURLCluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) - , uri(uri_) + const StorageURL::Configuration & configuration_) + : IStorageCluster(cluster_name_, table_id_, getLogger("StorageURLCluster (" + table_id_.table_name + ")")) + , uri(uri_), format_name(format_) { - context_->getRemoteHostFilter().checkURL(Poco::URI(uri)); - context_->getHTTPHeaderFilter().checkHeaders(configuration_.headers); + context->getRemoteHostFilter().checkURL(Poco::URI(uri)); + context->getHTTPHeaderFilter().checkHeaders(configuration_.headers); StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - auto columns = StorageURL::getTableStructureFromData(format_, - uri, - chooseCompressionMethod(Poco::URI(uri).getPath(), compression_method_), - configuration_.headers, - std::nullopt, - context_); + ColumnsDescription columns; + if (format_name == "auto") + std::tie(columns, format_name) = StorageURL::getTableStructureAndFormatFromData( + uri, chooseCompressionMethod(Poco::URI(uri).getPath(), compression_method), configuration_.headers, std::nullopt, context); + else + columns = StorageURL::getTableStructureFromData( + format_, uri, 
chooseCompressionMethod(Poco::URI(uri).getPath(), compression_method), configuration_.headers, std::nullopt, context); + storage_metadata.setColumns(columns); } else + { + if (format_name == "auto") + format_name = StorageURL::getTableStructureAndFormatFromData( + uri, chooseCompressionMethod(Poco::URI(uri).getPath(), compression_method), configuration_.headers, std::nullopt, context).second; + storage_metadata.setColumns(columns_); + } storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); @@ -72,13 +79,14 @@ StorageURLCluster::StorageURLCluster( virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); } -void StorageURLCluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) +void StorageURLCluster::updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) { ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); if (!expression_list) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function urlCluster, got '{}'", queryToString(query)); - TableFunctionURLCluster::addColumnsStructureToArguments(expression_list->children, structure, context); + TableFunctionURLCluster::updateStructureAndFormatArgumentsIfNeeded( + expression_list->children, storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), format_name, context); } RemoteQueryExecutor::Extension StorageURLCluster::getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context) const diff --git a/src/Storages/StorageURLCluster.h b/src/Storages/StorageURLCluster.h index 07978040029..f57d262f434 100644 --- a/src/Storages/StorageURLCluster.h +++ b/src/Storages/StorageURLCluster.h @@ -19,16 +19,15 @@ class StorageURLCluster : public IStorageCluster { public: StorageURLCluster( - ContextPtr context_, + const ContextPtr & context, const String & cluster_name_, const String & uri_, const String & format_, - const String & compression_method_, + const String & compression_method, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - const StorageURL::Configuration & configuration_, - bool structure_argument_was_provided_); + const StorageURL::Configuration & configuration_); std::string getName() const override { return "URLCluster"; } @@ -41,11 +40,10 @@ public: bool supportsTrivialCountOptimization() const override { return true; } private: - void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) override; + void updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) override; String uri; String format_name; - String compression_method; NamesAndTypesList virtual_columns; }; diff --git a/src/Storages/StorageView.cpp b/src/Storages/StorageView.cpp index f0f9b9540de..181fd0ac61c 100644 --- a/src/Storages/StorageView.cpp +++ b/src/Storages/StorageView.cpp @@ -112,7 +112,15 @@ StorageView::StorageView( : IStorage(table_id_) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + if (!is_parameterized_view_) + { + /// If CREATE query is to create parameterized view, then we dont want to set columns + if (!query.isParameterizedView()) + storage_metadata.setColumns(columns_); + } + else + storage_metadata.setColumns(columns_); + 
storage_metadata.setComment(comment); if (!query.select) @@ -243,8 +251,7 @@ void StorageView::replaceWithSubquery(ASTSelectQuery & outer_query, ASTPtr view_ view_name = table_expression->database_and_table_name; table_expression->database_and_table_name = {}; - table_expression->subquery = std::make_shared(); - table_expression->subquery->children.push_back(view_query); + table_expression->subquery = std::make_shared(view_query); table_expression->subquery->setAlias(alias); for (auto & child : table_expression->children) diff --git a/src/Storages/StorageXDBC.cpp b/src/Storages/StorageXDBC.cpp index 259abefb00f..fb8fa2d6da4 100644 --- a/src/Storages/StorageXDBC.cpp +++ b/src/Storages/StorageXDBC.cpp @@ -59,7 +59,7 @@ std::vector> StorageXDBC::getReadURIParams( const Names & /* column_names */, const StorageSnapshotPtr & /*storage_snapshot*/, const SelectQueryInfo & /*query_info*/, - ContextPtr /*context*/, + const ContextPtr & /*context*/, QueryProcessingStage::Enum & /*processed_stage*/, size_t max_block_size) const { @@ -70,7 +70,7 @@ std::function StorageXDBC::getReadPOSTDataCallback( const Names & column_names, const ColumnsDescription & columns_description, const SelectQueryInfo & query_info, - ContextPtr local_context, + const ContextPtr & local_context, QueryProcessingStage::Enum & /*processed_stage*/, size_t /*max_block_size*/) const { diff --git a/src/Storages/StorageXDBC.h b/src/Storages/StorageXDBC.h index cba15a83226..7cec7266760 100644 --- a/src/Storages/StorageXDBC.h +++ b/src/Storages/StorageXDBC.h @@ -55,7 +55,7 @@ private: const Names & column_names, const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, - ContextPtr context, + const ContextPtr & context, QueryProcessingStage::Enum & processed_stage, size_t max_block_size) const override; @@ -63,7 +63,7 @@ private: const Names & column_names, const ColumnsDescription & columns_description, const SelectQueryInfo & query_info, - ContextPtr context, + const ContextPtr & context, QueryProcessingStage::Enum & processed_stage, size_t max_block_size) const override; diff --git a/src/Storages/System/StorageSystemDashboards.cpp b/src/Storages/System/StorageSystemDashboards.cpp index 7e545757129..7c9e8b73519 100644 --- a/src/Storages/System/StorageSystemDashboards.cpp +++ b/src/Storages/System/StorageSystemDashboards.cpp @@ -26,192 +26,329 @@ void StorageSystemDashboards::fillData(MutableColumns & res_columns, ContextPtr, { static const std::vector> dashboards { + /// Default dashboard for self-managed ClickHouse { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Queries/second" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(ProfileEvent_Query) -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "CPU Usage (cores)" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(ProfileEvent_OSCPUVirtualTimeMicroseconds) / 1000000 -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", 
"Queries Running" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(CurrentMetric_Query) -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Merges Running" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(CurrentMetric_Merge) -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Selected Bytes/second" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(ProfileEvent_SelectedBytes) -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "IO Wait" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(ProfileEvent_OSIOWaitMicroseconds) / 1000000 -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "CPU Wait" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(ProfileEvent_OSCPUWaitMicroseconds) / 1000000 -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "OS CPU Usage (Userspace)" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value) -FROM system.asynchronous_metric_log +FROM merge('system', '^asynchronous_metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} AND metric = 'OSUserTimeNormalized' GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "OS CPU Usage (Kernel)" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value) -FROM system.asynchronous_metric_log +FROM merge('system', '^asynchronous_metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} AND metric = 'OSSystemTimeNormalized' GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Read From Disk" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(ProfileEvent_OSReadBytes) -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= 
toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Read From Filesystem" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(ProfileEvent_OSReadChars) -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Memory (tracked)" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(CurrentMetric_MemoryTracking) -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Load Average (15 minutes)" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value) -FROM system.asynchronous_metric_log +FROM merge('system', '^asynchronous_metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} AND metric = 'LoadAverage15' GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Selected Rows/second" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(ProfileEvent_SelectedRows) -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Inserted Rows/second" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(ProfileEvent_InsertedRows) -FROM system.metric_log +FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Total MergeTree Parts" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value) -FROM system.asynchronous_metric_log +FROM merge('system', '^asynchronous_metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} AND metric = 'TotalPartsOfMergeTreeTables' GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, { - { "dashboard", "overview" }, + { "dashboard", "Overview" }, { "title", "Max Parts For Partition" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, max(value) -FROM system.asynchronous_metric_log +FROM merge('system', '^asynchronous_metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} AND metric = 'MaxPartCountForPartition' GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } + }, + /// Default dashboard for ClickHouse 
Cloud + { + { "dashboard", "Cloud overview" }, + { "title", "Queries/second" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_Query) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "CPU Usage (cores)" }, + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(metric) / 1000000\nFROM (\n SELECT event_time, sum(ProfileEvent_OSCPUVirtualTimeMicroseconds) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32} GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Queries Running" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(CurrentMetric_Query) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Merges Running" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(CurrentMetric_Merge) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Selected Bytes/second" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_SelectedBytes) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "IO Wait (local fs)" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_OSIOWaitMicroseconds) / 1000000 AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "S3 read wait" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT 
event_time, sum(ProfileEvent_ReadBufferFromS3Microseconds) / 1000000 AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "S3 read errors/sec" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_ReadBufferFromS3RequestsErrors) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "CPU Wait" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_OSCPUWaitMicroseconds) / 1000000 AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "OS CPU Usage (Userspace, normalized)" }, + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value)\nFROM clusterAllReplicas(default, merge('system', '^asynchronous_metric_log'))\nWHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32}\nAND metric = 'OSUserTimeNormalized'\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "OS CPU Usage (Kernel, normalized)" }, + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value)\nFROM clusterAllReplicas(default, merge('system', '^asynchronous_metric_log'))\nWHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32}\nAND metric = 'OSSystemTimeNormalized'\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Read From Disk (bytes/sec)" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_OSReadBytes) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Read From Filesystem (bytes/sec)" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_OSReadChars) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n 
GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Memory (tracked, bytes)" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(CurrentMetric_MemoryTracking) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Load Average (15 minutes)" }, + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value)\nFROM (\n SELECT event_time, sum(value) AS value\n FROM clusterAllReplicas(default, merge('system', '^asynchronous_metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n AND metric = 'LoadAverage15'\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Selected Rows/sec" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_SelectedRows) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Inserted Rows/sec" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_InsertedRows) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Total MergeTree Parts" }, + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, max(value)\nFROM clusterAllReplicas(default, merge('system', '^asynchronous_metric_log'))\nWHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32}\nAND metric = 'TotalPartsOfMergeTreeTables'\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Max Parts For Partition" }, + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, max(value)\nFROM clusterAllReplicas(default, merge('system', '^asynchronous_metric_log'))\nWHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32}\nAND metric = 'MaxPartCountForPartition'\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Read From S3 (bytes/sec)" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL 
{rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_ReadBufferFromS3Bytes) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Filesystem Cache Size" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(CurrentMetric_FilesystemCacheSize) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Disk S3 write req/sec" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_DiskS3PutObject + ProfileEvent_DiskS3UploadPart + ProfileEvent_DiskS3CreateMultipartUpload + ProfileEvent_DiskS3CompleteMultipartUpload) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Disk S3 read req/sec" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_DiskS3GetObject + ProfileEvent_DiskS3HeadObject + ProfileEvent_DiskS3ListObjects) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "FS cache hit rate" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, sum(ProfileEvent_CachedReadBufferReadFromCacheBytes) / (sum(ProfileEvent_CachedReadBufferReadFromCacheBytes) + sum(ProfileEvent_CachedReadBufferReadFromSourceBytes)) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Page cache hit rate" }, + { "query", "SELECT \n toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t,\n avg(metric)\nFROM (\n SELECT event_time, greatest(0, (sum(ProfileEvent_OSReadChars) - sum(ProfileEvent_OSReadBytes)) / (sum(ProfileEvent_OSReadChars) + sum(ProfileEvent_ReadBufferFromS3Bytes))) AS metric \n FROM clusterAllReplicas(default, merge('system', '^metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n GROUP BY 
event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Network receive bytes/sec" }, + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value)\nFROM (\n SELECT event_time, sum(value) AS value\n FROM clusterAllReplicas(default, merge('system', '^asynchronous_metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n AND metric LIKE 'NetworkReceiveBytes%'\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Network send bytes/sec" }, + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value)\nFROM (\n SELECT event_time, sum(value) AS value\n FROM clusterAllReplicas(default, merge('system', '^asynchronous_metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n AND metric LIKE 'NetworkSendBytes%'\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } } }; diff --git a/src/Storages/System/StorageSystemDetachedParts.cpp b/src/Storages/System/StorageSystemDetachedParts.cpp index a9cd5f2610a..3dae43976f7 100644 --- a/src/Storages/System/StorageSystemDetachedParts.cpp +++ b/src/Storages/System/StorageSystemDetachedParts.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include diff --git a/src/Storages/System/StorageSystemTables.cpp b/src/Storages/System/StorageSystemTables.cpp index e0d2dd03e78..47c4a03a595 100644 --- a/src/Storages/System/StorageSystemTables.cpp +++ b/src/Storages/System/StorageSystemTables.cpp @@ -43,6 +43,7 @@ StorageSystemTables::StorageSystemTables(const StorageID & table_id_) {"data_paths", std::make_shared(std::make_shared()), "Paths to the table data in the file systems."}, {"metadata_path", std::make_shared(), "Path to the table metadata in the file system."}, {"metadata_modification_time", std::make_shared(), "Time of latest modification of the table metadata."}, + {"metadata_version", std::make_shared(), "Metadata version for ReplicatedMergeTree table, 0 for non ReplicatedMergeTree table."}, {"dependencies_database", std::make_shared(std::make_shared()), "Database dependencies."}, {"dependencies_table", std::make_shared(std::make_shared()), "Table dependencies (materialized views the current table)."}, {"create_table_query", std::make_shared(), "The query that was used to create the table."}, @@ -287,6 +288,11 @@ protected: if (columns_mask[src_index++]) res_columns[res_index++]->insertDefault(); + // metadata_version + // Temporary tables does not support replication + if (columns_mask[src_index++]) + res_columns[res_index++]->insertDefault(); + // dependencies_database if (columns_mask[src_index++]) res_columns[res_index++]->insertDefault(); @@ -311,7 +317,7 @@ protected: while (src_index < columns_mask.size()) { // total_rows - if (src_index == 18 && columns_mask[src_index]) + if (src_index == 19 && columns_mask[src_index]) { if (auto total_rows = table.second->totalRows(settings)) res_columns[res_index++]->insert(*total_rows); @@ -319,7 +325,7 @@ protected: res_columns[res_index++]->insertDefault(); } // total_bytes - else if (src_index == 19 && columns_mask[src_index]) + else if (src_index == 20 && columns_mask[src_index]) { if (auto 
total_bytes = table.second->totalBytes(settings)) res_columns[res_index++]->insert(*total_bytes); @@ -418,6 +424,18 @@ protected: if (columns_mask[src_index++]) res_columns[res_index++]->insert(static_cast(database->getObjectMetadataModificationTime(table_name))); + StorageMetadataPtr metadata_snapshot; + if (table) + metadata_snapshot = table->getInMemoryMetadataPtr(); + + if (columns_mask[src_index++]) + { + if (metadata_snapshot && table->supportsReplication()) + res_columns[res_index++]->insert(metadata_snapshot->metadata_version); + else + res_columns[res_index++]->insertDefault(); + } + { Array views_table_name_array; Array views_database_name_array; @@ -482,10 +500,6 @@ protected: else src_index += 3; - StorageMetadataPtr metadata_snapshot; - if (table) - metadata_snapshot = table->getInMemoryMetadataPtr(); - ASTPtr expression_ptr; if (columns_mask[src_index++]) { diff --git a/src/Storages/System/getQueriedColumnsMaskAndHeader.cpp b/src/Storages/System/getQueriedColumnsMaskAndHeader.cpp index c29ccb590ed..b93fe7b8034 100644 --- a/src/Storages/System/getQueriedColumnsMaskAndHeader.cpp +++ b/src/Storages/System/getQueriedColumnsMaskAndHeader.cpp @@ -11,10 +11,11 @@ std::pair, Block> getQueriedColumnsMaskAndHeader(const Block NameSet names_set(column_names.begin(), column_names.end()); for (size_t i = 0; i < columns_mask.size(); ++i) { - if (names_set.contains(sample_block.getByPosition(i).name)) + const auto & column_with_type_and_name = sample_block.getByPosition(i); + if (names_set.contains(column_with_type_and_name.name)) { columns_mask[i] = 1; - header.insert(sample_block.getByPosition(i)); + header.insert(column_with_type_and_name); } } diff --git a/src/TableFunctions/ITableFunctionCluster.h b/src/TableFunctions/ITableFunctionCluster.h index 7e81d6d21b7..9f56d781bc9 100644 --- a/src/TableFunctions/ITableFunctionCluster.h +++ b/src/TableFunctions/ITableFunctionCluster.h @@ -4,7 +4,6 @@ #include #include -#include #include #include #include @@ -29,14 +28,14 @@ public: String getName() const override = 0; String getSignature() const override = 0; - static void addColumnsStructureToArguments(ASTs & args, const String & desired_structure, const ContextPtr & context) + static void updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure_, const String & format_, const ContextPtr & context) { if (args.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected empty list of arguments for {}Cluster table function", Base::name); ASTPtr cluster_name_arg = args.front(); args.erase(args.begin()); - Base::addColumnsStructureToArguments(args, desired_structure, context); + Base::updateStructureAndFormatArgumentsIfNeeded(args, structure_, format_, context); args.insert(args.begin(), cluster_name_arg); } diff --git a/src/TableFunctions/ITableFunctionFileLike.cpp b/src/TableFunctions/ITableFunctionFileLike.cpp index b88af855309..b697f3df925 100644 --- a/src/TableFunctions/ITableFunctionFileLike.cpp +++ b/src/TableFunctions/ITableFunctionFileLike.cpp @@ -27,14 +27,14 @@ void ITableFunctionFileLike::parseFirstArguments(const ASTPtr & arg, const Conte filename = checkAndGetLiteralArgument(arg, "source"); } -String ITableFunctionFileLike::getFormatFromFirstArgument() +std::optional ITableFunctionFileLike::tryGetFormatFromFirstArgument() { - return FormatFactory::instance().getFormatFromFileName(filename, true); + return FormatFactory::instance().tryGetFormatFromFileName(filename); } bool ITableFunctionFileLike::supportsReadingSubsetOfColumns(const ContextPtr & context) { - 
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format, context); + return format != "auto" && FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format, context); } void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, ContextPtr context) @@ -63,7 +63,10 @@ void ITableFunctionFileLike::parseArgumentsImpl(ASTs & args, const ContextPtr & format = checkAndGetLiteralArgument(args[1], "format"); if (format == "auto") - format = getFormatFromFirstArgument(); + { + if (auto format_from_first_argument = tryGetFormatFromFirstArgument()) + format = *format_from_first_argument; + } if (args.size() > 2) { @@ -79,34 +82,37 @@ void ITableFunctionFileLike::parseArgumentsImpl(ASTs & args, const ContextPtr & compression_method = checkAndGetLiteralArgument(args[3], "compression_method"); } -void ITableFunctionFileLike::addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr &) +void ITableFunctionFileLike::updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure, const String & format, const ContextPtr & context) { if (args.empty() || args.size() > getMaxNumberOfArguments()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected 1 to {} arguments in table function, got {}", getMaxNumberOfArguments(), args.size()); + auto format_literal = std::make_shared(format); auto structure_literal = std::make_shared(structure); + for (auto & arg : args) + arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); + /// f(filename) if (args.size() == 1) { - /// Add format=auto before structure argument. - args.push_back(std::make_shared("auto")); + args.push_back(format_literal); args.push_back(structure_literal); } /// f(filename, format) else if (args.size() == 2) { + if (checkAndGetLiteralArgument(args[1], "format") == "auto") + args.back() = format_literal; args.push_back(structure_literal); } - /// f(filename, format, 'auto') - else if (args.size() == 3) + /// f(filename, format, structure) or f(filename, format, structure, compression) + else if (args.size() >= 3) { - args.back() = structure_literal; - } - /// f(filename, format, 'auto', compression) - else if (args.size() == 4) - { - args[args.size() - 2] = structure_literal; + if (checkAndGetLiteralArgument(args[1], "format") == "auto") + args[1] = format_literal; + if (checkAndGetLiteralArgument(args[2], "structure") == "auto") + args[2] = structure_literal; } } diff --git a/src/TableFunctions/ITableFunctionFileLike.h b/src/TableFunctions/ITableFunctionFileLike.h index 5fe86587797..c8412905e44 100644 --- a/src/TableFunctions/ITableFunctionFileLike.h +++ b/src/TableFunctions/ITableFunctionFileLike.h @@ -31,7 +31,7 @@ public: static size_t getMaxNumberOfArguments() { return 4; } - static void addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr &); + static void updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure, const String & format, const ContextPtr &); protected: @@ -39,10 +39,9 @@ protected: virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context); virtual void parseFirstArguments(const ASTPtr & arg, const ContextPtr & context); - virtual String getFormatFromFirstArgument(); + virtual std::optional tryGetFormatFromFirstArgument(); String filename; - String path_to_archive; String format = "auto"; String structure = "auto"; String compression_method = "auto"; diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp 
b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp index b098cac5144..066d6338b6a 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp @@ -58,7 +58,7 @@ void TableFunctionAzureBlobStorage::parseArgumentsImpl(ASTs & engine_args, const configuration.blobs_paths = {configuration.blob_path}; if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.blob_path).value_or("auto"); } else { @@ -155,7 +155,7 @@ void TableFunctionAzureBlobStorage::parseArgumentsImpl(ASTs & engine_args, const configuration.blobs_paths = {configuration.blob_path}; if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.blob_path).value_or("auto"); } } @@ -174,15 +174,24 @@ void TableFunctionAzureBlobStorage::parseArguments(const ASTPtr & ast_function, parseArgumentsImpl(args, context); } -void TableFunctionAzureBlobStorage::addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context) +void TableFunctionAzureBlobStorage::updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure, const String & format, const ContextPtr & context) { - if (tryGetNamedCollectionWithOverrides(args, context)) + if (auto collection = tryGetNamedCollectionWithOverrides(args, context)) { - /// In case of named collection, just add key-value pair "structure='...'" - /// at the end of arguments to override existed structure. - ASTs equal_func_args = {std::make_shared("structure"), std::make_shared(structure)}; - auto equal_func = makeASTFunction("equals", std::move(equal_func_args)); - args.push_back(equal_func); + /// In case of named collection, just add key-value pairs "format='...', structure='...'" + /// at the end of arguments to override existed format and structure with "auto" values. + if (collection->getOrDefault("format", "auto") == "auto") + { + ASTs format_equal_func_args = {std::make_shared("format"), std::make_shared(format)}; + auto format_equal_func = makeASTFunction("equals", std::move(format_equal_func_args)); + args.push_back(format_equal_func); + } + if (collection->getOrDefault("structure", "auto") == "auto") + { + ASTs structure_equal_func_args = {std::make_shared("structure"), std::make_shared(structure)}; + auto structure_equal_func = makeASTFunction("equals", std::move(structure_equal_func_args)); + args.push_back(structure_equal_func); + } } else { @@ -191,65 +200,126 @@ void TableFunctionAzureBlobStorage::addColumnsStructureToArguments(ASTs & args, "Storage Azure requires 3 to 7 arguments: " "AzureBlobStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])"); + auto format_literal = std::make_shared(format); auto structure_literal = std::make_shared(structure); + for (auto & arg : args) + arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); + auto is_format_arg = [](const std::string & s) -> bool { return s == "auto" || FormatFactory::instance().getAllFormats().contains(s); }; - + /// (connection_string, container_name, blobpath) if (args.size() == 3) { - /// Add format=auto & compression=auto before structure argument. 
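As a sketch of what this argument normalization enables on the SQL side — the cluster name 'default', the connection string, container and blob path below are placeholders — the format and structure arguments can be omitted entirely and are filled in from the data whenever they resolve to 'auto':

SELECT count() FROM azureBlobStorageCluster('default', '<connection_string>', 'my-container', 'data/sample');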
- args.push_back(std::make_shared("auto")); + args.push_back(format_literal); + /// Add compression = "auto" before structure argument. args.push_back(std::make_shared("auto")); args.push_back(structure_literal); } + /// (connection_string, container_name, blobpath, structure) or + /// (connection_string, container_name, blobpath, format) + /// We can distinguish them by looking at the 4-th argument: check if it's format name or not. else if (args.size() == 4) { auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name/structure"); + /// (..., format) -> (..., format, compression, structure) if (is_format_arg(fourth_arg)) { + if (fourth_arg == "auto") + args[3] = format_literal; /// Add compression=auto before structure argument. args.push_back(std::make_shared("auto")); args.push_back(structure_literal); } + /// (..., structure) -> (..., format, compression, structure) else { - args.back() = structure_literal; + auto structure_arg = args.back(); + args[3] = format_literal; + /// Add compression=auto before structure argument. + args.push_back(std::make_shared("auto")); + if (fourth_arg == "auto") + args.push_back(structure_literal); + else + args.push_back(structure_arg); } } + /// (connection_string, container_name, blobpath, format, compression) or + /// (storage_account_url, container_name, blobpath, account_name, account_key) + /// We can distinguish them by looking at the 4-th argument: check if it's format name or not. else if (args.size() == 5) { auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name"); - if (!is_format_arg(fourth_arg)) + /// (..., format, compression) -> (..., format, compression, structure) + if (is_format_arg(fourth_arg)) { - /// Add format=auto & compression=auto before structure argument. - args.push_back(std::make_shared("auto")); - args.push_back(std::make_shared("auto")); + if (fourth_arg == "auto") + args[3] = format_literal; + args.push_back(structure_literal); } - args.push_back(structure_literal); - } - else if (args.size() == 6) - { - auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name"); - if (!is_format_arg(fourth_arg)) + /// (..., account_name, account_key) -> (..., account_name, account_key, format, compression, structure) + else { + args.push_back(format_literal); /// Add compression=auto before structure argument. args.push_back(std::make_shared("auto")); args.push_back(structure_literal); } + } + /// (connection_string, container_name, blobpath, format, compression, structure) or + /// (storage_account_url, container_name, blobpath, account_name, account_key, structure) or + /// (storage_account_url, container_name, blobpath, account_name, account_key, format) + else if (args.size() == 6) + { + auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name"); + auto sixth_arg = checkAndGetLiteralArgument(args[5], "format/structure"); + + /// (..., format, compression, structure) + if (is_format_arg(fourth_arg)) + { + if (fourth_arg == "auto") + args[3] = format_literal; + if (checkAndGetLiteralArgument(args[5], "structure") == "auto") + args[5] = structure_literal; + } + /// (..., account_name, account_key, format) -> (..., account_name, account_key, format, compression, structure) + else if (is_format_arg(sixth_arg)) + { + if (sixth_arg == "auto") + args[5] = format_literal; + /// Add compression=auto before structure argument. 
+ args.push_back(std::make_shared("auto")); + args.push_back(structure_literal); + } + /// (..., account_name, account_key, structure) -> (..., account_name, account_key, format, compression, structure) else { - args.back() = structure_literal; + auto structure_arg = args.back(); + args[5] = format_literal; + /// Add compression=auto before structure argument. + args.push_back(std::make_shared("auto")); + if (sixth_arg == "auto") + args.push_back(structure_literal); + else + args.push_back(structure_arg); } } + /// (storage_account_url, container_name, blobpath, account_name, account_key, format, compression) else if (args.size() == 7) { + /// (..., format, compression) -> (..., format, compression, structure) + if (checkAndGetLiteralArgument(args[5], "format") == "auto") + args[5] = format_literal; args.push_back(structure_literal); } + /// (storage_account_url, container_name, blobpath, account_name, account_key, format, compression, structure) else if (args.size() == 8) { - args.back() = structure_literal; + if (checkAndGetLiteralArgument(args[5], "format") == "auto") + args[5] = format_literal; + if (checkAndGetLiteralArgument(args[7], "structure") == "auto") + args[7] = structure_literal; } } } @@ -263,7 +333,9 @@ ColumnsDescription TableFunctionAzureBlobStorage::getActualTableStructure(Contex auto settings = StorageAzureBlob::createSettings(context); auto object_storage = std::make_unique("AzureBlobStorageTableFunction", std::move(client), std::move(settings), configuration.container); - return StorageAzureBlob::getTableStructureFromData(object_storage.get(), configuration, std::nullopt, context, false); + if (configuration.format == "auto") + return StorageAzureBlob::getTableStructureAndFormatFromData(object_storage.get(), configuration, std::nullopt, context).first; + return StorageAzureBlob::getTableStructureFromData(object_storage.get(), configuration, std::nullopt, context); } return parseColumnsListFromString(configuration.structure, context); diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.h b/src/TableFunctions/TableFunctionAzureBlobStorage.h index 1a221f60c55..9622881b417 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.h +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.h @@ -55,7 +55,7 @@ public: virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context); - static void addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context); + static void updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure, const String & format, const ContextPtr & context); protected: diff --git a/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp b/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp index 1c3b302a186..04dddca7672 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp @@ -21,9 +21,8 @@ StoragePtr TableFunctionAzureBlobStorageCluster::executeImpl( { StoragePtr storage; ColumnsDescription columns; - bool structure_argument_was_provided = configuration.structure != "auto"; - if (structure_argument_was_provided) + if (configuration.structure != "auto") { columns = parseColumnsListFromString(configuration.structure, context); } @@ -59,8 +58,7 @@ StoragePtr TableFunctionAzureBlobStorageCluster::executeImpl( StorageID(getDatabaseName(), table_name), columns, ConstraintsDescription{}, - context, - structure_argument_was_provided); + context); } storage->startup(); diff --git 
a/src/TableFunctions/TableFunctionExplain.cpp b/src/TableFunctions/TableFunctionExplain.cpp index f993a9820cb..400fc81e6d4 100644 --- a/src/TableFunctions/TableFunctionExplain.cpp +++ b/src/TableFunctions/TableFunctionExplain.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -21,6 +22,7 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int BAD_ARGUMENTS; + extern const int UNEXPECTED_AST_STRUCTURE; } namespace @@ -103,11 +105,25 @@ void TableFunctionExplain::parseArguments(const ASTPtr & ast_function, ContextPt if (function->arguments->children.size() > 2) { - const auto & query_arg = function->arguments->children[2]; + const auto & subquery_arg = function->arguments->children[2]; + const auto * subquery = subquery_arg->as(); + + if (!subquery) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Table function '{}' requires a subquery argument, got '{}'", + getName(), queryToString(subquery_arg)); + + if (subquery->children.empty()) + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, + "A subquery AST element must have a child"); + + const auto & query_arg = subquery->children[0]; + if (!query_arg->as()) throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Table function '{}' requires a EXPLAIN SELECT query argument, got EXPLAIN '{}'", + "Table function '{}' requires a EXPLAIN's SELECT query argument, got '{}'", getName(), queryToString(query_arg)); + explain_query->setExplainedQuery(query_arg); } else if (kind != ASTExplainQuery::ExplainKind::CurrentTransaction) diff --git a/src/TableFunctions/TableFunctionFile.cpp b/src/TableFunctions/TableFunctionFile.cpp index 8a9dde374ec..b481076e9b6 100644 --- a/src/TableFunctions/TableFunctionFile.cpp +++ b/src/TableFunctions/TableFunctionFile.cpp @@ -54,12 +54,12 @@ void TableFunctionFile::parseFirstArguments(const ASTPtr & arg, const ContextPtr throw Exception(ErrorCodes::BAD_ARGUMENTS, "The first argument of table function '{}' mush be path or file descriptor", getName()); } -String TableFunctionFile::getFormatFromFirstArgument() +std::optional TableFunctionFile::tryGetFormatFromFirstArgument() { if (fd >= 0) - return FormatFactory::instance().getFormatFromFileDescriptor(fd); + return FormatFactory::instance().tryGetFormatFromFileDescriptor(fd); else - return FormatFactory::instance().getFormatFromFileName(filename, true); + return FormatFactory::instance().tryGetFormatFromFileName(filename); } StoragePtr TableFunctionFile::getStorage(const String & source, @@ -104,10 +104,11 @@ ColumnsDescription TableFunctionFile::getActualTableStructure(ContextPtr context archive_info = StorageFile::getArchiveInfo(path_to_archive, filename, context->getUserFilesPath(), context, total_bytes_to_read); + if (format == "auto") + return StorageFile::getTableStructureAndFormatFromFile(paths, compression_method, std::nullopt, context, archive_info).first; return StorageFile::getTableStructureFromFile(format, paths, compression_method, std::nullopt, context, archive_info); } - return parseColumnsListFromString(structure, context); } diff --git a/src/TableFunctions/TableFunctionFile.h b/src/TableFunctions/TableFunctionFile.h index 6eaab29db8a..c1924028b49 100644 --- a/src/TableFunctions/TableFunctionFile.h +++ b/src/TableFunctions/TableFunctionFile.h @@ -26,8 +26,9 @@ public: protected: int fd = -1; + String path_to_archive; void parseFirstArguments(const ASTPtr & arg, const ContextPtr & context) override; - String getFormatFromFirstArgument() override; + std::optional tryGetFormatFromFirstArgument() override; private: StoragePtr 
getStorage( diff --git a/src/TableFunctions/TableFunctionFileCluster.cpp b/src/TableFunctions/TableFunctionFileCluster.cpp index 843909e2a58..3e53349b022 100644 --- a/src/TableFunctions/TableFunctionFileCluster.cpp +++ b/src/TableFunctions/TableFunctionFileCluster.cpp @@ -43,8 +43,7 @@ StoragePtr TableFunctionFileCluster::getStorage( compression_method, StorageID(getDatabaseName(), table_name), columns, - ConstraintsDescription{}, - structure != "auto"); + ConstraintsDescription{}); } return storage; diff --git a/src/TableFunctions/TableFunctionFormat.cpp b/src/TableFunctions/TableFunctionFormat.cpp index 4b6d0f70c0a..ad2a142a140 100644 --- a/src/TableFunctions/TableFunctionFormat.cpp +++ b/src/TableFunctions/TableFunctionFormat.cpp @@ -33,7 +33,9 @@ namespace ErrorCodes namespace { -/* format(format_name, data) - ... +/* format(format_name, structure, data) - parses data according to the specified format and structure. + * format(format_name, data) - infers the schema from the data and parses it according to the specified format. + * format(data) - detects the format, infers the schema and parses data according to inferred format and structure. */ class TableFunctionFormat : public ITableFunction { @@ -49,11 +51,11 @@ private: ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; - Block parseData(ColumnsDescription columns, ContextPtr context) const; + Block parseData(const ColumnsDescription & columns, const String & format_name, const ContextPtr & context) const; - String format; - String data; + String format = "auto"; String structure = "auto"; + String data; }; void TableFunctionFormat::parseArguments(const ASTPtr & ast_function, ContextPtr context) @@ -65,14 +67,15 @@ void TableFunctionFormat::parseArguments(const ASTPtr & ast_function, ContextPtr ASTs & args = args_func.at(0)->children; - if (args.size() != 2 && args.size() != 3) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' requires 2 or 3 arguments: format, [structure], data", getName()); + if (args.empty() || args.size() > 3) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' requires from 1 to 3 arguments: [format, [structure]], data", getName()); for (auto & arg : args) arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); - format = checkAndGetLiteralArgument(args[0], "format"); data = checkAndGetLiteralArgument(args.back(), "data"); + if (args.size() > 1) + format = checkAndGetLiteralArgument(args[0], "format"); if (args.size() == 3) structure = checkAndGetLiteralArgument(args[1], "structure"); } @@ -82,19 +85,21 @@ ColumnsDescription TableFunctionFormat::getActualTableStructure(ContextPtr conte if (structure == "auto") { SingleReadBufferIterator read_buffer_iterator(std::make_unique(data)); - return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, false, context); + if (format == "auto") + return detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, context).first; + return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, context); } return parseColumnsListFromString(structure, context); } -Block TableFunctionFormat::parseData(ColumnsDescription columns, ContextPtr context) const +Block TableFunctionFormat::parseData(const ColumnsDescription & columns, const String & format_name, const ContextPtr & context) const { Block block; for (const auto & name_and_type : 
columns.getAllPhysical()) block.insert({name_and_type.type->createColumn(), name_and_type.type, name_and_type.name}); auto read_buf = std::make_unique(data); - auto input_format = context->getInputFormat(format, *read_buf, block, context->getSettingsRef().max_block_size); + auto input_format = context->getInputFormat(format_name, *read_buf, block, context->getSettingsRef().max_block_size); QueryPipelineBuilder builder; builder.init(Pipe(input_format)); if (columns.hasDefaults()) @@ -120,10 +125,24 @@ Block TableFunctionFormat::parseData(ColumnsDescription columns, ContextPtr cont return concatenateBlocks(blocks); } -StoragePtr TableFunctionFormat::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const +StoragePtr TableFunctionFormat::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const { - auto columns = getActualTableStructure(context, is_insert_query); - Block res_block = parseData(columns, context); + ColumnsDescription columns; + String format_name = format; + if (structure == "auto") + { + SingleReadBufferIterator read_buffer_iterator(std::make_unique(data)); + if (format_name == "auto") + std::tie(columns, format_name) = detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, context); + else + columns = readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, context); + } + else + { + columns = parseColumnsListFromString(structure, context); + } + + Block res_block = parseData(columns, format_name, context); auto res = std::make_shared(StorageID(getDatabaseName(), table_name), columns, res_block); res->startup(); return res; diff --git a/src/TableFunctions/TableFunctionHDFS.cpp b/src/TableFunctions/TableFunctionHDFS.cpp index 8d48a7ba30e..2dac4398144 100644 --- a/src/TableFunctions/TableFunctionHDFS.cpp +++ b/src/TableFunctions/TableFunctionHDFS.cpp @@ -33,6 +33,8 @@ ColumnsDescription TableFunctionHDFS::getActualTableStructure(ContextPtr context if (structure == "auto") { context->checkAccess(getSourceAccessType()); + if (format == "auto") + return StorageHDFS::getTableStructureAndFormatFromData(filename, compression_method, context).first; return StorageHDFS::getTableStructureFromData(format, filename, compression_method, context); } diff --git a/src/TableFunctions/TableFunctionHDFSCluster.cpp b/src/TableFunctions/TableFunctionHDFSCluster.cpp index 6fb7ed0fce5..57ce6d2b9ff 100644 --- a/src/TableFunctions/TableFunctionHDFSCluster.cpp +++ b/src/TableFunctions/TableFunctionHDFSCluster.cpp @@ -45,8 +45,7 @@ StoragePtr TableFunctionHDFSCluster::getStorage( format, columns, ConstraintsDescription{}, - compression_method, - structure != "auto"); + compression_method); } return storage; } diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index a9c5a5c99f0..3fedd38277c 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -61,12 +61,11 @@ void TableFunctionS3::parseArgumentsImpl(ASTs & args, const ContextPtr & context if (configuration.format == "auto") { String file_path = named_collection->getOrDefault("filename", Poco::URI(named_collection->get("url")).getPath()); - configuration.format = FormatFactory::instance().getFormatFromFileName(file_path, true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(file_path).value_or("auto"); } } else 
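The format table function, as documented in the comment above, now supports three call forms; a minimal usage sketch (the inline JSON data is purely illustrative):

SELECT * FROM format(JSONEachRow, '{"a": 1, "b": "Hello"}');  -- structure is inferred from the data
SELECT * FROM format('{"a": 1, "b": "Hello"}');               -- new single-argument form: the format is detected as well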
{ - size_t count = StorageURL::evalArgsAndCollectHeaders(args, configuration.headers_from_ast, context); if (count == 0 || count > 7) @@ -216,7 +215,7 @@ void TableFunctionS3::parseArgumentsImpl(ASTs & args, const ContextPtr & context configuration.auth_settings.no_sign_request = no_sign_request; if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(Poco::URI(url).getPath(), true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(Poco::URI(url).getPath()).value_or("auto"); } configuration.keys = {configuration.url.key}; @@ -238,15 +237,24 @@ void TableFunctionS3::parseArguments(const ASTPtr & ast_function, ContextPtr con parseArgumentsImpl(args, context); } -void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context) +void TableFunctionS3::updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure, const String & format, const ContextPtr & context) { - if (tryGetNamedCollectionWithOverrides(args, context)) + if (auto collection = tryGetNamedCollectionWithOverrides(args, context)) { - /// In case of named collection, just add key-value pair "structure='...'" - /// at the end of arguments to override existed structure. - ASTs equal_func_args = {std::make_shared("structure"), std::make_shared(structure)}; - auto equal_func = makeASTFunction("equals", std::move(equal_func_args)); - args.push_back(equal_func); + /// In case of named collection, just add key-value pairs "format='...', structure='...'" + /// at the end of arguments to override existed format and structure with "auto" values. + if (collection->getOrDefault("format", "auto") == "auto") + { + ASTs format_equal_func_args = {std::make_shared("format"), std::make_shared(format)}; + auto format_equal_func = makeASTFunction("equals", std::move(format_equal_func_args)); + args.push_back(format_equal_func); + } + if (collection->getOrDefault("structure", "auto") == "auto") + { + ASTs structure_equal_func_args = {std::make_shared("structure"), std::make_shared(structure)}; + auto structure_equal_func = makeASTFunction("equals", std::move(structure_equal_func_args)); + args.push_back(structure_equal_func); + } } else { @@ -256,23 +264,25 @@ void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & if (count == 0 || count > getMaxNumberOfArguments()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected 1 to {} arguments in table function, got {}", getMaxNumberOfArguments(), count); + auto format_literal = std::make_shared(format); auto structure_literal = std::make_shared(structure); - /// s3(s3_url) + /// s3(s3_url) -> s3(s3_url, format, structure) if (count == 1) { - /// Add format=auto before structure argument. - args.push_back(std::make_shared("auto")); + args.push_back(format_literal); args.push_back(structure_literal); } - /// s3(s3_url, format) or s3(s3_url, NOSIGN) + /// s3(s3_url, format) -> s3(s3_url, format, structure) or + /// s3(s3_url, NOSIGN) -> s3(s3_url, NOSIGN, format, structure) /// We can distinguish them by looking at the 2-nd argument: check if it's NOSIGN or not. else if (count == 2) { auto second_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); - /// If there is NOSIGN, add format=auto before structure. 
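On the SQL side, the two-argument cases distinguished here correspond to calls like the following (cluster name and bucket URL are placeholders); when format or structure resolve to 'auto', the detected values are substituted into the query before it is forwarded to the other cluster nodes:

SELECT * FROM s3Cluster('default', 'https://bucket.s3.amazonaws.com/data/*.parquet', NOSIGN);
SELECT * FROM s3Cluster('default', 'https://bucket.s3.amazonaws.com/data/*.parquet', 'Parquet');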
if (boost::iequals(second_arg, "NOSIGN")) - args.push_back(std::make_shared("auto")); + args.push_back(format_literal); + else if (second_arg == "auto") + args.back() = format_literal; args.push_back(structure_literal); } /// s3(source, format, structure) or @@ -282,18 +292,25 @@ void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & else if (count == 3) { auto second_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); + /// s3(source, NOSIGN, format) -> s3(source, NOSIGN, format, structure) if (boost::iequals(second_arg, "NOSIGN")) { + if (checkAndGetLiteralArgument(args[2], "format") == "auto") + args.back() = format_literal; args.push_back(structure_literal); } + /// s3(source, format, structure) else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) { - args[count - 1] = structure_literal; + if (second_arg == "auto") + args[1] = format_literal; + if (checkAndGetLiteralArgument(args[2], "structure") == "auto") + args[2] = structure_literal; } + /// s3(source, access_key_id, access_key_id) -> s3(source, access_key_id, access_key_id, format, structure) else { - /// Add format=auto before structure argument. - args.push_back(std::make_shared("auto")); + args.push_back(format_literal); args.push_back(structure_literal); } } @@ -304,16 +321,27 @@ void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & else if (count == 4) { auto second_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); + /// s3(source, NOSIGN, format, structure) if (boost::iequals(second_arg, "NOSIGN")) { - args[count - 1] = structure_literal; + if (checkAndGetLiteralArgument(args[2], "format") == "auto") + args[2] = format_literal; + if (checkAndGetLiteralArgument(args[3], "structure") == "auto") + args[3] = structure_literal; } + /// s3(source, format, structure, compression_method) else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) { - args[count - 2] = structure_literal; + if (second_arg == "auto") + args[1] = format_literal; + if (checkAndGetLiteralArgument(args[2], "structure") == "auto") + args[2] = structure_literal; } + /// s3(source, access_key_id, access_key_id, format) -> s3(source, access_key_id, access_key_id, format, structure) else { + if (checkAndGetLiteralArgument(args[3], "format") == "auto") + args[3] = format_literal; args.push_back(structure_literal); } } @@ -323,19 +351,30 @@ void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & else if (count == 5) { auto sedond_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); + /// s3(source, NOSIGN, format, structure, compression_method) if (boost::iequals(sedond_arg, "NOSIGN")) { - args[count - 2] = structure_literal; + if (checkAndGetLiteralArgument(args[2], "format") == "auto") + args[2] = format_literal; + if (checkAndGetLiteralArgument(args[3], "structure") == "auto") + args[3] = structure_literal; } + /// s3(source, access_key_id, access_key_id, format, structure) else { - args[count - 1] = structure_literal; + if (checkAndGetLiteralArgument(args[3], "format") == "auto") + args[3] = format_literal; + if (checkAndGetLiteralArgument(args[4], "structure") == "auto") + args[4] = structure_literal; } } /// s3(source, access_key_id, secret_access_key, format, structure, compression) else if (count == 6) { - args[count - 2] = structure_literal; + if (checkAndGetLiteralArgument(args[3], "format") == "auto") + args[3] = format_literal; + if (checkAndGetLiteralArgument(args[4], 
"structure") == "auto") + args[4] = structure_literal; } } } @@ -346,6 +385,9 @@ ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context, { context->checkAccess(getSourceAccessType()); configuration.update(context); + if (configuration.format == "auto") + return StorageS3::getTableStructureAndFormatFromData(configuration, std::nullopt, context).first; + return StorageS3::getTableStructureFromData(configuration, std::nullopt, context); } diff --git a/src/TableFunctions/TableFunctionS3.h b/src/TableFunctions/TableFunctionS3.h index fa73c1d313e..00ca36c6653 100644 --- a/src/TableFunctions/TableFunctionS3.h +++ b/src/TableFunctions/TableFunctionS3.h @@ -57,7 +57,7 @@ public: virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context); - static void addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context); + static void updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure, const String & format, const ContextPtr & context); protected: diff --git a/src/TableFunctions/TableFunctionS3Cluster.cpp b/src/TableFunctions/TableFunctionS3Cluster.cpp index ce96f7f580b..e727c4e4c89 100644 --- a/src/TableFunctions/TableFunctionS3Cluster.cpp +++ b/src/TableFunctions/TableFunctionS3Cluster.cpp @@ -21,9 +21,8 @@ StoragePtr TableFunctionS3Cluster::executeImpl( { StoragePtr storage; ColumnsDescription columns; - bool structure_argument_was_provided = configuration.structure != "auto"; - if (structure_argument_was_provided) + if (configuration.structure != "auto") { columns = parseColumnsListFromString(configuration.structure, context); } @@ -53,8 +52,7 @@ StoragePtr TableFunctionS3Cluster::executeImpl( StorageID(getDatabaseName(), table_name), columns, ConstraintsDescription{}, - context, - structure_argument_was_provided); + context); } storage->startup(); diff --git a/src/TableFunctions/TableFunctionURL.cpp b/src/TableFunctions/TableFunctionURL.cpp index aa535991d65..a78b2affa9a 100644 --- a/src/TableFunctions/TableFunctionURL.cpp +++ b/src/TableFunctions/TableFunctionURL.cpp @@ -55,7 +55,7 @@ void TableFunctionURL::parseArgumentsImpl(ASTs & args, const ContextPtr & contex format = configuration.format; if (format == "auto") - format = FormatFactory::instance().getFormatFromFileName(Poco::URI(filename).getPath(), true); + format = FormatFactory::instance().tryGetFormatFromFileName(Poco::URI(filename).getPath()).value_or("auto"); StorageURL::evalArgsAndCollectHeaders(args, configuration.headers, context); } @@ -78,15 +78,24 @@ void TableFunctionURL::parseArgumentsImpl(ASTs & args, const ContextPtr & contex } } -void TableFunctionURL::addColumnsStructureToArguments(ASTs & args, const String & desired_structure, const ContextPtr & context) +void TableFunctionURL::updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure_, const String & format_, const ContextPtr & context) { - if (tryGetNamedCollectionWithOverrides(args, context)) + if (auto collection = tryGetNamedCollectionWithOverrides(args, context)) { - /// In case of named collection, just add key-value pair "structure='...'" - /// at the end of arguments to override existed structure. 
- ASTs equal_func_args = {std::make_shared<ASTIdentifier>("structure"), std::make_shared<ASTLiteral>(desired_structure)}; - auto equal_func = makeASTFunction("equals", std::move(equal_func_args)); - args.push_back(equal_func); + /// In case of named collection, just add key-value pairs "format='...', structure='...'" + /// at the end of arguments to override existed format and structure with "auto" values. + if (collection->getOrDefault<String>("format", "auto") == "auto") + { + ASTs format_equal_func_args = {std::make_shared<ASTIdentifier>("format"), std::make_shared<ASTLiteral>(format_)}; + auto format_equal_func = makeASTFunction("equals", std::move(format_equal_func_args)); + args.push_back(format_equal_func); + } + if (collection->getOrDefault<String>("structure", "auto") == "auto") + { + ASTs structure_equal_func_args = {std::make_shared<ASTIdentifier>("structure"), std::make_shared<ASTLiteral>(structure_)}; + auto structure_equal_func = makeASTFunction("equals", std::move(structure_equal_func_args)); + args.push_back(structure_equal_func); + } } else { @@ -101,7 +110,7 @@ void TableFunctionURL::addColumnsStructureToArguments(ASTs & args, const String args.pop_back(); } - ITableFunctionFileLike::addColumnsStructureToArguments(args, desired_structure, context); + ITableFunctionFileLike::updateStructureAndFormatArgumentsIfNeeded(args, structure_, format_, context); if (headers_ast) args.push_back(headers_ast); @@ -131,6 +140,14 @@ ColumnsDescription TableFunctionURL::getActualTableStructure(ContextPtr context, if (structure == "auto") { context->checkAccess(getSourceAccessType()); + if (format == "auto") + return StorageURL::getTableStructureAndFormatFromData( + filename, + chooseCompressionMethod(Poco::URI(filename).getPath(), compression_method), + configuration.headers, + std::nullopt, + context).first; + return StorageURL::getTableStructureFromData(format, filename, chooseCompressionMethod(Poco::URI(filename).getPath(), compression_method), @@ -148,9 +165,9 @@ std::unordered_set<String> TableFunctionURL::getVirtualsToCheckBeforeUsingStructureHint return {virtual_column_names.begin(), virtual_column_names.end()}; } -String TableFunctionURL::getFormatFromFirstArgument() +std::optional<String> TableFunctionURL::tryGetFormatFromFirstArgument() { - return FormatFactory::instance().getFormatFromFileName(Poco::URI(filename).getPath(), true); + return FormatFactory::instance().tryGetFormatFromFileName(Poco::URI(filename).getPath()); } void registerTableFunctionURL(TableFunctionFactory & factory) diff --git a/src/TableFunctions/TableFunctionURL.h b/src/TableFunctions/TableFunctionURL.h index bf417f950c0..54e223283ba 100644 --- a/src/TableFunctions/TableFunctionURL.h +++ b/src/TableFunctions/TableFunctionURL.h @@ -34,7 +34,7 @@ public: ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; - static void addColumnsStructureToArguments(ASTs & args, const String & desired_structure, const ContextPtr & context); + static void updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure_, const String & format_, const ContextPtr & context); std::unordered_set<String> getVirtualsToCheckBeforeUsingStructureHint() const override; @@ -53,8 +53,7 @@ private: const char * getStorageTypeName() const override { return "URL"; } - String getFormatFromFirstArgument() override; - + std::optional<String> tryGetFormatFromFirstArgument() override; }; } diff --git a/src/TableFunctions/TableFunctionURLCluster.cpp b/src/TableFunctions/TableFunctionURLCluster.cpp index a2949278155..5fd3c3342a5 100644 --- a/src/TableFunctions/TableFunctionURLCluster.cpp +++
b/src/TableFunctions/TableFunctionURLCluster.cpp @@ -40,8 +40,7 @@ StoragePtr TableFunctionURLCluster::getStorage( StorageID(getDatabaseName(), table_name), getActualTableStructure(context, /* is_insert_query */ true), ConstraintsDescription{}, - configuration, - structure != "auto"); + configuration); } return storage; } diff --git a/tests/analyzer_integration_broken_tests.txt b/tests/analyzer_integration_broken_tests.txt index b2576bfdb2e..6cf5d3b6008 100644 --- a/tests/analyzer_integration_broken_tests.txt +++ b/tests/analyzer_integration_broken_tests.txt @@ -1,16 +1,9 @@ test_build_sets_from_multiple_threads/test.py::test_set test_concurrent_backups_s3/test.py::test_concurrent_backups -test_distributed_backward_compatability/test.py::test_distributed_in_tuple test_distributed_type_object/test.py::test_distributed_type_object test_executable_table_function/test.py::test_executable_function_input_python test_mask_sensitive_info/test.py::test_encryption_functions test_merge_table_over_distributed/test.py::test_global_in test_merge_table_over_distributed/test.py::test_select_table_name_from_merge_over_distributed -test_mutations_with_merge_tree/test.py::test_mutations_with_merge_background_task test_passing_max_partitions_to_read_remotely/test.py::test_default_database_on_cluster test_select_access_rights/test_main.py::test_alias_columns -test_settings_profile/test.py::test_show_profiles -test_shard_level_const_function/test.py::test_remote -test_sql_user_defined_functions_on_cluster/test.py::test_sql_user_defined_functions_on_cluster -test_storage_rabbitmq/test.py::test_rabbitmq_materialized_view -test_wrong_db_or_table_name/test.py::test_wrong_table_name diff --git a/tests/analyzer_tech_debt.txt b/tests/analyzer_tech_debt.txt index 29331d674c8..bbbb09bfd68 100644 --- a/tests/analyzer_tech_debt.txt +++ b/tests/analyzer_tech_debt.txt @@ -1,4 +1,4 @@ -00223_shard_distributed_aggregation_memory_efficient +00223_shard_distributed_aggregation_memory_efficien 00717_merge_and_distributed 00725_memory_tracking 01062_pm_all_join_with_block_continuation @@ -7,18 +7,14 @@ 01244_optimize_distributed_group_by_sharding_key 01584_distributed_buffer_cannot_find_column 01624_soft_constraints -01656_test_query_log_factories_info 01747_join_view_filter_dictionary 01761_cast_to_enum_nullable 01925_join_materialized_columns 01952_optimize_distributed_group_by_sharding_key 02174_cte_scalar_cache_mv 02354_annoy -02428_parameterized_view 02493_inconsistent_hex_and_binary_number 02725_agg_projection_resprect_PK -02763_row_policy_storage_merge_alias -02818_parameterized_view_with_cte_multiple_usage # Check after constants refactoring 02901_parallel_replicas_rollup # Flaky. 
Please don't delete them without fixing them: diff --git a/tests/ci/bugfix_validate_check.py b/tests/ci/bugfix_validate_check.py index 7fda81f11b2..ae7fce1f102 100644 --- a/tests/ci/bugfix_validate_check.py +++ b/tests/ci/bugfix_validate_check.py @@ -1,28 +1,28 @@ #!/usr/bin/env python3 -import argparse +from pathlib import Path +import subprocess +import sys +from typing import List, Sequence, Tuple import csv import logging -from pathlib import Path -from typing import List, Optional, Tuple -# isort: off -from github import Github - -# isort: on - -from commit_status_helper import get_commit, post_commit_status -from get_robot_token import get_best_robot_token -from pr_info import PRInfo -from report import ERROR, SUCCESS, TestResult, TestResults -from s3_helper import S3Helper -from upload_result_helper import upload_results - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser() - parser.add_argument("files", nargs="+", type=Path, help="Path to status files") - return parser.parse_args() +from report import ( + ERROR, + FAILURE, + SKIPPED, + SUCCESS, + FAIL, + OK, + TestResult, + TestResults, + JobReport, +) +from env_helper import TEMP_PATH +from stopwatch import Stopwatch +from ci_config import JobNames +from ci_utils import normalize_string +from functional_test_check import NO_CHANGES_MSG def post_commit_status_from_file(file_path: Path) -> List[str]: @@ -35,93 +35,123 @@ def post_commit_status_from_file(file_path: Path) -> List[str]: return res[0] -# Returns (is_ok, test_results, error_message) -def process_result(file_path: Path) -> Tuple[bool, TestResults, Optional[str]]: - test_results = [] # type: TestResults - state, report_url, description = post_commit_status_from_file(file_path) - prefix = file_path.parent.name - if description.strip() in [ - "Invalid check_status.tsv", - "Not found test_results.tsv", - "Empty test_results.tsv", - ]: - status = ( - f'Check failed (Report)' - if report_url != "null" - else "Check failed" - ) - return False, [TestResult(f"{prefix}: {description}", status)], "Check failed" - - is_ok = state == SUCCESS - if is_ok and report_url == "null": - return is_ok, test_results, None - - status = ( - f'OK: Bug reproduced (Report)' - if is_ok - else f'Bug is not reproduced (Report)' - ) - test_results.append(TestResult(f"{prefix}: {description}", status)) - return is_ok, test_results, None +def get_failed_test_cases(file_path: Path) -> List[TestResult]: + job_report = JobReport.load(from_file=file_path) + test_results = [] # type: List[TestResult] + for tr in job_report.test_results: + if tr.status == FAIL: + if tr.name == NO_CHANGES_MSG: + tr.status = SKIPPED + else: + tr.name = "[with NOT_OK] " + tr.name + tr.status = OK + elif tr.status == OK: + tr.name = "[with NOT_OK] " + tr.name + tr.status = FAIL + else: + # do not invert error status + pass + test_results.append(tr) + return test_results def process_all_results( - file_paths: List[Path], -) -> Tuple[bool, TestResults, Optional[str]]: - any_ok = False - all_results = [] - error = None - for status_path in file_paths: - is_ok, test_results, error = process_result(status_path) - any_ok = any_ok or is_ok - if test_results is not None: - all_results.extend(test_results) + file_paths: Sequence[Path], +) -> Tuple[str, str, TestResults]: + all_results = [] # type: TestResults + has_fail = False + has_error = False + has_ok = False + for job_report_path in file_paths: + test_results = get_failed_test_cases(job_report_path) + for tr in test_results: + if tr.status == FAIL: + 
has_fail = True + elif tr.status == ERROR: + has_error = True + elif tr.status == OK: + has_ok = True + all_results.extend(test_results) + if has_error: + status = ERROR + description = "Some error(s) occured in tests" + elif has_ok: + status = SUCCESS + description = "New test(s) reproduced a bug" + elif has_fail: + status = FAILURE + description = "New test(s) failed to reproduce a bug" + else: + status = ERROR + description = "Invalid job results" - return any_ok and error is None, all_results, error + return status, description, all_results def main(): logging.basicConfig(level=logging.INFO) - args = parse_args() - status_files = args.files # type: List[Path] + # args = parse_args() + stopwatch = Stopwatch() + jobs_to_validate = [JobNames.STATELESS_TEST_RELEASE, JobNames.INTEGRATION_TEST] + functional_job_report_file = Path(TEMP_PATH) / "functional_test_job_report.json" + integration_job_report_file = Path(TEMP_PATH) / "integration_test_job_report.json" + jobs_report_files = { + JobNames.STATELESS_TEST_RELEASE: functional_job_report_file, + JobNames.INTEGRATION_TEST: integration_job_report_file, + } + jobs_scripts = { + JobNames.STATELESS_TEST_RELEASE: "functional_test_check.py", + JobNames.INTEGRATION_TEST: "integration_test_check.py", + } - check_name_with_group = "Bugfix validate check" - - is_ok, test_results, error = process_all_results(status_files) - - description = "" - if error: - description = error - elif not is_ok: - description = "Changed tests don't reproduce the bug" - - pr_info = PRInfo() - if not test_results: - description = "No results to upload" - report_url = "" - logging.info("No results to upload") - else: - report_url = upload_results( - S3Helper(), - pr_info.number, - pr_info.sha, - test_results, - status_files, - check_name_with_group, + for test_job in jobs_to_validate: + report_file = jobs_report_files[test_job] + test_script = jobs_scripts[test_job] + if report_file.exists(): + report_file.unlink() + extra_timeout_option = "" + if test_job == JobNames.STATELESS_TEST_RELEASE: + extra_timeout_option = str(3600) + # "bugfix" must be present in checkname, as integration test runner checks this + check_name = f"Validate bugfix: {test_job}" + command = f"python3 {test_script} '{check_name}' {extra_timeout_option} --validate-bugfix --report-to-file {report_file}" + print(f"Going to validate job [{test_job}], command [{command}]") + _ = subprocess.run( + command, + stdout=sys.stdout, + stderr=sys.stderr, + text=True, + check=False, + shell=True, ) + assert ( + report_file.is_file() + ), f"No job report [{report_file}] found after job execution" - gh = Github(get_best_robot_token(), per_page=100) - commit = get_commit(gh, pr_info.sha) - post_commit_status( - commit, - SUCCESS if is_ok else ERROR, - report_url, - description, - check_name_with_group, - pr_info, - dump_to_file=True, + status, description, test_results = process_all_results( + list(jobs_report_files.values()) ) + additional_files = [] + for job_id, report_file in jobs_report_files.items(): + jr = JobReport.load(from_file=report_file) + additional_files.append(report_file) + for file in set(jr.additional_files): + file_ = Path(file) + file_name = file_.name + file_name = file_name.replace(".", "__" + normalize_string(job_id) + ".", 1) + file_ = file_.rename(file_.parent / file_name) + additional_files.append(file_) + + JobReport( + description=description, + test_results=test_results, + status=status, + start_time=stopwatch.start_time_str, + duration=stopwatch.duration_seconds, + 
additional_files=additional_files, + ).dump() + if __name__ == "__main__": main() diff --git a/tests/ci/ci.py b/tests/ci/ci.py index 47e20b3ec09..320a0ef42d5 100644 --- a/tests/ci/ci.py +++ b/tests/ci/ci.py @@ -6,12 +6,13 @@ from enum import Enum import json import logging import os +import random import re import subprocess import sys import time from pathlib import Path -from typing import Any, Dict, List, Optional, Sequence, Union +from typing import Any, Dict, List, Optional, Sequence, Set, Union import docker_images_helper import upload_result_helper @@ -1107,6 +1108,7 @@ def _configure_jobs( ci_cache.print_status() jobs_to_wait: Dict[str, Dict[str, Any]] = {} + randomization_buckets = {} # type: Dict[str, Set[str]] for job in digests: digest = digests[job] @@ -1115,11 +1117,18 @@ def _configure_jobs( batches_to_do: List[int] = [] add_to_skip = False + if job_config.pr_only and pr_info.is_release_branch(): + continue + if job_config.release_only and not pr_info.is_release_branch(): + continue + + # fill job randomization buckets (for jobs with configured @random_bucket property)) + if job_config.random_bucket: + if not job_config.random_bucket in randomization_buckets: + randomization_buckets[job_config.random_bucket] = set() + randomization_buckets[job_config.random_bucket].add(job) + for batch in range(num_batches): # type: ignore - if job_config.pr_only and pr_info.is_release_branch(): - continue - if job_config.release_only and not pr_info.is_release_branch(): - continue if job_config.run_by_label: # this job controlled by label, add to todo if its label is set in pr if job_config.run_by_label in pr_info.labels: @@ -1167,6 +1176,24 @@ def _configure_jobs( "num_batches": num_batches, } + if not pr_info.is_release_branch(): + # randomization bucket filtering (pick one random job from each bucket, for jobs with configured random_bucket property) + for _, jobs in randomization_buckets.items(): + jobs_to_remove_randomization = set() + bucket_ = list(jobs) + random.shuffle(bucket_) + while len(bucket_) > 1: + random_job = bucket_.pop() + if random_job in jobs_to_do: + jobs_to_remove_randomization.add(random_job) + if jobs_to_remove_randomization: + print( + f"Following jobs will be removed due to randomization bucket: [{jobs_to_remove_randomization}]" + ) + jobs_to_do = [ + job for job in jobs_to_do if job not in jobs_to_remove_randomization + ] + ## c. 
check CI controlling labels and commit messages if pr_info.labels: jobs_requested_by_label = [] # type: List[str] @@ -1284,10 +1311,13 @@ def _update_gh_statuses_action(indata: Dict, s3: S3Helper) -> None: if CI_CONFIG.is_build_job(job): # no GH status for build jobs continue - num_batches = CI_CONFIG.get_job_config(job).num_batches - for batch in range(num_batches): + job_config = CI_CONFIG.get_job_config(job) + if not job_config: + # there might be a new job that does not exist on this branch - skip it + continue + for batch in range(job_config.num_batches): future = executor.submit( - _concurrent_create_status, job, batch, num_batches + _concurrent_create_status, job, batch, job_config.num_batches ) futures.append(future) done, _ = concurrent.futures.wait(futures) @@ -1639,13 +1669,7 @@ def main() -> int: if not args.skip_jobs: ci_cache = CiCache(s3, jobs_data["digests"]) - if ( - pr_info.is_release_branch() - or pr_info.event.get("pull_request", {}) - .get("user", {}) - .get("login", "not_maxknv") - == "maxknv" - ): + if pr_info.is_release_branch(): # wait for pending jobs to be finished, await_jobs is a long blocking call # wait pending jobs (for now only on release/master branches) ready_jobs_batches_dict = ci_cache.await_jobs( @@ -1835,7 +1859,7 @@ def main() -> int: pr_info.sha, job_report.test_results, job_report.additional_files, - job_report.check_name or args.job_name, + job_report.check_name or _get_ext_check_name(args.job_name), additional_urls=additional_urls or None, ) commit = get_commit( @@ -1846,7 +1870,7 @@ def main() -> int: job_report.status, check_url, format_description(job_report.description), - job_report.check_name or args.job_name, + job_report.check_name or _get_ext_check_name(args.job_name), pr_info, dump_to_file=True, ) @@ -1864,7 +1888,7 @@ def main() -> int: job_report.duration, job_report.start_time, check_url or "", - job_report.check_name or args.job_name, + job_report.check_name or _get_ext_check_name(args.job_name), ) ch_helper.insert_events_into( db="default", table="checks", events=prepared_events diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index db5a83d5b96..aaba9f9cfc9 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -141,7 +141,7 @@ class JobNames(metaclass=WithIter): BUILD_CHECK_SPECIAL = "ClickHouse special build check" DOCS_CHECK = "Docs check" - BUGFIX_VALIDATE = "tests bugfix validate check" + BUGFIX_VALIDATE = "Bugfix validation" # dynamically update JobName with Build jobs @@ -198,6 +198,8 @@ class JobConfig: pr_only: bool = False # job is for release/master branches only release_only: bool = False + # to randomly pick and run one job among jobs in the same @random_bucket. Applied in PR branches only. 
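The random_bucket selection added to _configure_jobs above boils down to: group the scheduled jobs by bucket name, shuffle each group, and keep only one survivor per bucket on PR branches. A standalone Python sketch of that selection (the function and variable names here are illustrative, not the actual CI helpers):

    import random

    def pick_one_per_bucket(jobs_to_do, buckets):
        """Keep one randomly chosen job per bucket; drop the other scheduled jobs (sketch)."""
        to_remove = set()
        for bucket_jobs in buckets.values():
            candidates = list(bucket_jobs)
            random.shuffle(candidates)
            # everything except the single remaining survivor is dropped, mirroring the while-loop above
            to_remove.update(job for job in candidates[:-1] if job in jobs_to_do)
        return [job for job in jobs_to_do if job not in to_remove]

    # e.g. only one sanitizer flavour of the stress suite would survive per PR run:
    # pick_one_per_bucket(jobs, {"stress_with_sanitizer": {"Stress test (asan)", "Stress test (msan)", "Stress test (ubsan)"}})
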
+ random_bucket: str = "" @dataclass @@ -282,7 +284,6 @@ class BuildReportConfig: @dataclass class TestConfig: required_build: str - force_tests: bool = False job_config: JobConfig = field(default_factory=JobConfig) @@ -302,6 +303,7 @@ install_check_digest = DigestConfig( ) stateless_check_digest = DigestConfig( include_paths=[ + "./tests/ci/functional_test_check.py", "./tests/queries/0_stateless/", "./tests/clickhouse-test", "./tests/config", @@ -312,6 +314,7 @@ stateless_check_digest = DigestConfig( ) stateful_check_digest = DigestConfig( include_paths=[ + "./tests/ci/functional_test_check.py", "./tests/queries/1_stateful/", "./tests/clickhouse-test", "./tests/config", @@ -473,9 +476,6 @@ class CIConfig: if check_name in config: # type: ignore res = config[check_name].job_config # type: ignore break - assert ( - res is not None - ), f"Invalid check_name or CI_CONFIG outdated, config not found for [{check_name}]" return res # type: ignore @staticmethod @@ -639,16 +639,8 @@ CI_CONFIG = CIConfig( Labels.CI_SET_INTEGRATION: LabelConfig( run_jobs=[ JobNames.STYLE_CHECK, - Build.PACKAGE_ASAN, Build.PACKAGE_RELEASE, - Build.PACKAGE_TSAN, - Build.PACKAGE_AARCH64, - JobNames.INTEGRATION_TEST_ASAN, - JobNames.INTEGRATION_TEST_ARM, JobNames.INTEGRATION_TEST, - JobNames.INTEGRATION_TEST_ASAN_ANALYZER, - JobNames.INTEGRATION_TEST_TSAN, - JobNames.INTEGRATION_TEST_FLAKY, ] ), Labels.CI_SET_REDUCED: LabelConfig( @@ -890,7 +882,9 @@ CI_CONFIG = CIConfig( JobNames.BUGFIX_VALIDATE: TestConfig( "", # we run this check by label - no digest required - job_config=JobConfig(run_by_label="pr-bugfix"), + job_config=JobConfig( + run_by_label="pr-bugfix", run_command="bugfix_validate_check.py" + ), ), }, test_configs={ @@ -932,16 +926,16 @@ CI_CONFIG = CIConfig( Build.PACKAGE_DEBUG, job_config=JobConfig(**stateful_test_common_params) # type: ignore ), JobNames.STATEFUL_TEST_PARALLEL_REPL_ASAN: TestConfig( - Build.PACKAGE_ASAN, job_config=JobConfig(**stateful_test_common_params) # type: ignore + Build.PACKAGE_ASAN, job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore ), JobNames.STATEFUL_TEST_PARALLEL_REPL_MSAN: TestConfig( - Build.PACKAGE_MSAN, job_config=JobConfig(**stateful_test_common_params) # type: ignore + Build.PACKAGE_MSAN, job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore ), JobNames.STATEFUL_TEST_PARALLEL_REPL_UBSAN: TestConfig( - Build.PACKAGE_UBSAN, job_config=JobConfig(**stateful_test_common_params) # type: ignore + Build.PACKAGE_UBSAN, job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore ), JobNames.STATEFUL_TEST_PARALLEL_REPL_TSAN: TestConfig( - Build.PACKAGE_TSAN, job_config=JobConfig(**stateful_test_common_params) # type: ignore + Build.PACKAGE_TSAN, job_config=JobConfig(random_bucket="parrepl_with_sanitizer", **stateful_test_common_params) # type: ignore ), # End stateful tests for parallel replicas JobNames.STATELESS_TEST_ASAN: TestConfig( @@ -993,29 +987,29 @@ CI_CONFIG = CIConfig( Build.PACKAGE_TSAN, job_config=JobConfig(num_batches=5, **statless_test_common_params), # type: ignore ), - JobNames.STRESS_TEST_ASAN: TestConfig( - Build.PACKAGE_ASAN, job_config=JobConfig(**stress_test_common_params) # type: ignore + JobNames.STRESS_TEST_DEBUG: TestConfig( + Build.PACKAGE_DEBUG, job_config=JobConfig(**stress_test_common_params) # type: ignore ), JobNames.STRESS_TEST_TSAN: TestConfig( Build.PACKAGE_TSAN, 
job_config=JobConfig(**stress_test_common_params) # type: ignore ), + JobNames.STRESS_TEST_ASAN: TestConfig( + Build.PACKAGE_ASAN, job_config=JobConfig(random_bucket="stress_with_sanitizer", **stress_test_common_params) # type: ignore + ), JobNames.STRESS_TEST_UBSAN: TestConfig( - Build.PACKAGE_UBSAN, job_config=JobConfig(**stress_test_common_params) # type: ignore + Build.PACKAGE_UBSAN, job_config=JobConfig(random_bucket="stress_with_sanitizer", **stress_test_common_params) # type: ignore ), JobNames.STRESS_TEST_MSAN: TestConfig( - Build.PACKAGE_MSAN, job_config=JobConfig(**stress_test_common_params) # type: ignore - ), - JobNames.STRESS_TEST_DEBUG: TestConfig( - Build.PACKAGE_DEBUG, job_config=JobConfig(**stress_test_common_params) # type: ignore + Build.PACKAGE_MSAN, job_config=JobConfig(random_bucket="stress_with_sanitizer", **stress_test_common_params) # type: ignore ), JobNames.UPGRADE_TEST_ASAN: TestConfig( - Build.PACKAGE_ASAN, job_config=JobConfig(pr_only=True, **upgrade_test_common_params) # type: ignore + Build.PACKAGE_ASAN, job_config=JobConfig(pr_only=True, random_bucket="upgrade_with_sanitizer", **upgrade_test_common_params) # type: ignore ), JobNames.UPGRADE_TEST_TSAN: TestConfig( - Build.PACKAGE_TSAN, job_config=JobConfig(pr_only=True, **upgrade_test_common_params) # type: ignore + Build.PACKAGE_TSAN, job_config=JobConfig(pr_only=True, random_bucket="upgrade_with_sanitizer", **upgrade_test_common_params) # type: ignore ), JobNames.UPGRADE_TEST_MSAN: TestConfig( - Build.PACKAGE_MSAN, job_config=JobConfig(pr_only=True, **upgrade_test_common_params) # type: ignore + Build.PACKAGE_MSAN, job_config=JobConfig(pr_only=True, random_bucket="upgrade_with_sanitizer", **upgrade_test_common_params) # type: ignore ), JobNames.UPGRADE_TEST_DEBUG: TestConfig( Build.PACKAGE_DEBUG, job_config=JobConfig(pr_only=True, **upgrade_test_common_params) # type: ignore @@ -1173,10 +1167,10 @@ CHECK_DESCRIPTIONS = [ lambda x: x.startswith("AST fuzzer"), ), CheckDescription( - "Bugfix validate check", + JobNames.BUGFIX_VALIDATE, "Checks that either a new test (functional or integration) or there " "some changed tests that fail with the binary built on master branch", - lambda x: x == "Bugfix validate check", + lambda x: x == JobNames.BUGFIX_VALIDATE, ), CheckDescription( "CI running", diff --git a/tests/ci/clickbench.py b/tests/ci/clickbench.py index f8707cbcff7..50c7bb85d28 100644 --- a/tests/ci/clickbench.py +++ b/tests/ci/clickbench.py @@ -10,14 +10,15 @@ from pathlib import Path from typing import List, Tuple from build_download_helper import download_all_deb_packages -from clickhouse_helper import CiLogsCredentials -from commit_status_helper import override_status -from docker_images_helper import DockerImage, get_docker_image, pull_image -from env_helper import REPORT_PATH, TEMP_PATH -from pr_info import FORCE_TESTS_LABEL, PRInfo -from report import ERROR, SUCCESS, JobReport, StatusType, TestResults +from clickhouse_helper import ( + CiLogsCredentials, +) +from docker_images_helper import get_docker_image, pull_image, DockerImage +from env_helper import TEMP_PATH, REPORT_PATH +from pr_info import PRInfo from stopwatch import Stopwatch from tee_popen import TeePopen +from report import ERROR, SUCCESS, JobReport, StatusType, TestResults def get_image_name() -> str: @@ -164,7 +165,6 @@ def main(): state, description, test_results, additional_logs = process_results( result_path, server_log_path ) - state = override_status(state, check_name) JobReport( description=description, @@ -176,10 +176,7 @@ def 
main(): ).dump() if state != SUCCESS: - if FORCE_TESTS_LABEL in pr_info.labels: - print(f"'{FORCE_TESTS_LABEL}' enabled, will report success") - else: - sys.exit(1) + sys.exit(1) if __name__ == "__main__": diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index 8a34d375d1e..b7128e36434 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -18,9 +18,7 @@ from github.GithubObject import NotSet from github.IssueComment import IssueComment from github.Repository import Repository -# isort: on - -from ci_config import CHECK_DESCRIPTIONS, CI_CONFIG, REQUIRED_CHECKS, CheckDescription +from ci_config import REQUIRED_CHECKS, CHECK_DESCRIPTIONS, CheckDescription from env_helper import GITHUB_JOB_URL, GITHUB_REPOSITORY, TEMP_PATH from pr_info import SKIP_MERGEABLE_CHECK_LABEL, PRInfo from report import ( @@ -67,21 +65,6 @@ class RerunHelper: return None -def override_status( - status: StatusType, check_name: str, invert: bool = False -) -> StatusType: - test_config = CI_CONFIG.test_configs.get(check_name) - if test_config and test_config.force_tests: - return SUCCESS - - if invert: - if status == SUCCESS: - return ERROR - return SUCCESS - - return status - - def get_commit(gh: Github, commit_sha: str, retry_count: int = RETRY) -> Commit: for i in range(retry_count): try: diff --git a/tests/ci/fast_test_check.py b/tests/ci/fast_test_check.py index e483e9d4ac2..5d528bb4c48 100644 --- a/tests/ci/fast_test_check.py +++ b/tests/ci/fast_test_check.py @@ -10,7 +10,7 @@ from typing import Tuple from docker_images_helper import DockerImage, get_docker_image, pull_image from env_helper import REPO_COPY, S3_BUILDS_BUCKET, TEMP_PATH -from pr_info import FORCE_TESTS_LABEL, PRInfo +from pr_info import PRInfo from report import ( ERROR, FAILURE, @@ -190,13 +190,7 @@ def main(): # Refuse other checks to run if fast test failed if state != SUCCESS: - if state == ERROR: - print("The status is 'error', report failure disregard the labels") - sys.exit(1) - elif FORCE_TESTS_LABEL in pr_info.labels: - print(f"'{FORCE_TESTS_LABEL}' enabled, reporting success") - else: - sys.exit(1) + sys.exit(1) if __name__ == "__main__": diff --git a/tests/ci/functional_test_check.py b/tests/ci/functional_test_check.py index e230aa5a679..da2dea60fc1 100644 --- a/tests/ci/functional_test_check.py +++ b/tests/ci/functional_test_check.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import argparse -import atexit import csv import logging import os @@ -11,34 +10,16 @@ import sys from pathlib import Path from typing import List, Tuple -# isort: off -from github import Github - -# isort: on - from build_download_helper import download_all_deb_packages -from clickhouse_helper import ( - CiLogsCredentials, - ClickHouseHelper, - prepare_tests_results_for_clickhouse, -) -from commit_status_helper import ( - get_commit, - override_status, - post_commit_status, - post_commit_status_to_file, - update_mergeable_check, -) -from docker_images_helper import DockerImage, get_docker_image, pull_image +from clickhouse_helper import CiLogsCredentials + +from docker_images_helper import DockerImage, pull_image, get_docker_image from download_release_packages import download_last_release -from env_helper import REPO_COPY, REPORT_PATH, TEMP_PATH -from get_robot_token import get_best_robot_token -from pr_info import FORCE_TESTS_LABEL, PRInfo -from report import ERROR, SUCCESS, StatusType, TestResults, read_test_results -from s3_helper import S3Helper +from env_helper import REPORT_PATH, TEMP_PATH, 
REPO_COPY +from pr_info import PRInfo +from report import ERROR, SUCCESS, JobReport, StatusType, TestResults, read_test_results from stopwatch import Stopwatch from tee_popen import TeePopen -from upload_result_helper import upload_results NO_CHANGES_MSG = "Nothing to run" @@ -130,7 +111,7 @@ def get_run_command( ) -def get_tests_to_run(pr_info: PRInfo) -> List[str]: +def _get_statless_tests_to_run(pr_info: PRInfo) -> List[str]: result = set() if pr_info.changed_files is None: @@ -213,10 +194,10 @@ def parse_args(): help="Check that added tests failed on latest stable", ) parser.add_argument( - "--post-commit-status", - default="commit_status", - choices=["commit_status", "file"], - help="Where to public post commit status", + "--report-to-file", + type=str, + default="", + help="Path to write script report to (for --validate-bugfix)", ) return parser.parse_args() @@ -232,7 +213,6 @@ def main(): reports_path.mkdir(parents=True, exist_ok=True) repo_path = Path(REPO_COPY) - post_commit_path = temp_path / "functional_commit_status.tsv" args = parse_args() check_name = args.check_name or os.getenv("CHECK_NAME") @@ -249,62 +229,20 @@ def main(): flaky_check = "flaky" in check_name.lower() run_changed_tests = flaky_check or validate_bugfix_check - - # For validate_bugfix_check we need up to date information about labels, so pr_event_from_api is used - pr_info = PRInfo( - need_changed_files=run_changed_tests, pr_event_from_api=validate_bugfix_check - ) - - # FIXME: move to job report and remove - gh = Github(get_best_robot_token(), per_page=100) - commit = get_commit(gh, pr_info.sha) - atexit.register(update_mergeable_check, commit, pr_info, check_name) - - if validate_bugfix_check and "pr-bugfix" not in pr_info.labels: - if args.post_commit_status == "file": - post_commit_status_to_file( - post_commit_path, - f"Skipped (no pr-bugfix in {pr_info.labels})", - SUCCESS, - "null", - ) - logging.info("Skipping '%s' (no pr-bugfix in %s)", check_name, pr_info.labels) - sys.exit(0) + pr_info = PRInfo(need_changed_files=run_changed_tests) + tests_to_run = [] + if run_changed_tests: + assert ( + args.report_to_file + ), "JobReport file path must be provided with --validate-bugfix" + tests_to_run = _get_statless_tests_to_run(pr_info) if "RUN_BY_HASH_NUM" in os.environ: run_by_hash_num = int(os.getenv("RUN_BY_HASH_NUM", "0")) run_by_hash_total = int(os.getenv("RUN_BY_HASH_TOTAL", "0")) - check_name_with_group = ( - check_name + f" [{run_by_hash_num + 1}/{run_by_hash_total}]" - ) else: run_by_hash_num = 0 run_by_hash_total = 0 - check_name_with_group = check_name - - tests_to_run = [] - if run_changed_tests: - tests_to_run = get_tests_to_run(pr_info) - if not tests_to_run: - state = override_status(SUCCESS, check_name, validate_bugfix_check) - if args.post_commit_status == "commit_status": - post_commit_status( - commit, - state, - "", - NO_CHANGES_MSG, - check_name_with_group, - pr_info, - dump_to_file=True, - ) - elif args.post_commit_status == "file": - post_commit_status_to_file( - post_commit_path, - description=NO_CHANGES_MSG, - state=state, - report_url="null", - ) - sys.exit(0) image_name = get_image_name(check_name) @@ -338,91 +276,65 @@ def main(): pr_info, stopwatch.start_time_str, check_name ) - run_command = get_run_command( - check_name, - packages_path, - repo_path, - result_path, - server_log_path, - kill_timeout, - additional_envs, - ci_logs_args, - docker_image, - flaky_check, - tests_to_run, - ) - logging.info("Going to run func tests: %s", run_command) - - with TeePopen(run_command, 
run_log_path) as process: - retcode = process.wait() - if retcode == 0: - logging.info("Run successfully") - else: - logging.info("Run failed") - - try: - subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True) - except subprocess.CalledProcessError: - logging.warning("Failed to change files owner in %s, ignoring it", temp_path) - - ci_logs_credentials.clean_ci_logs_from_credentials(run_log_path) - s3_helper = S3Helper() - - state, description, test_results, additional_logs = process_results( - result_path, server_log_path - ) - state = override_status(state, check_name, invert=validate_bugfix_check) - - ch_helper = ClickHouseHelper() - - report_url = upload_results( - s3_helper, - pr_info.number, - pr_info.sha, - test_results, - [run_log_path] + additional_logs, - check_name_with_group, - ) - - print(f"::notice:: {check_name} Report url: {report_url}") - if args.post_commit_status == "commit_status": - post_commit_status( - commit, - state, - report_url, - description, - check_name_with_group, - pr_info, - dump_to_file=True, + if (not validate_bugfix_check and not flaky_check) or tests_to_run: + run_command = get_run_command( + check_name, + packages_path, + repo_path, + result_path, + server_log_path, + kill_timeout, + additional_envs, + ci_logs_args, + docker_image, + flaky_check, + tests_to_run, ) - elif args.post_commit_status == "file": - post_commit_status_to_file( - post_commit_path, - description, - state, - report_url, + logging.info("Going to run func tests: %s", run_command) + + with TeePopen(run_command, run_log_path) as process: + retcode = process.wait() + if retcode == 0: + logging.info("Run successfully") + else: + logging.info("Run failed") + + try: + subprocess.check_call( + f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True + ) + except subprocess.CalledProcessError: + logging.warning( + "Failed to change files owner in %s, ignoring it", temp_path + ) + + ci_logs_credentials.clean_ci_logs_from_credentials(run_log_path) + + state, description, test_results, additional_logs = process_results( + result_path, server_log_path ) else: - raise Exception( - f'Unknown post_commit_status option "{args.post_commit_status}"' + print( + "This is validate bugfix or flaky check run, but no changes test to run - skip with success" + ) + state, description, test_results, additional_logs = ( + SUCCESS, + "No tests to run", + [], + [], ) - prepared_events = prepare_tests_results_for_clickhouse( - pr_info, - test_results, - state, - stopwatch.duration_seconds, - stopwatch.start_time_str, - report_url, - check_name_with_group, - ) - ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) + JobReport( + description=description, + test_results=test_results, + status=state, + start_time=stopwatch.start_time_str, + duration=stopwatch.duration_seconds, + additional_files=additional_logs, + ).dump(to_file=args.report_to_file if args.report_to_file else None) if state != SUCCESS: - if FORCE_TESTS_LABEL in pr_info.labels: - print(f"'{FORCE_TESTS_LABEL}' enabled, will report success") - else: - sys.exit(1) + sys.exit(1) if __name__ == "__main__": diff --git a/tests/ci/integration_test_check.py b/tests/ci/integration_test_check.py index 5af4d5e625b..751abf617fa 100644 --- a/tests/ci/integration_test_check.py +++ b/tests/ci/integration_test_check.py @@ -5,38 +5,27 @@ import csv import json import logging import os -import subprocess import sys from pathlib import Path from typing import Dict, List, Tuple from build_download_helper import 
download_all_deb_packages -from clickhouse_helper import ClickHouseHelper, prepare_tests_results_for_clickhouse -from commit_status_helper import ( - get_commit, - override_status, - post_commit_status, - post_commit_status_to_file, -) from docker_images_helper import DockerImage, get_docker_image from download_release_packages import download_last_release from env_helper import REPO_COPY, REPORT_PATH, TEMP_PATH -from get_robot_token import get_best_robot_token -from github_helper import GitHub from integration_test_images import IMAGES from pr_info import PRInfo from report import ( ERROR, SUCCESS, StatusType, + JobReport, TestResult, TestResults, read_test_results, ) -from s3_helper import S3Helper from stopwatch import Stopwatch from tee_popen import TeePopen -from upload_result_helper import upload_results def get_json_params_dict( @@ -131,16 +120,19 @@ def process_results( def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("check_name") + parser.add_argument( + "--run-tests", nargs="*", help="List of tests to run", default=None + ) parser.add_argument( "--validate-bugfix", action="store_true", help="Check that added tests failed on latest stable", ) parser.add_argument( - "--post-commit-status", - default="commit_status", - choices=["commit_status", "file"], - help="Where to public post commit status", + "--report-to-file", + type=str, + default="", + help="Path to write script report to (for --validate-bugfix)", ) return parser.parse_args() @@ -154,7 +146,6 @@ def main(): reports_path = Path(REPORT_PATH) temp_path.mkdir(parents=True, exist_ok=True) - post_commit_path = temp_path / "integration_commit_status.tsv" repo_path = Path(REPO_COPY) args = parse_args() @@ -167,37 +158,19 @@ def main(): if "RUN_BY_HASH_NUM" in os.environ: run_by_hash_num = int(os.getenv("RUN_BY_HASH_NUM", "0")) run_by_hash_total = int(os.getenv("RUN_BY_HASH_TOTAL", "0")) - check_name_with_group = ( - check_name + f" [{run_by_hash_num + 1}/{run_by_hash_total}]" - ) else: run_by_hash_num = 0 run_by_hash_total = 0 - check_name_with_group = check_name is_flaky_check = "flaky" in check_name + assert ( + not validate_bugfix_check or args.report_to_file + ), "--report-to-file must be provided for --validate-bugfix" + # For validate_bugfix_check we need up to date information about labels, so # pr_event_from_api is used - pr_info = PRInfo( - need_changed_files=is_flaky_check or validate_bugfix_check, - pr_event_from_api=validate_bugfix_check, - ) - - if validate_bugfix_check and "pr-bugfix" not in pr_info.labels: - if args.post_commit_status == "file": - post_commit_status_to_file( - post_commit_path, - f"Skipped (no pr-bugfix in {pr_info.labels})", - SUCCESS, - "null", - ) - logging.info("Skipping '%s' (no pr-bugfix in '%s')", check_name, pr_info.labels) - sys.exit(0) - - # FIXME: switch to JobReport and remove: - gh = GitHub(get_best_robot_token()) - commit = get_commit(gh, pr_info.sha) + pr_info = PRInfo(need_changed_files=is_flaky_check or validate_bugfix_check) images = [get_docker_image(image_) for image_ in IMAGES] @@ -245,7 +218,7 @@ def main(): ), ) - ch_helper = ClickHouseHelper() + integration_infrastructure_fail = False with TeePopen(run_command, output_path_log, my_env) as process: retcode = process.wait() if retcode == 0: @@ -254,73 +227,31 @@ def main(): logging.warning( "There were issues with infrastructure. Not writing status report to restart job." 
) - prepared_events = prepare_tests_results_for_clickhouse( - pr_info, - [ - TestResult( - "integration_infrastructure_fail", - "ERROR", - stopwatch.duration_seconds, - ) - ], - ERROR, - stopwatch.duration_seconds, - stopwatch.start_time_str, - "", - check_name_with_group, - ) - - ch_helper.insert_events_into( - db="default", table="checks", events=prepared_events - ) + integration_infrastructure_fail = True sys.exit(1) else: logging.info("Some tests failed") - subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True) + # subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {temp_path}", shell=True) - state, description, test_results, additional_logs = process_results(result_path) - state = override_status(state, check_name, invert=validate_bugfix_check) - - s3_helper = S3Helper() - report_url = upload_results( - s3_helper, - pr_info.number, - pr_info.sha, - test_results, - [output_path_log] + additional_logs, - check_name_with_group, - ) - - print(f"::notice:: {check_name} Report url: {report_url}") - if args.post_commit_status == "commit_status": - post_commit_status( - commit, - state, - report_url, - description, - check_name_with_group, - pr_info, - dump_to_file=True, - ) - elif args.post_commit_status == "file": - post_commit_status_to_file(post_commit_path, description, state, report_url) + if not integration_infrastructure_fail: + state, description, test_results, additional_logs = process_results(result_path) else: - raise Exception( - f'Unknown post_commit_status option "{args.post_commit_status}"' + state, description, test_results, additional_logs = ( + ERROR, + "no description", + [TestResult("infrastructure error", ERROR, stopwatch.duration_seconds)], + [], ) - prepared_events = prepare_tests_results_for_clickhouse( - pr_info, - test_results, - state, - stopwatch.duration_seconds, - stopwatch.start_time_str, - report_url, - check_name_with_group, - ) - - ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) + JobReport( + description=description, + test_results=test_results, + status=state, + start_time=stopwatch.start_time_str, + duration=stopwatch.duration_seconds, + additional_files=[output_path_log] + additional_logs, + ).dump(to_file=args.report_to_file if args.report_to_file else None) if state != SUCCESS: sys.exit(1) diff --git a/tests/ci/mark_release_ready.py b/tests/ci/mark_release_ready.py index 011b3f28843..31415fef9c0 100755 --- a/tests/ci/mark_release_ready.py +++ b/tests/ci/mark_release_ready.py @@ -56,7 +56,6 @@ def main(): description, RELEASE_READY_STATUS, pr_info, - dump_to_file=True, ) diff --git a/tests/ci/pr_info.py b/tests/ci/pr_info.py index 744de7dea72..70f358e8070 100644 --- a/tests/ci/pr_info.py +++ b/tests/ci/pr_info.py @@ -19,7 +19,6 @@ from env_helper import ( GITHUB_SERVER_URL, ) -FORCE_TESTS_LABEL = "force tests" SKIP_MERGEABLE_CHECK_LABEL = "skip mergeable check" NeedsDataType = Dict[str, Dict[str, Union[str, Dict[str, str]]]] diff --git a/tests/ci/report.py b/tests/ci/report.py index 55d1f604605..282c343eec3 100644 --- a/tests/ci/report.py +++ b/tests/ci/report.py @@ -34,6 +34,7 @@ SUCCESS: Final = "success" OK: Final = "OK" FAIL: Final = "FAIL" +SKIPPED: Final = "SKIPPED" StatusType = Literal["error", "failure", "pending", "success"] STATUSES = [ERROR, FAILURE, PENDING, SUCCESS] # type: List[StatusType] @@ -292,9 +293,10 @@ class JobReport: return JOB_REPORT_FILE.is_file() @classmethod - def load(cls): # type: ignore + def load(cls, from_file=None): # type: ignore res = {} - with 
open(JOB_REPORT_FILE, "r") as json_file: + from_file = from_file or JOB_REPORT_FILE + with open(from_file, "r") as json_file: res = json.load(json_file) # Deserialize the nested lists of TestResult test_results_data = res.get("test_results", []) @@ -307,13 +309,14 @@ class JobReport: if JOB_REPORT_FILE.exists(): JOB_REPORT_FILE.unlink() - def dump(self): + def dump(self, to_file=None): def path_converter(obj): if isinstance(obj, Path): return str(obj) raise TypeError("Type not serializable") - with open(JOB_REPORT_FILE, "w") as json_file: + to_file = to_file or JOB_REPORT_FILE + with open(to_file, "w") as json_file: json.dump(asdict(self), json_file, default=path_converter, indent=2) @@ -594,7 +597,6 @@ class ReportColorTheme: blue = "#00B4FF" default = (ReportColor.green, ReportColor.red, ReportColor.yellow) - bugfixcheck = (ReportColor.yellow, ReportColor.blue, ReportColor.blue) ColorTheme = Tuple[str, str, str] diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 2aeac5b5740..09d50c902d8 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -26,7 +26,7 @@ from lambda_shared_package.lambda_shared.pr import ( TRUSTED_CONTRIBUTORS, check_pr_description, ) -from pr_info import FORCE_TESTS_LABEL, PRInfo +from pr_info import PRInfo from report import FAILURE, PENDING TRUSTED_ORG_IDS = { @@ -66,9 +66,6 @@ def pr_is_by_trusted_user(pr_user_login, pr_user_orgs): def should_run_ci_for_pr(pr_info: PRInfo) -> Tuple[bool, str]: # Consider the labels and whether the user is trusted. print("Got labels", pr_info.labels) - if FORCE_TESTS_LABEL in pr_info.labels: - print(f"Label '{FORCE_TESTS_LABEL}' set, forcing remaining checks") - return True, f"Labeled '{FORCE_TESTS_LABEL}'" if OK_SKIP_LABELS.intersection(pr_info.labels): return True, "Don't try new checks for release/backports/cherry-picks" diff --git a/tests/ci/sqllogic_test.py b/tests/ci/sqllogic_test.py index bbd81fd76bb..e9a109e425e 100755 --- a/tests/ci/sqllogic_test.py +++ b/tests/ci/sqllogic_test.py @@ -9,9 +9,8 @@ from pathlib import Path from typing import Tuple from build_download_helper import download_all_deb_packages -from commit_status_helper import override_status -from docker_images_helper import DockerImage, get_docker_image, pull_image -from env_helper import REPO_COPY, REPORT_PATH, TEMP_PATH +from docker_images_helper import DockerImage, pull_image, get_docker_image +from env_helper import REPORT_PATH, TEMP_PATH, REPO_COPY from report import ( ERROR, FAIL, @@ -163,7 +162,7 @@ def main(): status, description = ERROR, "Empty test_results.tsv" assert status is not None - status = override_status(status, check_name) + test_results.append( TestResult( "All tests", diff --git a/tests/ci/upload_result_helper.py b/tests/ci/upload_result_helper.py index 6fa9c1dd873..9dca3fae1dc 100644 --- a/tests/ci/upload_result_helper.py +++ b/tests/ci/upload_result_helper.py @@ -9,7 +9,7 @@ from env_helper import ( GITHUB_RUN_URL, GITHUB_SERVER_URL, ) -from report import ReportColorTheme, TestResults, create_test_html_report +from report import TestResults, create_test_html_report from s3_helper import S3Helper @@ -92,10 +92,6 @@ def upload_results( else: raw_log_url = GITHUB_JOB_URL() - statuscolors = ( - ReportColorTheme.bugfixcheck if "bugfix validate check" in check_name else None - ) - if test_results or not ready_report_url: html_report = create_test_html_report( check_name, @@ -107,7 +103,6 @@ def upload_results( branch_name, commit_url, additional_urls, - statuscolors=statuscolors, ) report_path = Path("report.html") 
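The JobReport.load()/dump() change in report.py above is what lets bugfix_validate_check.py run each sub-check as a subprocess with --report-to-file and then merge the resulting reports. A trimmed-down Python model of that round trip (MiniJobReport and the file names are placeholders, not the real classes or paths):

    import json
    from dataclasses import dataclass, asdict
    from pathlib import Path

    @dataclass
    class MiniJobReport:
        status: str
        description: str

        def dump(self, to_file=None):
            # the default path stands in for JOB_REPORT_FILE; callers may redirect the report
            target = Path(to_file) if to_file else Path("job_report.json")
            target.write_text(json.dumps(asdict(self)), encoding="utf-8")

        @classmethod
        def load(cls, from_file=None):
            source = Path(from_file) if from_file else Path("job_report.json")
            return cls(**json.loads(source.read_text(encoding="utf-8")))

    # each sub-check dumps to its own file; the validation wrapper loads and merges them
    MiniJobReport("success", "functional run").dump(to_file="functional_test_job_report.json")
    print(MiniJobReport.load(from_file="functional_test_job_report.json"))
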
report_path.write_text(html_report, encoding="utf-8") diff --git a/tests/config/config.d/max_num_to_warn.xml b/tests/config/config.d/max_num_to_warn.xml index 77d68998f8e..776c270823d 100644 --- a/tests/config/config.d/max_num_to_warn.xml +++ b/tests/config/config.d/max_num_to_warn.xml @@ -1,5 +1,5 @@ <clickhouse> - <max_table_num_to_warn>10</max_table_num_to_warn> - <max_database_num_to_warn>10</max_database_num_to_warn> + <max_table_num_to_warn>5</max_table_num_to_warn> + <max_database_num_to_warn>2</max_database_num_to_warn> <max_part_num_to_warn>10</max_part_num_to_warn> </clickhouse> diff --git a/tests/integration/ci-runner.py b/tests/integration/ci-runner.py index 7c922e339fe..08dd9ba276b 100755 --- a/tests/integration/ci-runner.py +++ b/tests/integration/ci-runner.py @@ -252,9 +252,7 @@ class ClickhouseIntegrationTestsRunner: self.image_versions = self.params["docker_images_with_versions"] self.shuffle_groups = self.params["shuffle_test_groups"] self.flaky_check = "flaky check" in self.params["context_name"] - self.bugfix_validate_check = ( - "bugfix validate check" in self.params["context_name"] - ) + self.bugfix_validate_check = "bugfix" in self.params["context_name"].lower() # if use_tmpfs is not set we assume it to be true, otherwise check self.use_tmpfs = "use_tmpfs" not in self.params or self.params["use_tmpfs"] self.disable_net_host = ( diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 1d96563251b..542f757ddd4 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -465,7 +465,7 @@ class ClickHouseCluster: self.base_cmd += ["--project-name", self.project_name] self.base_zookeeper_cmd = None - self.base_mysql_cmd = [] + self.base_mysql57_cmd = [] self.base_kafka_cmd = [] self.base_kerberized_kafka_cmd = [] self.base_kerberos_kdc_cmd = [] @@ -479,7 +479,7 @@ class ClickHouseCluster: self.with_zookeeper = False self.with_zookeeper_secure = False self.with_mysql_client = False - self.with_mysql = False + self.with_mysql57 = False self.with_mysql8 = False self.with_mysql_cluster = False self.with_postgres = False @@ -644,12 +644,19 @@ class ClickHouseCluster: self.mysql_client_host = "mysql_client" self.mysql_client_container = None - # available when with_mysql == True - self.mysql_host = "mysql57" - self.mysql_port = 3306 - self.mysql_ip = None - self.mysql_dir = p.abspath(p.join(self.instances_dir, "mysql")) - self.mysql_logs_dir = os.path.join(self.mysql_dir, "logs") + # available when with_mysql57 == True + self.mysql57_host = "mysql57" + self.mysql57_port = 3306 + self.mysql57_ip = None + self.mysql57_dir = p.abspath(p.join(self.instances_dir, "mysql")) + self.mysql57_logs_dir = os.path.join(self.mysql57_dir, "logs") + + # available when with_mysql8 == True + self.mysql8_host = "mysql80" + self.mysql8_port = 3306 + self.mysql8_ip = None + self.mysql8_dir = p.abspath(p.join(self.instances_dir, "mysql8")) + self.mysql8_logs_dir = os.path.join(self.mysql8_dir, "logs") # available when with_mysql_cluster == True self.mysql2_host = "mysql2" @@ -659,14 +666,7 @@ class ClickHouseCluster: self.mysql3_ip = None self.mysql4_ip = None self.mysql_cluster_dir = p.abspath(p.join(self.instances_dir, "mysql")) - self.mysql_cluster_logs_dir = os.path.join(self.mysql_dir, "logs") - - # available when with_mysql8 == True - self.mysql8_host = "mysql80" - self.mysql8_port = 3306 - self.mysql8_ip = None - self.mysql8_dir = p.abspath(p.join(self.instances_dir, "mysql8")) - self.mysql8_logs_dir = os.path.join(self.mysql8_dir, "logs") + self.mysql_cluster_logs_dir = os.path.join(self.mysql8_dir, "logs") # available when with_zookeper_secure == True self.zookeeper_secure_port = 2281 @@ -1045,17 +1045,17 @@ class ClickHouseCluster: return self.base_mysql_client_cmd - def
setup_mysql_cmd(self, instance, env_variables, docker_compose_yml_dir): - self.with_mysql = True - env_variables["MYSQL_HOST"] = self.mysql_host - env_variables["MYSQL_PORT"] = str(self.mysql_port) + def setup_mysql57_cmd(self, instance, env_variables, docker_compose_yml_dir): + self.with_mysql57 = True + env_variables["MYSQL_HOST"] = self.mysql57_host + env_variables["MYSQL_PORT"] = str(self.mysql57_port) env_variables["MYSQL_ROOT_HOST"] = "%" - env_variables["MYSQL_LOGS"] = self.mysql_logs_dir + env_variables["MYSQL_LOGS"] = self.mysql57_logs_dir env_variables["MYSQL_LOGS_FS"] = "bind" self.base_cmd.extend( ["--file", p.join(docker_compose_yml_dir, "docker_compose_mysql.yml")] ) - self.base_mysql_cmd = [ + self.base_mysql57_cmd = [ "docker-compose", "--env-file", instance.env_file, @@ -1065,7 +1065,7 @@ class ClickHouseCluster: p.join(docker_compose_yml_dir, "docker_compose_mysql.yml"), ] - return self.base_mysql_cmd + return self.base_mysql57_cmd def setup_mysql8_cmd(self, instance, env_variables, docker_compose_yml_dir): self.with_mysql8 = True @@ -1091,7 +1091,7 @@ class ClickHouseCluster: def setup_mysql_cluster_cmd(self, instance, env_variables, docker_compose_yml_dir): self.with_mysql_cluster = True - env_variables["MYSQL_CLUSTER_PORT"] = str(self.mysql_port) + env_variables["MYSQL_CLUSTER_PORT"] = str(self.mysql8_port) env_variables["MYSQL_CLUSTER_ROOT_HOST"] = "%" env_variables["MYSQL_CLUSTER_LOGS"] = self.mysql_cluster_logs_dir env_variables["MYSQL_CLUSTER_LOGS_FS"] = "bind" @@ -1572,7 +1572,7 @@ class ClickHouseCluster: with_zookeeper=False, with_zookeeper_secure=False, with_mysql_client=False, - with_mysql=False, + with_mysql57=False, with_mysql8=False, with_mysql_cluster=False, with_kafka=False, @@ -1676,7 +1676,7 @@ class ClickHouseCluster: with_zookeeper=with_zookeeper, zookeeper_config_path=self.zookeeper_config_path, with_mysql_client=with_mysql_client, - with_mysql=with_mysql, + with_mysql57=with_mysql57, with_mysql8=with_mysql8, with_mysql_cluster=with_mysql_cluster, with_kafka=with_kafka, @@ -1767,9 +1767,9 @@ class ClickHouseCluster: ) ) - if with_mysql and not self.with_mysql: + if with_mysql57 and not self.with_mysql57: cmds.append( - self.setup_mysql_cmd(instance, env_variables, docker_compose_yml_dir) + self.setup_mysql57_cmd(instance, env_variables, docker_compose_yml_dir) ) if with_mysql8 and not self.with_mysql8: @@ -1805,9 +1805,9 @@ class ClickHouseCluster: if with_odbc_drivers and not self.with_odbc_drivers: self.with_odbc_drivers = True - if not self.with_mysql: + if not self.with_mysql8: cmds.append( - self.setup_mysql_cmd( + self.setup_mysql8_cmd( instance, env_variables, docker_compose_yml_dir ) ) @@ -2148,8 +2148,8 @@ class ClickHouseCluster: logging.error("Can't connect to MySQL Client:{}".format(errors)) raise Exception("Cannot wait MySQL Client container") - def wait_mysql_to_start(self, timeout=180): - self.mysql_ip = self.get_instance_ip("mysql57") + def wait_mysql57_to_start(self, timeout=180): + self.mysql57_ip = self.get_instance_ip("mysql57") start = time.time() errors = [] while time.time() - start < timeout: @@ -2157,8 +2157,8 @@ class ClickHouseCluster: conn = pymysql.connect( user=mysql_user, password=mysql_pass, - host=self.mysql_ip, - port=self.mysql_port, + host=self.mysql57_ip, + port=self.mysql57_port, ) conn.close() logging.debug("Mysql Started") @@ -2205,7 +2205,7 @@ class ClickHouseCluster: user=mysql_user, password=mysql_pass, host=ip, - port=self.mysql_port, + port=self.mysql8_port, ) conn.close() logging.debug(f"Mysql Started 
{ip}") @@ -2752,15 +2752,15 @@ class ClickHouseCluster: subprocess_check_call(self.base_mysql_client_cmd + common_opts) self.wait_mysql_client_to_start() - if self.with_mysql and self.base_mysql_cmd: + if self.with_mysql57 and self.base_mysql57_cmd: logging.debug("Setup MySQL") - if os.path.exists(self.mysql_dir): - shutil.rmtree(self.mysql_dir) - os.makedirs(self.mysql_logs_dir) - os.chmod(self.mysql_logs_dir, stat.S_IRWXU | stat.S_IRWXO) - subprocess_check_call(self.base_mysql_cmd + common_opts) + if os.path.exists(self.mysql57_dir): + shutil.rmtree(self.mysql57_dir) + os.makedirs(self.mysql57_logs_dir) + os.chmod(self.mysql57_logs_dir, stat.S_IRWXU | stat.S_IRWXO) + subprocess_check_call(self.base_mysql57_cmd + common_opts) self.up_called = True - self.wait_mysql_to_start() + self.wait_mysql57_to_start() if self.with_mysql8 and self.base_mysql8_cmd: logging.debug("Setup MySQL 8") @@ -2775,7 +2775,7 @@ class ClickHouseCluster: print("Setup MySQL") if os.path.exists(self.mysql_cluster_dir): shutil.rmtree(self.mysql_cluster_dir) - os.makedirs(self.mysql_cluster_logs_dir) + os.makedirs(self.mysql_cluster_logs_dir, exist_ok=True) os.chmod(self.mysql_cluster_logs_dir, stat.S_IRWXU | stat.S_IRWXO) subprocess_check_call(self.base_mysql_cluster_cmd + common_opts) @@ -3239,7 +3239,7 @@ class ClickHouseInstance: with_zookeeper, zookeeper_config_path, with_mysql_client, - with_mysql, + with_mysql57, with_mysql8, with_mysql_cluster, with_kafka, @@ -3324,7 +3324,7 @@ class ClickHouseInstance: self.library_bridge_bin_path = library_bridge_bin_path self.with_mysql_client = with_mysql_client - self.with_mysql = with_mysql + self.with_mysql57 = with_mysql57 self.with_mysql8 = with_mysql8 self.with_mysql_cluster = with_mysql_cluster self.with_postgres = with_postgres @@ -3368,7 +3368,7 @@ class ClickHouseInstance: self.env_file = self.cluster.env_file if with_odbc_drivers: self.odbc_ini_path = self.path + "/odbc.ini:/etc/odbc.ini" - self.with_mysql = True + self.with_mysql8 = True else: self.odbc_ini_path = "" @@ -4294,7 +4294,7 @@ class ClickHouseInstance: "Database": odbc_mysql_db, "Uid": odbc_mysql_uid, "Pwd": odbc_mysql_pass, - "Server": self.cluster.mysql_host, + "Server": self.cluster.mysql8_host, }, "PostgreSQL": { "DSN": "postgresql_odbc", @@ -4482,14 +4482,14 @@ class ClickHouseInstance: if self.with_mysql_client: depends_on.append(self.cluster.mysql_client_host) - if self.with_mysql: + if self.with_mysql57: depends_on.append("mysql57") if self.with_mysql8: depends_on.append("mysql80") if self.with_mysql_cluster: - depends_on.append("mysql57") + depends_on.append("mysql80") depends_on.append("mysql2") depends_on.append("mysql3") depends_on.append("mysql4") diff --git a/tests/integration/helpers/external_sources.py b/tests/integration/helpers/external_sources.py index cccf151e73e..033a2f84fa2 100644 --- a/tests/integration/helpers/external_sources.py +++ b/tests/integration/helpers/external_sources.py @@ -119,7 +119,7 @@ class SourceMySQL(ExternalSource): def prepare(self, structure, table_name, cluster): if self.internal_hostname is None: - self.internal_hostname = cluster.mysql_ip + self.internal_hostname = cluster.mysql8_ip self.create_mysql_conn() self.execute_mysql_query( "create database if not exists test default character set 'utf8'" diff --git a/tests/integration/helpers/keeper_config1.xml b/tests/integration/helpers/keeper_config1.xml index 12c6c0b78b6..a4a1059ffe9 100644 --- a/tests/integration/helpers/keeper_config1.xml +++ b/tests/integration/helpers/keeper_config1.xml @@ -9,11 +9,13 
@@ /var/log/clickhouse-keeper/clickhouse-keeper.err.log + + 0 + az-zoo1 + + 2181 - - az-zoo1 - 1 diff --git a/tests/integration/helpers/keeper_config2.xml b/tests/integration/helpers/keeper_config2.xml index 2afff2f5e59..88a0d1f0b4b 100644 --- a/tests/integration/helpers/keeper_config2.xml +++ b/tests/integration/helpers/keeper_config2.xml @@ -9,13 +9,14 @@ /var/log/clickhouse-keeper/clickhouse-keeper.err.log + + 0 + az-zoo2 + + 2181 2 - - az-zoo2 - 1 - 10000 diff --git a/tests/integration/test_backup_restore_s3/configs/s3_settings.xml b/tests/integration/test_backup_restore_s3/configs/s3_settings.xml index 981cf67bbe9..61ef7759b57 100644 --- a/tests/integration/test_backup_restore_s3/configs/s3_settings.xml +++ b/tests/integration/test_backup_restore_s3/configs/s3_settings.xml @@ -1,5 +1,6 @@ + 0 http://minio1:9001/root/data/backups/multipart/ CREATE TABLE nullfloat32 (x Nullable(Float32)) ENGINE = Memory INSERT INTO nullfloat32 diff --git a/tests/queries/0_stateless/00700_decimal_arithm.reference b/tests/queries/0_stateless/00700_decimal_arithm.reference index 811946c87e0..109c0632fb1 100644 --- a/tests/queries/0_stateless/00700_decimal_arithm.reference +++ b/tests/queries/0_stateless/00700_decimal_arithm.reference @@ -10,18 +10,18 @@ 63 21 -42 882 -882 2 0 2 0 63 21 -42 882 -882 2 0 2 0 1.00305798474369219219752355409390731264 -0.16305798474369219219752355409390731264 1.490591730234615865843651857942052864 -1.38847100762815390390123822295304634368 1.38847100762815390390123822295304634368 0.02 0.005 -63.42 21.42 -41.58 890.82 -890.82 2.02 0.505 2.02 0.505 -63.42 21.42 -41.58 890.82 -890.82 2.02 0.505 2.02 0.505 -63.42 21.42 -41.58 890.82 -890.82 2.02 0.505 2.02 0.505 -63.42 21.42 -41.58 890.82 -890.82 2.02 0.5 2.02 0.5 +63.42 21.42 -41.58 890.82 -890.82 2.02 0.505 2 0 +63.42 21.42 -41.58 890.82 -890.82 2.02 0.505 2 0 +63.42 21.42 -41.58 890.82 -890.82 2.02 0.505 2 0 +63.42 21.42 -41.58 890.82 -890.82 2.02 0.5 2 0 63 -21 42 882 -882 0 2 0 2 63 -21 42 882 -882 0 2 0 2 63 -21 42 882 -882 0 2 0 2 1.00305798474369219219752355409390731264 0.16305798474369219219752355409390731264 -1.490591730234615865843651857942052864 -1.38847100762815390390123822295304634368 1.38847100762815390390123822295304634368 -0.00000000000000000000000000000000000001 0.00000000000000000000000000000000000001 -63.42 -21.42 41.58 890.82 -890.82 0.495 1.98 0.495 1.98 +63.42 -21.42 41.58 890.82 -890.82 0.495 1.98 0 1 63.42 -21.42 41.58 890.82 -890.82 -63.42 -21.42 41.58 890.82 -890.82 0.495049504950495049 1.980198019801980198 0.495049504950495049 1.980198019801980198 -63.42 -21.42 41.58 890.82 -890.82 0.49 1.98 0.49 1.98 +63.42 -21.42 41.58 890.82 -890.82 0.495049504950495049 1.980198019801980198 0 1 +63.42 -21.42 41.58 890.82 -890.82 0.49 1.98 0 1 -42 42 42 42 0.42 0.42 0.42 42.42 42.42 42.42 0 0 0 0 0 0 0 0 0 0 42 -42 -42 -42 -0.42 -0.42 -0.42 -42.42 -42.42 -42.42 diff --git a/tests/queries/0_stateless/00752_low_cardinality_lambda_argument.sql b/tests/queries/0_stateless/00752_low_cardinality_lambda_argument.sql index a4bdbd5653c..998ff2f54d3 100644 --- a/tests/queries/0_stateless/00752_low_cardinality_lambda_argument.sql +++ b/tests/queries/0_stateless/00752_low_cardinality_lambda_argument.sql @@ -1,3 +1,4 @@ +set allow_suspicious_low_cardinality_types=1; drop table if exists lc_lambda; create table lc_lambda (arr Array(LowCardinality(UInt64))) engine = Memory; insert into lc_lambda select range(number) from system.numbers limit 10; diff --git a/tests/queries/0_stateless/00752_low_cardinality_left_array_join.sql 
b/tests/queries/0_stateless/00752_low_cardinality_left_array_join.sql index 1c19700e34d..2d65f01a1b9 100644 --- a/tests/queries/0_stateless/00752_low_cardinality_left_array_join.sql +++ b/tests/queries/0_stateless/00752_low_cardinality_left_array_join.sql @@ -1,3 +1,4 @@ +set allow_suspicious_low_cardinality_types=1; drop table if exists lc_left_aj; CREATE TABLE lc_left_aj ( diff --git a/tests/queries/0_stateless/00873_t64_codec_date.reference b/tests/queries/0_stateless/00873_t64_codec_date.reference new file mode 100644 index 00000000000..9353696610c --- /dev/null +++ b/tests/queries/0_stateless/00873_t64_codec_date.reference @@ -0,0 +1,4 @@ +1970-01-01 1970-01-01 1950-01-01 1950-01-01 +1970-01-01 1970-01-01 1970-01-01 1970-01-01 +2149-06-06 2149-06-06 2149-06-06 2149-06-06 +2149-06-06 2149-06-06 2149-06-08 2149-06-08 diff --git a/tests/queries/0_stateless/00873_t64_codec_date.sql b/tests/queries/0_stateless/00873_t64_codec_date.sql new file mode 100644 index 00000000000..c6e21baba12 --- /dev/null +++ b/tests/queries/0_stateless/00873_t64_codec_date.sql @@ -0,0 +1,26 @@ +DROP TABLE IF EXISTS t64; + +CREATE TABLE t64 +( + date16 Date, + t_date16 Date Codec(T64, ZSTD), + date_32 Date32, + t_date32 Date32 Codec(T64, ZSTD) +) ENGINE MergeTree() ORDER BY tuple(); + +INSERT INTO t64 values ('1970-01-01', '1970-01-01', '1970-01-01', '1970-01-01'); +INSERT INTO t64 values ('2149-06-06', '2149-06-06', '2149-06-06', '2149-06-06'); +INSERT INTO t64 values ('2149-06-08', '2149-06-08', '2149-06-08', '2149-06-08'); +INSERT INTO t64 values ('1950-01-01', '1950-01-01', '1950-01-01', '1950-01-01'); + +SELECT * FROM t64 ORDER BY date_32; + +SELECT * FROM t64 WHERE date16 != t_date16; +SELECT * FROM t64 WHERE date_32 != t_date32; + +OPTIMIZE TABLE t64 FINAL; + +SELECT * FROM t64 WHERE date16 != t_date16; +SELECT * FROM t64 WHERE date_32 != t_date32; + +DROP TABLE t64; diff --git a/tests/queries/0_stateless/00945_bloom_filter_index.sql b/tests/queries/0_stateless/00945_bloom_filter_index.sql index dc47e858c4d..faa7feda04d 100644 --- a/tests/queries/0_stateless/00945_bloom_filter_index.sql +++ b/tests/queries/0_stateless/00945_bloom_filter_index.sql @@ -1,3 +1,4 @@ +SET allow_suspicious_low_cardinality_types=1; DROP TABLE IF EXISTS single_column_bloom_filter; diff --git a/tests/queries/0_stateless/01030_storage_url_syntax.sql b/tests/queries/0_stateless/01030_storage_url_syntax.sql index 9b31558eece..eda108aca2f 100644 --- a/tests/queries/0_stateless/01030_storage_url_syntax.sql +++ b/tests/queries/0_stateless/01030_storage_url_syntax.sql @@ -1,7 +1,7 @@ drop table if exists test_table_url_syntax ; create table test_table_url_syntax (id UInt32) ENGINE = URL('') -; -- { serverError 36 } +; -- { serverError UNSUPPORTED_URI_SCHEME } create table test_table_url_syntax (id UInt32) ENGINE = URL('','','','') ; -- { serverError 42 } drop table if exists test_table_url_syntax @@ -11,7 +11,7 @@ drop table if exists test_table_url ; create table test_table_url(id UInt32) ENGINE = URL('http://localhost/endpoint') -; -- { serverError 36 } +; -- { serverError CANNOT_DETECT_FORMAT } create table test_table_url(id UInt32) ENGINE = URL('http://localhost/endpoint.json'); drop table test_table_url; diff --git a/tests/queries/0_stateless/01193_metadata_loading.sh b/tests/queries/0_stateless/01193_metadata_loading.sh index c25cdf4e970..69178a93d42 100755 --- a/tests/queries/0_stateless/01193_metadata_loading.sh +++ b/tests/queries/0_stateless/01193_metadata_loading.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-tsan, 
no-asan, no-ubsan, no-msan, no-debug, no-parallel, no-fasttest, no-s3-storage +# Tags: no-tsan, no-asan, no-ubsan, no-msan, no-debug, no-parallel, no-fasttest, no-s3-storage, no-sanitize-coverage CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -8,16 +8,12 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # Check that attaching a database with a large number of tables is not too slow. # it is the worst way of making performance test, nevertheless it can detect significant slowdown and some other issues, that usually found by stress test -db="test_01193_$RANDOM" +db="test_01193_${RANDOM}_${RANDOM}_${RANDOM}_${RANDOM}" tables=1000 threads=10 count_multiplier=1 max_time_ms=1500 -debug_or_sanitizer_build=$($CLICKHOUSE_CLIENT -q "WITH ((SELECT value FROM system.build_options WHERE name='BUILD_TYPE') AS build, (SELECT value FROM system.build_options WHERE name='CXX_FLAGS') as flags) SELECT build='Debug' OR flags LIKE '%fsanitize%' OR hasThreadFuzzer()") - -if [[ debug_or_sanitizer_build -eq 1 ]]; then tables=100; count_multiplier=10; max_time_ms=1500; fi - create_tables() { $CLICKHOUSE_CLIENT -q "WITH 'CREATE TABLE $db.table_$1_' AS create1, diff --git a/tests/queries/0_stateless/01414_low_cardinality_nullable.sql b/tests/queries/0_stateless/01414_low_cardinality_nullable.sql index 2d3d31e9b5c..cd5111faf45 100644 --- a/tests/queries/0_stateless/01414_low_cardinality_nullable.sql +++ b/tests/queries/0_stateless/01414_low_cardinality_nullable.sql @@ -1,3 +1,5 @@ +SET allow_suspicious_low_cardinality_types=1; + DROP TABLE IF EXISTS lc_nullable; CREATE TABLE lc_nullable ( diff --git a/tests/queries/0_stateless/01441_low_cardinality_array_index.sql b/tests/queries/0_stateless/01441_low_cardinality_array_index.sql index 4b31a86edfb..b5e14c957c6 100644 --- a/tests/queries/0_stateless/01441_low_cardinality_array_index.sql +++ b/tests/queries/0_stateless/01441_low_cardinality_array_index.sql @@ -1,3 +1,5 @@ +SET allow_suspicious_low_cardinality_types=1; + DROP TABLE IF EXISTS t_01411; CREATE TABLE t_01411( diff --git a/tests/queries/0_stateless/01557_field_infinite_convert_to_number.sql b/tests/queries/0_stateless/01557_field_infinite_convert_to_number.sql index edc4d5cbc91..cc71c8e6f6c 100644 --- a/tests/queries/0_stateless/01557_field_infinite_convert_to_number.sql +++ b/tests/queries/0_stateless/01557_field_infinite_convert_to_number.sql @@ -1 +1 @@ -SET max_threads = nan; -- { serverError 70 } +SET max_threads = nan; -- { serverError CANNOT_CONVERT_TYPE } diff --git a/tests/queries/0_stateless/01595_countMatches.reference b/tests/queries/0_stateless/01595_countMatches.reference index c65279c0b8e..394c8508430 100644 --- a/tests/queries/0_stateless/01595_countMatches.reference +++ b/tests/queries/0_stateless/01595_countMatches.reference @@ -12,6 +12,7 @@ case sensitive 2 4 4 +2 case insensitive 2 1 @@ -21,4 +22,8 @@ case insensitive 2 4 4 +2 errors +FixedString +2 +2 diff --git a/tests/queries/0_stateless/01595_countMatches.sql b/tests/queries/0_stateless/01595_countMatches.sql index 0b170945d44..0c2982572cd 100644 --- a/tests/queries/0_stateless/01595_countMatches.sql +++ b/tests/queries/0_stateless/01595_countMatches.sql @@ -14,6 +14,7 @@ select countMatches(concat(toString(number), 'foofoo'), 'foo') from numbers(2); select countMatches('foobarbazfoobarbaz', 'foo(bar)(?:baz|)'); select countMatches('foo.com bar.com baz.com bam.com', '([^. ]+)\.([^. ]+)'); select countMatches('foo.com@foo.com bar.com@foo.com baz.com@foo.com bam.com@foo.com', '([^. ]+)\.([^. 
]+)@([^. ]+)\.([^. ]+)'); +select countMatches(materialize('foobarfoo'), 'foo'); select 'case insensitive'; select countMatchesCaseInsensitive('foobarfoo', 'FOo'); @@ -23,7 +24,13 @@ select countMatchesCaseInsensitive(concat(toString(number), 'Foofoo'), 'foo') fr select countMatchesCaseInsensitive('foOBarBAZfoobarbaz', 'foo(bar)(?:baz|)'); select countMatchesCaseInsensitive('foo.com BAR.COM baz.com bam.com', '([^. ]+)\.([^. ]+)'); select countMatchesCaseInsensitive('foo.com@foo.com bar.com@foo.com BAZ.com@foo.com bam.com@foo.com', '([^. ]+)\.([^. ]+)@([^. ]+)\.([^. ]+)'); +select countMatchesCaseInsensitive(materialize('foobarfoo'), 'FOo'); select 'errors'; -select countMatches(1, 'foo') from numbers(1); -- { serverError 43 } -select countMatches('foobarfoo', toString(number)) from numbers(1); -- { serverError 44 } +select countMatches(1, 'foo') from numbers(1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +select countMatches('foobarfoo', toString(number)) from numbers(1); -- { serverError ILLEGAL_COLUMN } +select countMatches('foo', materialize('foo')); -- { serverError ILLEGAL_COLUMN } + +select 'FixedString'; +select countMatches(toFixedString('foobarfoo', 9), 'foo'); +select countMatches(materialize(toFixedString('foobarfoo', 9)), 'foo'); diff --git a/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql b/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql index 22532529812..d405bb01fd9 100644 --- a/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql +++ b/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql @@ -1,3 +1,4 @@ +set allow_suspicious_low_cardinality_types=1; drop table if exists perf_lc_num; CREATE TABLE perf_lc_num(  num UInt8,  arr Array(LowCardinality(Int64)) default [num]  ) ENGINE = TinyLog; diff --git a/tests/queries/0_stateless/01656_test_query_log_factories_info.sql b/tests/queries/0_stateless/01656_test_query_log_factories_info.sql index 020d7cc5e72..8a6b604b053 100644 --- a/tests/queries/0_stateless/01656_test_query_log_factories_info.sql +++ b/tests/queries/0_stateless/01656_test_query_log_factories_info.sql @@ -41,7 +41,9 @@ FROM system.query_log WHERE current_database = currentDatabase() AND type = 'Que ORDER BY query_start_time DESC LIMIT 1 FORMAT TabSeparatedWithNames; SELECT ''; -SELECT arraySort(used_functions) +-- 1. analyzer includes arrayJoin into functions list +-- 2. for crc32 (CaseInsensitive function) we use lower case now +SELECT arraySort(arrayMap(x -> x == 'crc32' ? 
'CRC32' : x, arrayFilter(x-> x != 'arrayJoin', used_functions))) as `arraySort(used_functions)` FROM system.query_log WHERE current_database = currentDatabase() AND type = 'QueryFinish' AND (query LIKE '%toDate(\'2000-12-05\')%') ORDER BY query_start_time DESC LIMIT 1 FORMAT TabSeparatedWithNames; SELECT ''; diff --git a/tests/queries/0_stateless/01717_int_div_float_too_large_ubsan.sql b/tests/queries/0_stateless/01717_int_div_float_too_large_ubsan.sql index c4f26a079f0..dc1e5b37050 100644 --- a/tests/queries/0_stateless/01717_int_div_float_too_large_ubsan.sql +++ b/tests/queries/0_stateless/01717_int_div_float_too_large_ubsan.sql @@ -1,2 +1,2 @@ -SELECT intDiv(9223372036854775807, 0.9998999834060669); -- { serverError 153 } -SELECT intDiv(9223372036854775807, 1.); -- { serverError 153 } +SELECT intDiv(18446744073709551615, 0.9998999834060669); -- { serverError 153 } +SELECT intDiv(18446744073709551615, 1.); -- { serverError 153 } diff --git a/tests/queries/0_stateless/01852_dictionary_found_rate_long.sql b/tests/queries/0_stateless/01852_dictionary_found_rate_long.sql index 09ca0e2063d..d5108e98510 100644 --- a/tests/queries/0_stateless/01852_dictionary_found_rate_long.sql +++ b/tests/queries/0_stateless/01852_dictionary_found_rate_long.sql @@ -22,7 +22,7 @@ CREATE DICTIONARY simple_key_flat_dictionary_01862 value String ) PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'simple_key_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'simple_key_source_table_01862')) LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 1000); @@ -43,7 +43,7 @@ CREATE DICTIONARY simple_key_direct_dictionary_01862 value String ) PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'simple_key_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'simple_key_source_table_01862')) LAYOUT(DIRECT()); -- check that found_rate is 0, not nan @@ -65,7 +65,7 @@ CREATE DICTIONARY simple_key_hashed_dictionary_01862 value String ) PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'simple_key_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'simple_key_source_table_01862')) LAYOUT(HASHED()) LIFETIME(MIN 0 MAX 1000); @@ -85,7 +85,7 @@ CREATE DICTIONARY simple_key_sparse_hashed_dictionary_01862 value String ) PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'simple_key_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'simple_key_source_table_01862')) LAYOUT(SPARSE_HASHED()) LIFETIME(MIN 0 MAX 1000); @@ -105,7 +105,7 @@ CREATE DICTIONARY simple_key_cache_dictionary_01862 value String ) PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'simple_key_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'simple_key_source_table_01862')) LAYOUT(CACHE(SIZE_IN_CELLS 100000)) LIFETIME(MIN 0 MAX 1000); @@ -143,7 +143,7 @@ CREATE DICTIONARY complex_key_hashed_dictionary_01862 value String ) PRIMARY KEY id, id_key -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'complex_key_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'complex_key_source_table_01862')) LAYOUT(COMPLEX_KEY_HASHED()) LIFETIME(MIN 0 MAX 1000); @@ -164,7 +164,7 @@ CREATE DICTIONARY complex_key_direct_dictionary_01862 value String ) PRIMARY KEY id, id_key -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'complex_key_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'complex_key_source_table_01862')) LAYOUT(COMPLEX_KEY_DIRECT()); SELECT name, found_rate FROM system.dictionaries WHERE database = currentDatabase() AND name = 'complex_key_direct_dictionary_01862'; @@ -184,7 +184,7 @@ CREATE 
DICTIONARY complex_key_cache_dictionary_01862 value String ) PRIMARY KEY id, id_key -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'complex_key_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'complex_key_source_table_01862')) LAYOUT(COMPLEX_KEY_CACHE(SIZE_IN_CELLS 100000)) LIFETIME(MIN 0 MAX 1000); @@ -223,7 +223,7 @@ CREATE DICTIONARY simple_key_range_hashed_dictionary_01862 last Date ) PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'range_key_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'range_key_source_table_01862')) LAYOUT(RANGE_HASHED()) RANGE(MIN first MAX last) LIFETIME(MIN 0 MAX 1000); @@ -259,13 +259,16 @@ CREATE DICTIONARY ip_trie_dictionary_01862 value String ) PRIMARY KEY prefix -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'ip_trie_source_table_01862')) +SOURCE(CLICKHOUSE(TABLE 'ip_trie_source_table_01862')) LAYOUT(IP_TRIE()) LIFETIME(MIN 0 MAX 1000); +-- found_rate = 0, because we didn't make any searches. SELECT name, found_rate FROM system.dictionaries WHERE database = currentDatabase() AND name = 'ip_trie_dictionary_01862'; +-- found_rate = 1, because the dictionary covers the 127.0.0.1 address. SELECT dictGet('ip_trie_dictionary_01862', 'value', tuple(toIPv4('127.0.0.1'))) FORMAT Null; SELECT name, found_rate FROM system.dictionaries WHERE database = currentDatabase() AND name = 'ip_trie_dictionary_01862'; +-- found_rate = 0.5, because the dictionary does not cover 1.1.1.1 and we have two lookups in total as of now. SELECT dictGet('ip_trie_dictionary_01862', 'value', tuple(toIPv4('1.1.1.1'))) FORMAT Null; SELECT name, found_rate FROM system.dictionaries WHERE database = currentDatabase() AND name = 'ip_trie_dictionary_01862'; @@ -299,7 +302,7 @@ CREATE DICTIONARY polygon_dictionary_01862 name String ) PRIMARY KEY key -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'polygons_01862')) +SOURCE(CLICKHOUSE(USER 'default' TABLE 'polygons_01862')) LIFETIME(0) LAYOUT(POLYGON()); diff --git a/tests/queries/0_stateless/01889_sqlite_read_write.reference b/tests/queries/0_stateless/01889_sqlite_read_write.reference index 9f2b382e41e..e605693d95d 100644 --- a/tests/queries/0_stateless/01889_sqlite_read_write.reference +++ b/tests/queries/0_stateless/01889_sqlite_read_write.reference @@ -29,7 +29,7 @@ CREATE TABLE default.sqlite_table3\n(\n `col1` String,\n `col2` Int32\n)\n not a null 2 3 4 -line6 6 +line\'6 6 7 test table function line1 1 diff --git a/tests/queries/0_stateless/01889_sqlite_read_write.sh b/tests/queries/0_stateless/01889_sqlite_read_write.sh index 02b9a649e94..fd0a1df20ac 100755 --- a/tests/queries/0_stateless/01889_sqlite_read_write.sh +++ b/tests/queries/0_stateless/01889_sqlite_read_write.sh @@ -76,7 +76,7 @@ ${CLICKHOUSE_CLIENT} --query='DROP TABLE IF EXISTS sqlite_table3' ${CLICKHOUSE_CLIENT} --query="CREATE TABLE sqlite_table3 (col1 String, col2 Int32) ENGINE = SQLite('${DB_PATH}', 'table3')" ${CLICKHOUSE_CLIENT} --query='SHOW CREATE TABLE sqlite_table3;' | sed -r 's/(.*SQLite)(.*)/\1/' -${CLICKHOUSE_CLIENT} --query="INSERT INTO sqlite_table3 VALUES ('line6', 6);" +${CLICKHOUSE_CLIENT} --query="INSERT INTO sqlite_table3 VALUES ('line\'6', 6);" ${CLICKHOUSE_CLIENT} --query="INSERT INTO sqlite_table3 VALUES (NULL, 7);" ${CLICKHOUSE_CLIENT} --query='SELECT * FROM sqlite_table3 ORDER BY col2' diff --git a/tests/queries/0_stateless/01920_not_chain_format.reference b/tests/queries/0_stateless/01920_not_chain_format.reference index 22abfd17dc7..bb58a0ff146 100644 --- 
a/tests/queries/0_stateless/01920_not_chain_format.reference +++ b/tests/queries/0_stateless/01920_not_chain_format.reference @@ -1,5 +1,5 @@ -- { echo } EXPLAIN SYNTAX SELECT NOT NOT (NOT (NOT (NULL))); -SELECT NOT (NOT (NOT NOT NULL)) +SELECT NOT (NOT (NOT (NOT NULL))) EXPLAIN SYNTAX SELECT NOT (NOT (NOT NOT NULL)); -SELECT NOT (NOT (NOT NOT NULL)) +SELECT NOT (NOT (NOT (NOT NULL))) diff --git a/tests/queries/0_stateless/01921_not_chain.reference b/tests/queries/0_stateless/01921_not_chain.reference index c29c66f1274..ebd18f4b342 100644 --- a/tests/queries/0_stateless/01921_not_chain.reference +++ b/tests/queries/0_stateless/01921_not_chain.reference @@ -4,6 +4,6 @@ SELECT 1 != (NOT 1); SELECT 1 != NOT 1; 1 EXPLAIN SYNTAX SELECT 1 != (NOT 1); -SELECT 1 != NOT 1 +SELECT 1 != (NOT 1) EXPLAIN SYNTAX SELECT 1 != NOT 1; -SELECT 1 != NOT 1 +SELECT 1 != (NOT 1) diff --git a/tests/queries/0_stateless/02008_materialize_column.sql b/tests/queries/0_stateless/02008_materialize_column.sql index a78920d2525..cc7d3096402 100644 --- a/tests/queries/0_stateless/02008_materialize_column.sql +++ b/tests/queries/0_stateless/02008_materialize_column.sql @@ -17,6 +17,7 @@ ALTER TABLE tmp MATERIALIZE COLUMN s; ALTER TABLE tmp MODIFY COLUMN s String DEFAULT toString(x+2); SELECT arraySort(groupArray(x)), groupArray(s) FROM tmp; +ALTER TABLE tmp CLEAR COLUMN s; -- Need to clear because MATERIALIZE COLUMN won't override past values; ALTER TABLE tmp MATERIALIZE COLUMN s; ALTER TABLE tmp MODIFY COLUMN s String DEFAULT toString(x+3); SELECT arraySort(groupArray(x)), groupArray(s) FROM tmp; diff --git a/tests/queries/0_stateless/02010_array_index_bad_cast.sql b/tests/queries/0_stateless/02010_array_index_bad_cast.sql index 19c58bb28a7..42a6556fc77 100644 --- a/tests/queries/0_stateless/02010_array_index_bad_cast.sql +++ b/tests/queries/0_stateless/02010_array_index_bad_cast.sql @@ -1,2 +1,3 @@ -- This query throws exception about uncomparable data types (but at least it does not introduce bad cast in code). 
+SET allow_suspicious_low_cardinality_types=1; SELECT has(materialize(CAST(['2021-07-14'] AS Array(LowCardinality(Nullable(DateTime))))), materialize('2021-07-14'::DateTime64(7))); -- { serverError 44 } diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index 1b758f4132b..e60fb844de8 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -1083,6 +1083,7 @@ CREATE TABLE system.tables `data_paths` Array(String), `metadata_path` String, `metadata_modification_time` DateTime, + `metadata_version` Int32, `dependencies_database` Array(String), `dependencies_table` Array(String), `create_table_query` String, diff --git a/tests/queries/0_stateless/02117_show_create_table_system.sql b/tests/queries/0_stateless/02117_show_create_table_system.sql index 32465abbed7..438f26dcca7 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.sql +++ b/tests/queries/0_stateless/02117_show_create_table_system.sql @@ -1,6 +1,6 @@ /* we will `use system` to bypass style check, because `show create table` statement -cannot fit the requirement in check-sytle, which is as +cannot fit the requirement in check-style, which is as "# Queries to: tables_with_database_column=( diff --git a/tests/queries/0_stateless/02131_row_policies_combination.reference b/tests/queries/0_stateless/02131_row_policies_combination.reference index b76028d5077..5015cb14456 100644 --- a/tests/queries/0_stateless/02131_row_policies_combination.reference +++ b/tests/queries/0_stateless/02131_row_policies_combination.reference @@ -12,6 +12,15 @@ R1, R2, R3: (x == 1) OR (x == 2) OR (x == 3) 1 2 3 +R1, R2, R3 + additional_table_filters and PREWHERE: (x == 1) OR (x == 2) OR (x == 3) AND (x < 3) AND (x > 1) +2 +3 +R1, R2, R3 + additional_result_filter and PREWHERE: (x == 1) OR (x == 2) OR (x == 3) AND (x < 3) AND (x > 1) +2 +3 +R1, R2, R3 + additional_table_filters and WHERE: (x == 1) OR (x == 2) OR (x == 3) AND (x < 3) AND (x > 1) +2 +3 R1, R2, R3, R4: ((x == 1) OR (x == 2) OR (x == 3)) AND (x <= 2) 1 2 diff --git a/tests/queries/0_stateless/02131_row_policies_combination.sql b/tests/queries/0_stateless/02131_row_policies_combination.sql index b5be672bb1b..02f2365eed8 100644 --- a/tests/queries/0_stateless/02131_row_policies_combination.sql +++ b/tests/queries/0_stateless/02131_row_policies_combination.sql @@ -23,6 +23,24 @@ CREATE ROW POLICY 02131_filter_3 ON 02131_rptable USING x=3 AS permissive TO ALL SELECT 'R1, R2, R3: (x == 1) OR (x == 2) OR (x == 3)'; SELECT * FROM 02131_rptable; +SELECT 'R1, R2, R3 + additional_table_filters and PREWHERE: (x == 1) OR (x == 2) OR (x == 3) AND (x < 3) AND (x > 1)'; +SELECT * FROM 02131_rptable +PREWHERE x >= 2 +SETTINGS additional_table_filters = {'02131_rptable': 'x > 1'} +; + +SELECT 'R1, R2, R3 + additional_result_filter and PREWHERE: (x == 1) OR (x == 2) OR (x == 3) AND (x < 3) AND (x > 1)'; +SELECT * FROM 02131_rptable +PREWHERE x >= 2 +SETTINGS additional_result_filter = 'x > 1' +; + +SELECT 'R1, R2, R3 + additional_table_filters and WHERE: (x == 1) OR (x == 2) OR (x == 3) AND (x < 3) AND (x > 1)'; +SELECT * FROM 02131_rptable +WHERE x >= 2 +SETTINGS additional_table_filters = {'02131_rptable': 'x > 1'} +; + CREATE ROW POLICY 02131_filter_4 ON 02131_rptable USING x<=2 AS restrictive TO ALL; SELECT 'R1, R2, R3, R4: ((x == 1) OR (x == 2) OR (x == 3)) AND (x <= 2)'; SELECT * FROM 02131_rptable; 
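The three blocks added to 02131_row_policies_combination above all exercise one composition rule: permissive row policies are ORed together, while the PREWHERE/WHERE predicate and the additional_table_filters / additional_result_filter settings are ANDed on top of that disjunction. Below is a minimal, self-contained sketch of that rule; the CREATE TABLE, the INSERT, and the 02131_filter_1 / 02131_filter_2 definitions are assumed for illustration (only 02131_filter_3 and 02131_filter_4 appear in the hunk above), while the SELECT and SETTINGS forms are taken from the test itself.

-- Assumed setup: a one-column table holding x = 0..4 plus three permissive policies.
CREATE TABLE 02131_rptable (x UInt8) ENGINE = MergeTree ORDER BY x;
INSERT INTO 02131_rptable VALUES (0), (1), (2), (3), (4);
CREATE ROW POLICY 02131_filter_1 ON 02131_rptable USING x=1 AS permissive TO ALL;
CREATE ROW POLICY 02131_filter_2 ON 02131_rptable USING x=2 AS permissive TO ALL;
CREATE ROW POLICY 02131_filter_3 ON 02131_rptable USING x=3 AS permissive TO ALL;

-- Effective predicate in all three cases:
--   ((x = 1) OR (x = 2) OR (x = 3)) AND (x >= 2) AND (x > 1)  ->  rows 2 and 3
SELECT * FROM 02131_rptable PREWHERE x >= 2 SETTINGS additional_table_filters = {'02131_rptable': 'x > 1'};
SELECT * FROM 02131_rptable PREWHERE x >= 2 SETTINGS additional_result_filter = 'x > 1';
SELECT * FROM 02131_rptable WHERE x >= 2 SETTINGS additional_table_filters = {'02131_rptable': 'x > 1'};

The PREWHERE variants are the interesting ones: the row-policy filter must still be applied after the optimizer moves the user predicate into PREWHERE, and each statement is expected to return the rows 2 and 3 listed in the updated reference file.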
diff --git a/tests/queries/0_stateless/02184_nested_tuple.sql b/tests/queries/0_stateless/02184_nested_tuple.sql index 67a20e3dce1..09ed8eb7200 100644 --- a/tests/queries/0_stateless/02184_nested_tuple.sql +++ b/tests/queries/0_stateless/02184_nested_tuple.sql @@ -1,3 +1,4 @@ +SET allow_suspicious_low_cardinality_types=1; DROP TABLE IF EXISTS t_nested_tuple; CREATE TABLE t_nested_tuple diff --git a/tests/queries/0_stateless/02185_orc_corrupted_file.sh b/tests/queries/0_stateless/02185_orc_corrupted_file.sh index 1987f094faa..12510ae3836 100755 --- a/tests/queries/0_stateless/02185_orc_corrupted_file.sh +++ b/tests/queries/0_stateless/02185_orc_corrupted_file.sh @@ -8,4 +8,4 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') cp $CUR_DIR/data_orc/corrupted.orc $USER_FILES_PATH/ -${CLICKHOUSE_CLIENT} --query="select * from file('corrupted.orc')" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL' +${CLICKHOUSE_CLIENT} --query="select * from file('corrupted.orc')" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL' diff --git a/tests/queries/0_stateless/02235_remote_fs_cache_stress.sh b/tests/queries/0_stateless/02235_remote_fs_cache_stress.sh index bc1a4cbfdd1..0b6b9f461b0 100755 --- a/tests/queries/0_stateless/02235_remote_fs_cache_stress.sh +++ b/tests/queries/0_stateless/02235_remote_fs_cache_stress.sh @@ -6,7 +6,7 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . "$CUR_DIR"/../shell_config.sh -${CLICKHOUSE_CLIENT} --multiquery --multiline --query=""" +${CLICKHOUSE_CLIENT} --allow_suspicious_low_cardinality_types=1 --multiquery --multiline --query=""" DROP TABLE IF EXISTS t_01411; DROP TABLE IF EXISTS t_01411_num; diff --git a/tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh b/tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh index 954e2e83f27..8ff6e28b123 100755 --- a/tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh +++ b/tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh @@ -12,6 +12,6 @@ DATA_FILE=$USER_FILES_PATH/$FILE_NAME cp $CUR_DIR/data_parquet_bad_column/metadata_0.parquet $DATA_FILE -$CLICKHOUSE_CLIENT -q "desc file(test_02245.parquet)" 2>&1 | grep -qF "Cannot extract table structure" && echo "OK" || echo "FAIL" +$CLICKHOUSE_CLIENT -q "desc file(test_02245.parquet)" 2>&1 | grep -qF "CANNOT_EXTRACT_TABLE_STRUCTURE" && echo "OK" || echo "FAIL" $CLICKHOUSE_CLIENT -q "desc file(test_02245.parquet) settings input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference=1" $CLICKHOUSE_CLIENT -q "select count(*) from file(test_02245.parquet) settings input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference=1" diff --git a/tests/queries/0_stateless/02268_json_wrong_root_type_in_schema_inference.sql b/tests/queries/0_stateless/02268_json_wrong_root_type_in_schema_inference.sql index 5462d38f1a3..98bf29c32f5 100644 --- a/tests/queries/0_stateless/02268_json_wrong_root_type_in_schema_inference.sql +++ b/tests/queries/0_stateless/02268_json_wrong_root_type_in_schema_inference.sql @@ -1,7 +1,7 @@ -- Tags: no-fasttest insert into function file('02268_data.jsonl', 'TSV') select 1; -select * from file('02268_data.jsonl'); --{serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED} +select * from file('02268_data.jsonl'); --{serverError CANNOT_EXTRACT_TABLE_STRUCTURE} 
insert into function file('02268_data.jsonCompactEachRow', 'TSV') select 1; -select * from file('02268_data.jsonCompactEachRow'); --{serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED} +select * from file('02268_data.jsonCompactEachRow'); --{serverError CANNOT_EXTRACT_TABLE_STRUCTURE} diff --git a/tests/queries/0_stateless/02286_mysql_dump_input_format.sh b/tests/queries/0_stateless/02286_mysql_dump_input_format.sh index a3711497ae8..2f6167c3ddf 100755 --- a/tests/queries/0_stateless/02286_mysql_dump_input_format.sh +++ b/tests/queries/0_stateless/02286_mysql_dump_input_format.sh @@ -23,7 +23,7 @@ $CLICKHOUSE_CLIENT -q "desc file(dump1.sql, MySQLDump) settings input_format_mys $CLICKHOUSE_CLIENT -q "select * from file(dump1.sql, MySQLDump) settings input_format_mysql_dump_table_name='test', max_threads=1" $CLICKHOUSE_CLIENT -q "desc file(dump1.sql, MySQLDump) settings input_format_mysql_dump_table_name='test2'" $CLICKHOUSE_CLIENT -q "select * from file(dump1.sql, MySQLDump) settings input_format_mysql_dump_table_name='test2', max_threads=1" -$CLICKHOUSE_CLIENT -q "desc file(dump1.sql, MySQLDump) settings input_format_mysql_dump_table_name='test 3'" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL' +$CLICKHOUSE_CLIENT -q "desc file(dump1.sql, MySQLDump) settings input_format_mysql_dump_table_name='test 3'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL' $CLICKHOUSE_CLIENT -q "select * from file(dump1.sql, MySQLDump, 'x Nullable(Int32)') settings input_format_mysql_dump_table_name='test 3'" 2>&1 | grep -F -q 'EMPTY_DATA_PASSED' && echo 'OK' || echo 'FAIL' echo "dump2" diff --git a/tests/queries/0_stateless/02293_formats_json_columns.sh b/tests/queries/0_stateless/02293_formats_json_columns.sh index ce35c4bd878..4eae5a1abb4 100755 --- a/tests/queries/0_stateless/02293_formats_json_columns.sh +++ b/tests/queries/0_stateless/02293_formats_json_columns.sh @@ -88,4 +88,4 @@ echo ' } ' > $DATA_FILE -$CLICKHOUSE_CLIENT -q "desc file(data_02293, JSONColumns) settings input_format_max_rows_to_read_for_schema_inference=3, input_format_json_infer_incomplete_types_as_strings=0" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL' +$CLICKHOUSE_CLIENT -q "desc file(data_02293, JSONColumns) settings input_format_max_rows_to_read_for_schema_inference=3, input_format_json_infer_incomplete_types_as_strings=0" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL' diff --git a/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh index dfc0dedeaf1..650faf6985e 100755 --- a/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh +++ b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh @@ -15,11 +15,11 @@ mkdir -p $SCHEMADIR/$SERVER_SCHEMADIR cp -r $CLIENT_SCHEMADIR/02327_* $SCHEMADIR/$SERVER_SCHEMADIR/ -$CLICKHOUSE_CLIENT --query="desc file(data.pb) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT --query="desc file(data.capnp) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="desc file(data.pb) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'The table structure cannot be extracted' && echo 
'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="desc file(data.capnp) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'The table structure cannot be extracted' && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT --query="create table test_protobuf engine=File(Protobuf) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT --query="create table test_capnp engine=File(CapnProto) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="create table test_protobuf engine=File(Protobuf) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'The table structure cannot be extracted' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="create table test_capnp engine=File(CapnProto) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'The table structure cannot be extracted' && echo 'OK' || echo 'FAIL'; $CLICKHOUSE_CLIENT --query="desc file(data.pb) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty', input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference=1"; $CLICKHOUSE_CLIENT --query="desc file(data.capnp) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty', input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference=1"; diff --git a/tests/queries/0_stateless/02327_try_infer_integers_schema_inference.sql b/tests/queries/0_stateless/02327_try_infer_integers_schema_inference.sql index 0ceed178865..a4a69f4fa40 100644 --- a/tests/queries/0_stateless/02327_try_infer_integers_schema_inference.sql +++ b/tests/queries/0_stateless/02327_try_infer_integers_schema_inference.sql @@ -1,6 +1,7 @@ -- Tags: no-fasttest set input_format_try_infer_integers=1; +set input_format_try_infer_exponent_floats=1; select 'JSONEachRow'; desc format(JSONEachRow, '{"x" : 123}'); diff --git a/tests/queries/0_stateless/02416_json_object_inference.sql b/tests/queries/0_stateless/02416_json_object_inference.sql index 91137c0243c..3022ee026d0 100644 --- a/tests/queries/0_stateless/02416_json_object_inference.sql +++ b/tests/queries/0_stateless/02416_json_object_inference.sql @@ -2,5 +2,5 @@ set allow_experimental_object_type=1; desc format(JSONEachRow, '{"a" : {"b" : {"c" : 1, "d" : "str"}}}'); set allow_experimental_object_type=0, input_format_json_read_objects_as_strings=0, input_format_json_try_infer_named_tuples_from_objects=0, input_format_json_read_numbers_as_strings=0; -desc format(JSONEachRow, '{"a" : {"b" : {"c" : 1, "d" : "str"}}}'); -- {serverError 652} +desc format(JSONEachRow, '{"a" : {"b" : {"c" : 1, "d" : "str"}}}'); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} diff --git a/tests/queries/0_stateless/02424_pod_array_overflow.sql b/tests/queries/0_stateless/02424_pod_array_overflow.sql index 4b85d5be029..50c46cf19f1 100644 --- a/tests/queries/0_stateless/02424_pod_array_overflow.sql +++ b/tests/queries/0_stateless/02424_pod_array_overflow.sql @@ -1 +1 @@ -SELECT * FROM format(Native, 
'\x02\x02\x02\x6b\x30\x1a\x4d\x61\x70\x28\x46\x69\x78\x65\x64\x53\x74\x72\x69\x6e\x67\x28\x31\x29\x2c\x20\x49\x6e\x74\x36\x34\x29\x01\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x7f\x00\x7f\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x64\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xcf\x31\x3f\x56\x69\x11\x89\x25'); -- { serverError 128 } +SELECT * FROM format(Native, '\x02\x02\x02\x6b\x30\x1a\x4d\x61\x70\x28\x46\x69\x78\x65\x64\x53\x74\x72\x69\x6e\x67\x28\x31\x29\x2c\x20\x49\x6e\x74\x36\x34\x29\x01\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x7f\x00\x7f\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x64\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xcf\x31\x3f\x56\x69\x11\x89\x25'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } diff --git a/tests/queries/0_stateless/02426_pod_array_overflow_2.sql b/tests/queries/0_stateless/02426_pod_array_overflow_2.sql index 52a00730227..6a0d97acee3 100644 --- a/tests/queries/0_stateless/02426_pod_array_overflow_2.sql +++ b/tests/queries/0_stateless/02426_pod_array_overflow_2.sql @@ -1 +1 @@ -SELECT * FROM format(Native, 'k0\x23Array(Tuple(FixedString(1), Int64))\0\0\0\0\0\0\0�����\0����������������\0�\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0d\0\0\0\0\0\0\0\0\0\0\0\0\0�1?Vi�%'); -- { serverError 128 } +SELECT * FROM format(Native, 'k0\x23Array(Tuple(FixedString(1), Int64))\0\0\0\0\0\0\0�����\0����������������\0�\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0d\0\0\0\0\0\0\0\0\0\0\0\0\0�1?Vi�%'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } diff --git a/tests/queries/0_stateless/02426_pod_array_overflow_3.sql b/tests/queries/0_stateless/02426_pod_array_overflow_3.sql index 857ba2ca28e..caabf7d1679 100644 --- a/tests/queries/0_stateless/02426_pod_array_overflow_3.sql +++ b/tests/queries/0_stateless/02426_pod_array_overflow_3.sql @@ -1 +1 @@ -SELECT * FROM format(Native, '\x01\x01\x01x\x0CArray(UInt8)\x01\x00\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF'); -- { serverError 128 } +SELECT * FROM format(Native, '\x01\x01\x01x\x0CArray(UInt8)\x01\x00\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } diff --git a/tests/queries/0_stateless/02428_parameterized_view.reference b/tests/queries/0_stateless/02428_parameterized_view.reference index 422fdaa4983..fd77e6ed8df 100644 --- a/tests/queries/0_stateless/02428_parameterized_view.reference +++ b/tests/queries/0_stateless/02428_parameterized_view.reference @@ -23,6 +23,7 @@ ERROR 20 20 ERROR +20 30 20 30 diff --git a/tests/queries/0_stateless/02428_parameterized_view.sh b/tests/queries/0_stateless/02428_parameterized_view.sh index ad9c672f4c5..c6f0927db36 100755 --- a/tests/queries/0_stateless/02428_parameterized_view.sh +++ b/tests/queries/0_stateless/02428_parameterized_view.sh @@ -37,7 +37,7 @@ $CLICKHOUSE_CLIENT -q "CREATE VIEW test_02428_pv1 AS SELECT * FROM test_02428_Ca $CLICKHOUSE_CLIENT -q "SELECT Price FROM test_02428_pv1(price=20)" $CLICKHOUSE_CLIENT -q "SELECT Price FROM \`test_02428_pv1\`(price=20)" -$CLICKHOUSE_CLIENT -q "SELECT Price FROM test_02428_pv1" 2>&1 | grep -Fq "UNKNOWN_QUERY_PARAMETER" && echo 'ERROR' || echo 'OK' +$CLICKHOUSE_CLIENT -q "SELECT Price FROM test_02428_pv1" 2>&1 | grep -q "UNKNOWN_QUERY_PARAMETER\|UNKNOWN_IDENTIFIER" && echo 'ERROR' || echo 'OK' $CLICKHOUSE_CLIENT --param_p 10 -q "SELECT Price FROM test_02428_pv1(price={p:UInt64})" $CLICKHOUSE_CLIENT --param_l 1 -q "SELECT Price FROM 
test_02428_pv1(price=50) LIMIT ({l:UInt64})" @@ -72,7 +72,8 @@ $CLICKHOUSE_CLIENT -q "INSERT INTO ${CLICKHOUSE_TEST_UNIQUE_NAME}.Catalog VALUES $CLICKHOUSE_CLIENT -q "INSERT INTO ${CLICKHOUSE_TEST_UNIQUE_NAME}.Catalog VALUES ('Paper', 20, 1)" $CLICKHOUSE_CLIENT -q "CREATE VIEW ${CLICKHOUSE_TEST_UNIQUE_NAME}.pv1 AS SELECT * FROM ${CLICKHOUSE_TEST_UNIQUE_NAME}.Catalog WHERE Price={price:UInt64}" $CLICKHOUSE_CLIENT -q "SELECT Price FROM ${CLICKHOUSE_TEST_UNIQUE_NAME}.pv1(price=20)" -$CLICKHOUSE_CLIENT -q "SELECT Price FROM \`${CLICKHOUSE_TEST_UNIQUE_NAME}.pv1\`(price=20)" 2>&1 | grep -Fq "UNKNOWN_FUNCTION" && echo 'ERROR' || echo 'OK' +$CLICKHOUSE_CLIENT -q "SELECT Price FROM \`${CLICKHOUSE_TEST_UNIQUE_NAME}.pv1\`(price=20) SETTINGS allow_experimental_analyzer = 0" 2>&1 | grep -Fq "UNKNOWN_FUNCTION" && echo 'ERROR' || echo 'OK' +$CLICKHOUSE_CLIENT -q "SELECT Price FROM \`${CLICKHOUSE_TEST_UNIQUE_NAME}.pv1\`(price=20) SETTINGS allow_experimental_analyzer = 1" $CLICKHOUSE_CLIENT -q "INSERT INTO test_02428_Catalog VALUES ('Book2', 30, 8)" diff --git a/tests/queries/0_stateless/02450_kill_distributed_query_deadlock.sh b/tests/queries/0_stateless/02450_kill_distributed_query_deadlock.sh index 03c43843d3a..0cd520d8d5d 100755 --- a/tests/queries/0_stateless/02450_kill_distributed_query_deadlock.sh +++ b/tests/queries/0_stateless/02450_kill_distributed_query_deadlock.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: long +# Tags: long, no-random-settings, no-debug CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/02455_duplicate_column_names_in_schema_inference.sql b/tests/queries/0_stateless/02455_duplicate_column_names_in_schema_inference.sql index 626a4d7034e..f67e5496a98 100644 --- a/tests/queries/0_stateless/02455_duplicate_column_names_in_schema_inference.sql +++ b/tests/queries/0_stateless/02455_duplicate_column_names_in_schema_inference.sql @@ -1,7 +1,7 @@ -- Tags: no-fasttest -desc format(JSONEachRow, '{"x" : 1, "x" : 2}'); -- {serverError INCORRECT_DATA} -desc format(JSONEachRow, '{"x" : 1, "y" : 2}\n{"x" : 2, "x" : 3}'); -- {serverError INCORRECT_DATA} -desc format(CSVWithNames, 'a,b,a\n1,2,3'); -- {serverError INCORRECT_DATA} -desc format(CSV, '1,2,3') settings column_names_for_schema_inference='a, b, a'; -- {serverError INCORRECT_DATA} +desc format(JSONEachRow, '{"x" : 1, "x" : 2}'); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +desc format(JSONEachRow, '{"x" : 1, "y" : 2}\n{"x" : 2, "x" : 3}'); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +desc format(CSVWithNames, 'a,b,a\n1,2,3'); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +desc format(CSV, '1,2,3') settings column_names_for_schema_inference='a, b, a'; -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} diff --git a/tests/queries/0_stateless/02458_use_structure_from_insertion_table.sql b/tests/queries/0_stateless/02458_use_structure_from_insertion_table.sql index ac549a7faf1..71a2381d7b6 100644 --- a/tests/queries/0_stateless/02458_use_structure_from_insertion_table.sql +++ b/tests/queries/0_stateless/02458_use_structure_from_insertion_table.sql @@ -10,14 +10,14 @@ set input_format_json_infer_incomplete_types_as_strings=0; insert into test select * from file(02458_data.jsonl); insert into test select x, 1 from file(02458_data.jsonl); insert into test select x, y from file(02458_data.jsonl); -insert into test select x + 1, y from file(02458_data.jsonl); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} +insert into test select x + 1, y from 
file(02458_data.jsonl); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} insert into test select x, z from file(02458_data.jsonl); insert into test select * from file(02458_data.jsoncompacteachrow); -insert into test select x, 1 from file(02458_data.jsoncompacteachrow); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} -insert into test select x, y from file(02458_data.jsoncompacteachrow); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} -insert into test select x + 1, y from file(02458_data.jsoncompacteachrow); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} -insert into test select x, z from file(02458_data.jsoncompacteachrow); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} +insert into test select x, 1 from file(02458_data.jsoncompacteachrow); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +insert into test select x, y from file(02458_data.jsoncompacteachrow); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +insert into test select x + 1, y from file(02458_data.jsoncompacteachrow); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +insert into test select x, z from file(02458_data.jsoncompacteachrow); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} insert into test select * from input() format CSV 1,2 insert into test select x, y from input() format CSV 1,2 -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} diff --git a/tests/queries/0_stateless/02497_schema_inference_nulls.sql b/tests/queries/0_stateless/02497_schema_inference_nulls.sql index a25060e8182..b78b5709dbb 100644 --- a/tests/queries/0_stateless/02497_schema_inference_nulls.sql +++ b/tests/queries/0_stateless/02497_schema_inference_nulls.sql @@ -4,7 +4,7 @@ set input_format_json_try_infer_named_tuples_from_objects=0; set input_format_json_read_objects_as_strings=0; set input_format_json_infer_incomplete_types_as_strings=0; set input_format_json_read_numbers_as_strings=0; -desc format(JSONEachRow, '{"x" : 1234}, {"x" : "String"}') settings input_format_json_try_infer_numbers_from_strings=1; -- { serverError TYPE_MISMATCH } +desc format(JSONEachRow, '{"x" : 1234}, {"x" : "String"}') settings input_format_json_try_infer_numbers_from_strings=1; -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } desc format(JSONEachRow, '{"x" : [null, 1]}'); desc format(JSONEachRow, '{"x" : [null, 1]}, {"x" : []}'); desc format(JSONEachRow, '{"x" : [null, 1]}, {"x" : [null]}'); @@ -26,7 +26,7 @@ desc format(JSONEachRow, '{"x" : [1, 2]}, {"x" : [null]}'); select 'JSONCompactEachRow'; set schema_inference_make_columns_nullable=1; -desc format(JSONCompactEachRow, '[1234], ["String"]') settings input_format_json_try_infer_numbers_from_strings=1; -- { serverError TYPE_MISMATCH } +desc format(JSONCompactEachRow, '[1234], ["String"]') settings input_format_json_try_infer_numbers_from_strings=1; -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } desc format(JSONCompactEachRow, '[[null, 1]]'); desc format(JSONCompactEachRow, '[[null, 1]], [[]]'); desc format(JSONCompactEachRow, '[[null, 1]], [[null]]'); diff --git a/tests/queries/0_stateless/02500_numbers_inference.sh b/tests/queries/0_stateless/02500_numbers_inference.sh index ce9cd5bdc9f..5d863bd616f 100755 --- a/tests/queries/0_stateless/02500_numbers_inference.sh +++ b/tests/queries/0_stateless/02500_numbers_inference.sh @@ -8,10 +8,10 @@ $CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : 1.2}')"; echo '{"x" : 1.2}' | $CLICKHOUSE_LOCAL --input-format='JSONEachRow' --table='test' -q "desc test"; $CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : 1}')"; echo '{"x" : 1}' | $CLICKHOUSE_LOCAL 
--input-format='JSONEachRow' --table='test' -q "desc test"; -$CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : 1e10}')"; -echo '{"x" : 1e10}' | $CLICKHOUSE_LOCAL --input-format='JSONEachRow' --table='test' -q "desc test"; -$CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : [1, 42.42, 1, 1e10]}')"; -echo '{"x" : [1, 42.42, 1, 1e10]}' | $CLICKHOUSE_LOCAL --input-format='JSONEachRow' --table='test' -q "desc test"; +$CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : 1e10}')" --input_format_try_infer_exponent_floats=1; +echo '{"x" : 1e10}' | $CLICKHOUSE_LOCAL --input-format='JSONEachRow' --table='test' -q "desc test" --input_format_try_infer_exponent_floats=1; +$CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : [1, 42.42, 1, 1e10]}')" --input_format_try_infer_exponent_floats=1; +echo '{"x" : [1, 42.42, 1, 1e10]}' | $CLICKHOUSE_LOCAL --input-format='JSONEachRow' --table='test' -q "desc test" --input_format_try_infer_exponent_floats=1; $CLICKHOUSE_LOCAL -q "desc format(JSONEachRow, '{\"x\" : [1, 42.42, false]}')"; echo '{"x" : [1, 42.42, false]}' | $CLICKHOUSE_LOCAL --input-format='JSONEachRow' --table='test' -q "desc test"; @@ -19,10 +19,10 @@ $CLICKHOUSE_LOCAL -q "desc format(TSV, '1.2')"; echo '1.2' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test"; $CLICKHOUSE_LOCAL -q "desc format(TSV, '1')"; echo '1' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test"; -$CLICKHOUSE_LOCAL -q "desc format(TSV, '1e10')"; -echo '1e10' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test"; -$CLICKHOUSE_LOCAL -q "desc format(TSV, '[1, 42.42, 1, 1e10]')"; -echo '[1, 42.42, 1, 1e10]' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test"; +$CLICKHOUSE_LOCAL -q "desc format(TSV, '1e10')" --input_format_try_infer_exponent_floats=1; +echo '1e10' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test" --input_format_try_infer_exponent_floats=1; +$CLICKHOUSE_LOCAL -q "desc format(TSV, '[1, 42.42, 1, 1e10]')" --input_format_try_infer_exponent_floats=1; +echo '[1, 42.42, 1, 1e10]' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test" --input_format_try_infer_exponent_floats=1; $CLICKHOUSE_LOCAL -q "desc format(TSV, '[1, 42.42, false]')"; echo '[1, 42.42, false]' | $CLICKHOUSE_LOCAL --input-format='TSV' --table='test' -q "desc test"; diff --git a/tests/queries/0_stateless/02502_bad_values_schema_inference.sql b/tests/queries/0_stateless/02502_bad_values_schema_inference.sql index 4c796842c0d..67ac09832de 100644 --- a/tests/queries/0_stateless/02502_bad_values_schema_inference.sql +++ b/tests/queries/0_stateless/02502_bad_values_schema_inference.sql @@ -1,2 +1,2 @@ -desc format(Values, '(\'abc)'); -- { serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED } +desc format(Values, '(\'abc)'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } diff --git a/tests/queries/0_stateless/02567_and_consistency.sql b/tests/queries/0_stateless/02567_and_consistency.sql index 0eeab99e539..b1fa526e33f 100644 --- a/tests/queries/0_stateless/02567_and_consistency.sql +++ b/tests/queries/0_stateless/02567_and_consistency.sql @@ -5,7 +5,7 @@ FROM ) GROUP BY number HAVING 1 AND sin(sum(number)) -ORDER BY ALL +ORDER BY * SETTINGS enable_optimize_predicate_expression = 0; SELECT '====='; @@ -17,7 +17,7 @@ FROM ) GROUP BY number HAVING 1 AND sin(1) -ORDER BY ALL +ORDER BY * SETTINGS enable_optimize_predicate_expression = 0; SELECT '====='; @@ -29,7 +29,7 @@ FROM ) GROUP BY number HAVING x AND sin(sum(number)) -ORDER BY ALL 
+ORDER BY * SETTINGS enable_optimize_predicate_expression = 1; SELECT '====='; @@ -41,7 +41,7 @@ FROM ) GROUP BY number HAVING 1 AND sin(sum(number)) -ORDER BY ALL +ORDER BY * SETTINGS enable_optimize_predicate_expression = 0; SELECT '====='; @@ -61,7 +61,7 @@ FROM ) GROUP BY number HAVING 1 AND sin(sum(number)) -ORDER BY ALL +ORDER BY * SETTINGS enable_optimize_predicate_expression = 1; select '#45440'; diff --git a/tests/queries/0_stateless/02724_database_s3.sh b/tests/queries/0_stateless/02724_database_s3.sh index 13b627c0342..80b47282146 100755 --- a/tests/queries/0_stateless/02724_database_s3.sh +++ b/tests/queries/0_stateless/02724_database_s3.sh @@ -46,7 +46,7 @@ DROP DATABASE IF EXISTS test3; CREATE DATABASE test3 ENGINE = S3; USE test3; SELECT * FROM \"http://localhost:11111/test/a.myext\" -""" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: +""" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "S3_ERROR" > /dev/null && echo "OK" || echo 'FAIL' ||: ${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ USE test3; diff --git a/tests/queries/0_stateless/02725_database_hdfs.sh b/tests/queries/0_stateless/02725_database_hdfs.sh index b4e081f6de0..d62f928e947 100755 --- a/tests/queries/0_stateless/02725_database_hdfs.sh +++ b/tests/queries/0_stateless/02725_database_hdfs.sh @@ -58,7 +58,7 @@ SELECT * FROM \"abacaba/file.tsv\" """ 2>&1 | tr '\n' ' ' | grep -oF "CANNOT_EXTRACT_TABLE_STRUCTURE" ${CLICKHOUSE_CLIENT} -q "SELECT * FROM test_hdfs_4.\`http://localhost:11111/test/a.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: -${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/file.myext\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: +${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/file.myext\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK" || echo 'FAIL' ||: ${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/test_02725_3.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK" || echo 'FAIL' ||: ${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: diff --git a/tests/queries/0_stateless/02732_transform_fuzz.sql b/tests/queries/0_stateless/02732_transform_fuzz.sql index c2918d4da81..872cf3a6599 100644 --- a/tests/queries/0_stateless/02732_transform_fuzz.sql +++ b/tests/queries/0_stateless/02732_transform_fuzz.sql @@ -1 +1 @@ -SELECT caseWithExpr(arrayReduce(NULL, []), []); -- { serverError BAD_ARGUMENTS } +SELECT caseWithExpr(arrayReduce(NULL, []), []); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } diff --git a/tests/queries/0_stateless/02735_parquet_encoder.sql b/tests/queries/0_stateless/02735_parquet_encoder.sql index 19125abf8da..fe45a2a317d 100644 --- a/tests/queries/0_stateless/02735_parquet_encoder.sql +++ b/tests/queries/0_stateless/02735_parquet_encoder.sql @@ -6,6 +6,7 @@ set output_format_parquet_data_page_size = 800; set output_format_parquet_batch_size = 100; set output_format_parquet_row_group_size_bytes = 1000000000; set engine_file_truncate_on_insert=1; +set allow_suspicious_low_cardinality_types=1; -- Write random data to 
parquet file, then read from it and check that it matches what we wrote. -- Do this for all kinds of data types: primitive, Nullable(primitive), Array(primitive), diff --git a/tests/queries/0_stateless/02763_row_policy_storage_merge_alias.sql.j2 b/tests/queries/0_stateless/02763_row_policy_storage_merge_alias.sql.j2 index bdd456951dd..99ac89c4eb4 100644 --- a/tests/queries/0_stateless/02763_row_policy_storage_merge_alias.sql.j2 +++ b/tests/queries/0_stateless/02763_row_policy_storage_merge_alias.sql.j2 @@ -12,8 +12,6 @@ CREATE TABLE 02763_a_merge (x UInt8, y UInt64, z UInt64) ENGINE = Merge(currentD {% for prew in [0 , 1] -%} - - SELECT 'x, y, z FROM 02763_a_merge'; SELECT x, y, z FROM 02763_a_merge ORDER BY x SETTINGS optimize_move_to_prewhere= {{prew}}; SELECT '* FROM 02763_a_merge'; diff --git a/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql b/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql index b4165e8e80a..ef0381df1a6 100644 --- a/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql +++ b/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql @@ -1,5 +1,5 @@ set input_format_max_rows_to_read_for_schema_inference=2; set input_format_json_infer_incomplete_types_as_strings=0; -desc format('JSONEachRow', '{"a" : null}, {"a" : 42}') settings input_format_max_bytes_to_read_for_schema_inference=10; -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} +desc format('JSONEachRow', '{"a" : null}, {"a" : 42}') settings input_format_max_bytes_to_read_for_schema_inference=10; -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} desc format('JSONEachRow', '{"a" : null}, {"a" : 42}') settings input_format_max_bytes_to_read_for_schema_inference=20; diff --git a/tests/queries/0_stateless/02797_join_nested_lowcardinality_convert.sql.j2 b/tests/queries/0_stateless/02797_join_nested_lowcardinality_convert.sql.j2 index 79a7c654f10..95bac76c591 100644 --- a/tests/queries/0_stateless/02797_join_nested_lowcardinality_convert.sql.j2 +++ b/tests/queries/0_stateless/02797_join_nested_lowcardinality_convert.sql.j2 @@ -1,4 +1,4 @@ - +SET allow_suspicious_low_cardinality_types=1; DROP TABLE IF EXISTS test1__fuzz_36; DROP TABLE IF EXISTS test1__fuzz_38; diff --git a/tests/queries/0_stateless/02832_alter_max_sessions_for_user.sh b/tests/queries/0_stateless/02832_alter_max_sessions_for_user.sh index 546c54a4de9..a3b0d17f1be 100755 --- a/tests/queries/0_stateless/02832_alter_max_sessions_for_user.sh +++ b/tests/queries/0_stateless/02832_alter_max_sessions_for_user.sh @@ -23,14 +23,14 @@ function test_alter_profile() ${CLICKHOUSE_CLIENT} -q $"ALTER SETTINGS PROFILE ${PROFILE} SETTINGS max_sessions_for_user = ${max_session_count}" - # Create sesssions with $max_session_count resriction + # Create sessions with $max_session_count restriction for ((i = 1 ; i <= ${max_session_count} ; i++)); do local session_id="${SESSION_ID_PREFIX}_${i}" # Skip output from this query ${CLICKHOUSE_CURL} -sS -X POST "${CLICKHOUSE_URL}&user=${USER}&session_id=${session_id}&session_check=0" --data-binary "SELECT 1" > /dev/null done - # Update resriction to $alter_sessions_count + # Update restriction to $alter_sessions_count ${CLICKHOUSE_CLIENT} -q $"ALTER SETTINGS PROFILE ${PROFILE} SETTINGS max_sessions_for_user = ${alter_sessions_count}" # Simultaneous sessions should use max settings from profile ($alter_sessions_count) diff --git a/tests/queries/0_stateless/02884_string_distance_function.sql b/tests/queries/0_stateless/02884_string_distance_function.sql 
index fddbf41f0e5..95604c6f401 100644 --- a/tests/queries/0_stateless/02884_string_distance_function.sql +++ b/tests/queries/0_stateless/02884_string_distance_function.sql @@ -29,13 +29,13 @@ CREATE TABLE t INSERT INTO t VALUES ('', '') ('abc', '') ('', 'abc') ('abc', 'abc') ('abc', 'ab') ('abc', 'bc') ('clickhouse', 'mouse'); SELECT '-- non-const arguments'; -SELECT 'byteHammingDistance', s1, s2, byteHammingDistance(s1, s2) FROM t ORDER BY ALL; -SELECT 'editDistance', s1, s2, editDistance(s1, s2) FROM t ORDER BY ALL; -SELECT 'damerauLevenshteinDistance', s1, s2, damerauLevenshteinDistance(s1, s2) FROM t ORDER BY ALL; -SELECT 'stringJaccardIndex', s1, s2, stringJaccardIndex(s1, s2) FROM t ORDER BY ALL; -SELECT 'stringJaccardIndexUTF8', s1, s2, stringJaccardIndexUTF8(s1, s2) FROM t ORDER BY ALL; -SELECT 'jaroSimilarity', s1, s2, jaroSimilarity(s1, s2) FROM t ORDER BY ALL; -SELECT 'jaroWinklerSimilarity', s1, s2, jaroWinklerSimilarity(s1, s2) FROM t ORDER BY ALL; +SELECT 'byteHammingDistance', s1, s2, byteHammingDistance(s1, s2) FROM t ORDER BY *; +SELECT 'editDistance', s1, s2, editDistance(s1, s2) FROM t ORDER BY *; +SELECT 'damerauLevenshteinDistance', s1, s2, damerauLevenshteinDistance(s1, s2) FROM t ORDER BY *; +SELECT 'stringJaccardIndex', s1, s2, stringJaccardIndex(s1, s2) FROM t ORDER BY *; +SELECT 'stringJaccardIndexUTF8', s1, s2, stringJaccardIndexUTF8(s1, s2) FROM t ORDER BY *; +SELECT 'jaroSimilarity', s1, s2, jaroSimilarity(s1, s2) FROM t ORDER BY *; +SELECT 'jaroWinklerSimilarity', s1, s2, jaroWinklerSimilarity(s1, s2) FROM t ORDER BY *; SELECT '-- Special UTF-8 tests'; -- We do not perform full UTF8 validation, so sometimes it just returns some result diff --git a/tests/queries/0_stateless/02900_union_schema_inference_mode.sh b/tests/queries/0_stateless/02900_union_schema_inference_mode.sh index dc0dd8ae1f4..a0fdb5276e0 100755 --- a/tests/queries/0_stateless/02900_union_schema_inference_mode.sh +++ b/tests/queries/0_stateless/02900_union_schema_inference_mode.sh @@ -39,13 +39,13 @@ desc file('$CLICKHOUSE_TEST_UNIQUE_NAME/archive.tar :: data{1,2,3}.jsonl'); " echo 'Error' > $CLICKHOUSE_TEST_UNIQUE_NAME/data4.jsonl -$CLICKHOUSE_LOCAL -q "desc file('$CLICKHOUSE_TEST_UNIQUE_NAME/data{1,2,3,4}.jsonl') settings schema_inference_mode='union'" 2>&1 | grep -c -F "Cannot extract table structure" +$CLICKHOUSE_LOCAL -q "desc file('$CLICKHOUSE_TEST_UNIQUE_NAME/data{1,2,3,4}.jsonl') settings schema_inference_mode='union'" 2>&1 | grep -c -F "CANNOT_EXTRACT_TABLE_STRUCTURE" $CLICKHOUSE_LOCAL -nm -q " set schema_inference_mode = 'union'; desc file('$CLICKHOUSE_TEST_UNIQUE_NAME/data{2,3}.jsonl'); desc file('$CLICKHOUSE_TEST_UNIQUE_NAME/data{1,2,3,4}.jsonl'); -" 2>&1 | grep -c -F "Cannot extract table structure" +" 2>&1 | grep -c -F "CANNOT_EXTRACT_TABLE_STRUCTURE" echo 42 > $CLICKHOUSE_TEST_UNIQUE_NAME/data1.csv echo 42, 43 > $CLICKHOUSE_TEST_UNIQUE_NAME/data2.csv diff --git a/tests/queries/0_stateless/02931_max_num_to_warn.reference b/tests/queries/0_stateless/02931_max_num_to_warn.reference index c0ad7354039..7de998eebfa 100644 --- a/tests/queries/0_stateless/02931_max_num_to_warn.reference +++ b/tests/queries/0_stateless/02931_max_num_to_warn.reference @@ -1,3 +1,3 @@ -The number of attached tables is more than 10 -The number of attached databases is more than 10 +The number of attached tables is more than 5 +The number of attached databases is more than 2 The number of active parts is more than 10 diff --git a/tests/queries/0_stateless/02931_max_num_to_warn.sql 
b/tests/queries/0_stateless/02931_max_num_to_warn.sql index 49b981fc355..23f04816d5a 100644 --- a/tests/queries/0_stateless/02931_max_num_to_warn.sql +++ b/tests/queries/0_stateless/02931_max_num_to_warn.sql @@ -37,7 +37,7 @@ INSERT INTO test_max_num_to_warn_02931.test_max_num_to_warn_9 VALUES (1, 'Hello' INSERT INTO test_max_num_to_warn_02931.test_max_num_to_warn_10 VALUES (1, 'Hello'); INSERT INTO test_max_num_to_warn_02931.test_max_num_to_warn_11 VALUES (1, 'Hello'); -SELECT * FROM system.warnings where message in ('The number of attached tables is more than 10', 'The number of attached databases is more than 10', 'The number of active parts is more than 10'); +SELECT * FROM system.warnings where message in ('The number of attached tables is more than 5', 'The number of attached databases is more than 2', 'The number of active parts is more than 10'); DROP DATABASE IF EXISTS test_max_num_to_warn_02931; DROP DATABASE IF EXISTS test_max_num_to_warn_1; diff --git a/tests/queries/0_stateless/02933_compare_with_bool_as_string.reference b/tests/queries/0_stateless/02933_compare_with_bool_as_string.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02933_compare_with_bool_as_string.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02933_compare_with_bool_as_string.sql b/tests/queries/0_stateless/02933_compare_with_bool_as_string.sql new file mode 100644 index 00000000000..5dbacd5fbbf --- /dev/null +++ b/tests/queries/0_stateless/02933_compare_with_bool_as_string.sql @@ -0,0 +1 @@ +select true = 'true'; diff --git a/tests/queries/0_stateless/02943_order_by_all.reference b/tests/queries/0_stateless/02943_order_by_all.reference index 6eed33cc68d..ef399fe8e2d 100644 --- a/tests/queries/0_stateless/02943_order_by_all.reference +++ b/tests/queries/0_stateless/02943_order_by_all.reference @@ -49,45 +49,9 @@ A 2 2 A 3 B \N C --- what happens if some column "all" already exists? 
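[Editorial note, not part of the patch: the tests around this hunk replace the ORDER BY ALL keyword with ORDER BY *, which sorts by every column of the SELECT list; unlike ALL, the star cannot collide with a real column named "all", and per the new reference output it must appear stand-alone in the ORDER BY clause. A minimal hedged sketch of the behavior exercised by 02943_order_by_all.sql follows; the table name t is illustrative and not taken from the patch.]
-- Sketch only, assuming the semantics shown in 02943_order_by_all.sql/.reference:
SELECT a, b FROM t ORDER BY *;      -- equivalent to ORDER BY a, b (all SELECT columns)
SELECT a, b FROM t ORDER BY *, a;   -- rejected with a server error: '*' must appear stand-alone in ORDER BY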
-B 3 10 -D 1 20 -A 2 30 -C \N 40 -B 3 10 -D 1 20 -A 2 30 -C \N 40 -D 1 +-- Special case: all columns in SELECT clause, ORDER BY * A 2 B 3 C \N D 1 -A 2 -B 3 -C \N -A 2 -B 3 -D 1 -\N -A 2 -B 3 -D 1 -\N -B 3 10 -D 1 20 -A 2 30 -C \N 40 -B 3 10 -D 1 20 -A 2 30 -C \N 40 --- test SELECT * ORDER BY ALL with no "all" column in the SELECT clause -A 2 30 -B 3 10 -C \N 40 -D 1 20 -A 2 30 -B 3 10 -C \N 40 -D 1 20 +-- "*" must appear stand-alone in ORDER BY diff --git a/tests/queries/0_stateless/02943_order_by_all.sql b/tests/queries/0_stateless/02943_order_by_all.sql index 0960d75ad96..2fe628e9b95 100644 --- a/tests/queries/0_stateless/02943_order_by_all.sql +++ b/tests/queries/0_stateless/02943_order_by_all.sql @@ -1,4 +1,4 @@ --- Tests that sort expression ORDER BY ALL +-- Tests that sort expression ORDER BY * DROP TABLE IF EXISTS order_by_all; @@ -6,104 +6,48 @@ CREATE TABLE order_by_all ( a String, b Nullable(Int32), - all UInt64, ) ENGINE = Memory; -INSERT INTO order_by_all VALUES ('B', 3, 10), ('C', NULL, 40), ('D', 1, 20), ('A', 2, 30); +INSERT INTO order_by_all VALUES ('B', 3), ('C', NULL), ('D', 1), ('A', 2); SELECT '-- no modifiers'; SET allow_experimental_analyzer = 0; -SELECT a, b FROM order_by_all ORDER BY ALL; -SELECT b, a FROM order_by_all ORDER BY ALL; +SELECT a, b FROM order_by_all ORDER BY *; +SELECT b, a FROM order_by_all ORDER BY *; SET allow_experimental_analyzer = 1; -SELECT a, b FROM order_by_all ORDER BY ALL; -SELECT b, a FROM order_by_all ORDER BY ALL; +SELECT a, b FROM order_by_all ORDER BY *; +SELECT b, a FROM order_by_all ORDER BY *; SELECT '-- with ASC/DESC modifiers'; SET allow_experimental_analyzer = 0; -SELECT a, b FROM order_by_all ORDER BY ALL ASC; -SELECT a, b FROM order_by_all ORDER BY ALL DESC; +SELECT a, b FROM order_by_all ORDER BY * ASC; +SELECT a, b FROM order_by_all ORDER BY * DESC; SET allow_experimental_analyzer = 1; -SELECT a, b FROM order_by_all ORDER BY ALL ASC; -SELECT a, b FROM order_by_all ORDER BY ALL DESC; +SELECT a, b FROM order_by_all ORDER BY * ASC; +SELECT a, b FROM order_by_all ORDER BY * DESC; SELECT '-- with NULLS FIRST/LAST modifiers'; SET allow_experimental_analyzer = 0; -SELECT b, a FROM order_by_all ORDER BY ALL NULLS FIRST; -SELECT b, a FROM order_by_all ORDER BY ALL NULLS LAST; +SELECT b, a FROM order_by_all ORDER BY * NULLS FIRST; +SELECT b, a FROM order_by_all ORDER BY * NULLS LAST; SET allow_experimental_analyzer = 1; -SELECT b, a FROM order_by_all ORDER BY ALL NULLS FIRST; -SELECT b, a FROM order_by_all ORDER BY ALL NULLS LAST; +SELECT b, a FROM order_by_all ORDER BY * NULLS FIRST; +SELECT b, a FROM order_by_all ORDER BY * NULLS LAST; -SELECT '-- what happens if some column "all" already exists?'; +SELECT '-- Special case: all columns in SELECT clause, ORDER BY *'; +SELECT * FROM order_by_all ORDER BY * NULLS LAST; --- columns +SELECT '-- "*" must appear stand-alone in ORDER BY'; SET allow_experimental_analyzer = 0; -SELECT a, b, all FROM order_by_all ORDER BY all; -- { serverError UNEXPECTED_EXPRESSION } -SELECT a, b, all FROM order_by_all ORDER BY ALL; -- { serverError UNEXPECTED_EXPRESSION } -SELECT a, b, all FROM order_by_all ORDER BY all SETTINGS enable_order_by_all = false; +SELECT a, b FROM order_by_all ORDER BY *, a; -- { serverError UNKNOWN_IDENTIFIER } SET allow_experimental_analyzer = 1; -SELECT a, b, all FROM order_by_all ORDER BY all; -- { serverError UNEXPECTED_EXPRESSION } -SELECT a, b, all FROM order_by_all ORDER BY ALL; -- { serverError UNEXPECTED_EXPRESSION } -SELECT a, b, all FROM order_by_all ORDER BY all 
SETTINGS enable_order_by_all = false; - --- column aliases - -SET allow_experimental_analyzer = 0; -SELECT a, b AS all FROM order_by_all ORDER BY all; -- { serverError UNEXPECTED_EXPRESSION } -SELECT a, b AS all FROM order_by_all ORDER BY ALL; -- { serverError UNEXPECTED_EXPRESSION } -SELECT a, b AS all FROM order_by_all ORDER BY all SETTINGS enable_order_by_all = false; - -SET allow_experimental_analyzer = 1; -SELECT a, b AS all FROM order_by_all ORDER BY all; -- { serverError UNEXPECTED_EXPRESSION } -SELECT a, b AS all FROM order_by_all ORDER BY ALL; -- { serverError UNEXPECTED_EXPRESSION } -SELECT a, b AS all FROM order_by_all ORDER BY all SETTINGS enable_order_by_all = false; - --- expressions - -SET allow_experimental_analyzer = 0; -SELECT format('{} {}', a, b) AS all FROM order_by_all ORDER BY all; -- { serverError UNEXPECTED_EXPRESSION } -SELECT format('{} {}', a, b) AS all FROM order_by_all ORDER BY ALL; -- { serverError UNEXPECTED_EXPRESSION } -SELECT format('{} {}', a, b) AS all FROM order_by_all ORDER BY all SETTINGS enable_order_by_all = false; - -SET allow_experimental_analyzer = 1; -SELECT format('{} {}', a, b) AS all FROM order_by_all ORDER BY all; -- { serverError UNEXPECTED_EXPRESSION } -SELECT format('{} {}', a, b) AS all FROM order_by_all ORDER BY ALL; -- { serverError UNEXPECTED_EXPRESSION } -SELECT format('{} {}', a, b) AS all FROM order_by_all ORDER BY all SETTINGS enable_order_by_all = false; - -SET allow_experimental_analyzer = 0; -SELECT a, b, all FROM order_by_all ORDER BY all, a; - -SET allow_experimental_analyzer = 1; -SELECT a, b, all FROM order_by_all ORDER BY all, a; - -DROP TABLE order_by_all; - -SELECT '-- test SELECT * ORDER BY ALL with no "all" column in the SELECT clause'; - -CREATE TABLE order_by_all -( - a String, - b Nullable(Int32), - c UInt64, -) - ENGINE = Memory; - -INSERT INTO order_by_all VALUES ('B', 3, 10), ('C', NULL, 40), ('D', 1, 20), ('A', 2, 30); - -SET allow_experimental_analyzer = 0; -SELECT * FROM order_by_all ORDER BY ALL; - -SET allow_experimental_analyzer = 1; -SELECT * FROM order_by_all ORDER BY ALL; - -DROP TABLE order_by_all; +SELECT a, b FROM order_by_all ORDER BY *, a; -- { serverError UNSUPPORTED_METHOD } diff --git a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh index d089ed3cb2f..e4c1206263f 100755 --- a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh +++ b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: long +# Tags: long, no-debug CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # reset --log_comment @@ -74,11 +74,11 @@ run 0 $CH_CLIENT -q "drop table test;" echo "MergeTree compact" -$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, String, Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, String, Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000, index_granularity = 8192, index_granularity_bytes = '10Mi';" run 1 $CH_CLIENT -q "drop table test;" echo "MergeTree wide" -$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, String, Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" 
+$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, String, Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, index_granularity = 8192, index_granularity_bytes = '10Mi';" run 1 $CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.reference b/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.reference new file mode 100644 index 00000000000..461075e9607 --- /dev/null +++ b/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.reference @@ -0,0 +1,45 @@ +DEFAULT expressions +-- Compact parts +Before materialize +1 1 +2 54321 +After materialize +1 1 +2 54321 +-- Wide parts +Before materialize +1 1 +2 54321 +After materialize +1 1 +2 54321 +-- Nullable column != physically absent +Before materialize +1 1 +2 \N +3 54321 +After materialize +1 1 +2 \N +3 54321 +-- Parts with renamed column +Before materialize +1 1 +2 54321 +After rename +1 1 +2 54321 +After materialize +1 1 +2 54321 +MATERIALIZED expressions +-- Compact parts +Before materialize +1 54321 +After materialize +1 65432 +-- Compact parts +Before materialize +1 54321 +After materialize +1 65432 diff --git a/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.sql b/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.sql new file mode 100644 index 00000000000..cfdde287712 --- /dev/null +++ b/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.sql @@ -0,0 +1,85 @@ +SET mutations_sync = 2; + +DROP TABLE IF EXISTS tab; + +-- Tests that existing parts which contain a non-default value in columns with DEFAULT expression remain unchanged by MATERIALIZE COLUMN> +SELECT 'DEFAULT expressions'; + +SELECT '-- Compact parts'; + +CREATE TABLE tab (id Int64, dflt Int64 DEFAULT 54321) ENGINE MergeTree ORDER BY id; +INSERT INTO tab (id, dflt) VALUES (1, 1); +INSERT INTO tab (id) VALUES (2); +SELECT 'Before materialize'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab MATERIALIZE COLUMN dflt; +SELECT 'After materialize'; +SELECT * FROM tab ORDER BY id; +DROP TABLE tab; + +SELECT '-- Wide parts'; + +CREATE TABLE tab (id Int64, dflt Int64 DEFAULT 54321) ENGINE MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 1; +INSERT INTO tab (id, dflt) VALUES (1, 1); +INSERT INTO tab (id) VALUES (2); +SELECT 'Before materialize'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab MATERIALIZE COLUMN dflt; +SELECT 'After materialize'; +SELECT * FROM tab ORDER BY id; +DROP TABLE tab; + +SELECT '-- Nullable column != physically absent'; + +CREATE TABLE tab (id Int64, dflt Nullable(Int64) DEFAULT 54321) ENGINE MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 1; +INSERT INTO tab (id, dflt) VALUES (1, 1); +INSERT INTO tab (id, dflt) VALUES (2, NULL); +INSERT INTO tab (id) VALUES (3); +SELECT 'Before materialize'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab MATERIALIZE COLUMN dflt; +SELECT 'After materialize'; +SELECT * FROM tab ORDER BY id; +DROP TABLE tab; + +SELECT '-- Parts with renamed column'; + +CREATE TABLE tab (id Int64, dflt Int64 DEFAULT 54321) ENGINE MergeTree ORDER BY id; +INSERT INTO tab (id, dflt) VALUES (1, 1); +INSERT INTO tab (id) VALUES (2); +SELECT 'Before materialize'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab RENAME COLUMN dflt TO dflt2; +SELECT 'After rename'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab MATERIALIZE COLUMN dflt2; +SELECT 
'After materialize'; +SELECT * FROM tab ORDER BY id; +DROP TABLE tab; + +-- But for columns with MATERIALIZED expression, all existing parts should be rewritten in case a new expression was set in the meantime. +SELECT 'MATERIALIZED expressions'; + +SELECT '-- Compact parts'; + +CREATE TABLE tab (id Int64, mtrl Int64 MATERIALIZED 54321) ENGINE MergeTree ORDER BY id; +INSERT INTO tab (id) VALUES (1); +SELECT 'Before materialize'; +SELECT id, mtrl FROM tab ORDER BY id; +ALTER TABLE tab MODIFY COLUMN mtrl Int64 MATERIALIZED 65432; +ALTER TABLE tab MATERIALIZE COLUMN mtrl; +SELECT 'After materialize'; +SELECT id, mtrl FROM tab ORDER BY id; +DROP TABLE tab; + +SELECT '-- Compact parts'; + +CREATE TABLE tab (id Int64, mtrl Int64 MATERIALIZED 54321) ENGINE MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 1; +INSERT INTO tab (id) VALUES (1); +SELECT 'Before materialize'; +SELECT id, mtrl FROM tab ORDER BY id; +ALTER TABLE tab MODIFY COLUMN mtrl Int64 MATERIALIZED 65432; +ALTER TABLE tab MATERIALIZE COLUMN mtrl; +SELECT 'After materialize'; +SELECT id, mtrl FROM tab ORDER BY id; +DROP TABLE tab; diff --git a/tests/queries/0_stateless/02962_join_using_bug_57894.sql b/tests/queries/0_stateless/02962_join_using_bug_57894.sql index 87aef8b1a71..c9570be7053 100644 --- a/tests/queries/0_stateless/02962_join_using_bug_57894.sql +++ b/tests/queries/0_stateless/02962_join_using_bug_57894.sql @@ -11,23 +11,23 @@ INSERT INTO r VALUES (NULL, NULL); SET allow_experimental_analyzer = 0; -SELECT x FROM t FULL JOIN r USING (x) ORDER BY ALL +SELECT x FROM t FULL JOIN r USING (x) ORDER BY * ; -SELECT x FROM t FULL JOIN r USING (x) ORDER BY ALL +SELECT x FROM t FULL JOIN r USING (x) ORDER BY * SETTINGS join_algorithm = 'partial_merge'; -SELECT x FROM t FULL JOIN r USING (x) ORDER BY ALL +SELECT x FROM t FULL JOIN r USING (x) ORDER BY * SETTINGS join_algorithm = 'full_sorting_merge'; SET allow_experimental_analyzer = 1; -SELECT x FROM t FULL JOIN r USING (x) ORDER BY ALL +SELECT x FROM t FULL JOIN r USING (x) ORDER BY * ; -SELECT x FROM t FULL JOIN r USING (x) ORDER BY ALL +SELECT x FROM t FULL JOIN r USING (x) ORDER BY * SETTINGS join_algorithm = 'partial_merge'; -SELECT x FROM t FULL JOIN r USING (x) ORDER BY ALL +SELECT x FROM t FULL JOIN r USING (x) ORDER BY * SETTINGS join_algorithm = 'full_sorting_merge'; diff --git a/tests/queries/0_stateless/02966_s3_access_key_id_restriction.sql b/tests/queries/0_stateless/02966_s3_access_key_id_restriction.sql deleted file mode 100644 index c1ca0b4bcd5..00000000000 --- a/tests/queries/0_stateless/02966_s3_access_key_id_restriction.sql +++ /dev/null @@ -1,6 +0,0 @@ --- Tags: no-fasttest - -select * from s3('http://localhost:11111/test/a.tsv', '\ninjection\n', 'admin'); -- { serverError 36 } -select * from deltaLake('http://localhost:11111/test/a.tsv', '\ninjection\n', 'admin'); -- { serverError 36 } -select * from hudi('http://localhost:11111/test/a.tsv', '\ninjection\n', 'admin'); -- { serverError 36 } -select * from iceberg('http://localhost:11111/test/a.tsv', '\ninjection\n', 'admin'); -- { serverError 36 } diff --git a/tests/queries/0_stateless/02969_auto_format_detection.reference b/tests/queries/0_stateless/02969_auto_format_detection.reference new file mode 100644 index 00000000000..4b86be04996 --- /dev/null +++ b/tests/queries/0_stateless/02969_auto_format_detection.reference @@ -0,0 +1,123 @@ +Parquet +a Nullable(UInt64) +b Nullable(String) +c Array(Nullable(UInt64)) +d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) +ORC +a Nullable(Int64) +b 
Nullable(String) +c Array(Nullable(Int64)) +d Tuple(\n a Nullable(Int64),\n b Nullable(String)) +Arrow +a Nullable(UInt64) +b Nullable(String) +c Array(Nullable(UInt64)) +d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) +ArrowStream +a Nullable(UInt64) +b Nullable(String) +c Array(Nullable(UInt64)) +d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) +Avro +a Int64 +b String +c Array(Int64) +d Tuple(\n a Int64,\n b String) +Native +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) +BSONEachRow +a Nullable(Int64) +b Nullable(String) +c Array(Nullable(Int64)) +d Tuple(\n a Nullable(Int64),\n b Nullable(String)) +JSONCompact +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) +Values +c1 Nullable(UInt64) +c2 Nullable(String) +c3 Array(Nullable(UInt64)) +c4 Tuple(Nullable(UInt64), Nullable(String)) +TSKV +a Nullable(String) +b Nullable(String) +c Array(Nullable(UInt64)) +d Nullable(String) +JSONObjectEachRow +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +JSONColumns +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +JSONCompactColumns +c1 Nullable(String) +c2 Nullable(String) +c3 Array(Nullable(String)) +c4 Tuple(\n a Nullable(String),\n b Nullable(String)) +JSONCompact +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) +JSON +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) +TSV +c1 Nullable(UInt64) +c2 Nullable(String) +c3 Array(Nullable(UInt64)) +c4 Tuple(Nullable(UInt64), Nullable(String)) +CSV +c1 Nullable(UInt64) +c2 Nullable(String) +c3 Array(Nullable(UInt64)) +c4 Nullable(UInt64) +c5 Nullable(String) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +1 +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) diff --git a/tests/queries/0_stateless/02969_auto_format_detection.sh b/tests/queries/0_stateless/02969_auto_format_detection.sh new file mode 100755 index 00000000000..88d6575e499 --- /dev/null +++ b/tests/queries/0_stateless/02969_auto_format_detection.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +DATA_FILE=$CLICKHOUSE_TEST_UNIQUE_NAME.data + +for format in Parquet ORC Arrow ArrowStream Avro Native BSONEachRow JSONCompact Values TSKV JSONObjectEachRow JSONColumns JSONCompactColumns JSONCompact JSON TSV CSV +do + echo $format + $CLICKHOUSE_LOCAL -q "select * from generateRandom('a UInt64, b String, c Array(UInt64), d Tuple(a UInt64, b String)', 42) limit 10 format $format" > $DATA_FILE + $CLICKHOUSE_LOCAL -q "desc file('$DATA_FILE')" +done + +rm $DATA_FILE + +$CLICKHOUSE_LOCAL -q "select * from generateRandom('a UInt64, b String, c Array(UInt64), d Tuple(a UInt64, b String)', 42) limit 10 format JSONEachRow" > $DATA_FILE.jsonl +$CLICKHOUSE_LOCAL -q "desc file('$DATA_FILE*')" + + +$CLICKHOUSE_LOCAL -q "select * from generateRandom('a UInt64, b String, c Array(UInt64), d Tuple(a UInt64, b String)', 42) limit 10 format JSONEachRow" > $DATA_FILE + +$CLICKHOUSE_LOCAL -q "desc file('$DATA_FILE', auto, 'a UInt64, b String, c Array(UInt64), d Tuple(a UInt64, b String)')" + +$CLICKHOUSE_LOCAL -nmq " +desc file('$DATA_FILE'); +desc file('$DATA_FILE'); +" + +$CLICKHOUSE_LOCAL -nmq " +desc file('$DATA_FILE', JSONEachRow); +desc file('$DATA_FILE'); +" + +touch $DATA_FILE.1 +$CLICKHOUSE_LOCAL -q "select * from generateRandom('a UInt64, b String, c Array(UInt64), d Tuple(a UInt64, b String)', 42) limit 10 format JSONEachRow" > $DATA_FILE.2 +$CLICKHOUSE_LOCAL -q "desc file('$DATA_FILE.{1,2}')" +$CLICKHOUSE_LOCAL -q "desc file('$DATA_FILE.{1,2}') settings schema_inference_mode='union'" 2>&1 | grep -c "CANNOT_DETECT_FORMAT" + +$CLICKHOUSE_LOCAL -nmq " +desc file('$DATA_FILE.2'); +desc file('$DATA_FILE.{1,2}'); +" + +rm $DATA_FILE* diff --git a/tests/queries/0_stateless/02975_intdiv_with_decimal.reference b/tests/queries/0_stateless/02975_intdiv_with_decimal.reference new file mode 100644 index 00000000000..5540734ae4c --- /dev/null +++ b/tests/queries/0_stateless/02975_intdiv_with_decimal.reference @@ -0,0 +1,68 @@ +2 +2 +1 +2 +2 +2 +2 +2 +2 +2 +2 +1 +2 +1 +2 +1 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +1 +1 +1 +1 +2 +2 +2 +2 +2 +2 +1 +2 +2 +2 +2 +2 +2 +2 +2 +1 +2 +1 +2 +1 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +1 +1 +1 +1 +2 +2 +2 +2 diff --git a/tests/queries/0_stateless/02975_intdiv_with_decimal.sql b/tests/queries/0_stateless/02975_intdiv_with_decimal.sql new file mode 100644 index 00000000000..0911a481251 --- /dev/null +++ b/tests/queries/0_stateless/02975_intdiv_with_decimal.sql @@ -0,0 +1,70 @@ +--intDiv-- +SELECT intDiv(4,2); +SELECT intDiv(toDecimal32(4.4, 2), 2); +SELECT intDiv(4, toDecimal32(2.2, 2)); +SELECT intDiv(toDecimal32(4.4, 2), 2); +SELECT intDiv(toDecimal32(4.4, 2), toDecimal32(2.2, 2)); +SELECT intDiv(toDecimal64(4.4, 3), 2); +SELECT intDiv(toDecimal64(4.4, 3), toDecimal32(2.2, 2)); +SELECT intDiv(toDecimal128(4.4, 4), 2); +SELECT intDiv(toDecimal128(4.4, 4), toDecimal32(2.2, 2)); +SELECT intDiv(toDecimal256(4.4, 5), 2); +SELECT intDiv(toDecimal256(4.4, 5), toDecimal32(2.2, 2)); +SELECT intDiv(4, toDecimal64(2.2, 2)); +SELECT intDiv(toDecimal32(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDiv(4, toDecimal128(2.2, 3)); +SELECT intDiv(toDecimal32(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDiv(4, toDecimal256(2.2, 4)); +SELECT intDiv(toDecimal32(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDiv(toDecimal64(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDiv(toDecimal128(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDiv(toDecimal256(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDiv(toDecimal64(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDiv(toDecimal128(4.4, 2), toDecimal128(2.2, 2)); +SELECT 
intDiv(toDecimal256(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDiv(toDecimal64(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDiv(toDecimal128(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDiv(toDecimal256(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDiv(4.2, toDecimal32(2.2, 2)); +SELECT intDiv(4.2, toDecimal64(2.2, 2)); +SELECT intDiv(4.2, toDecimal128(2.2, 2)); +SELECT intDiv(4.2, toDecimal256(2.2, 2)); +SELECT intDiv(toDecimal32(4.4, 2), 2.2); +SELECT intDiv(toDecimal64(4.4, 2), 2.2); +SELECT intDiv(toDecimal128(4.4, 2), 2.2); +SELECT intDiv(toDecimal256(4.4, 2), 2.2); +--intDivOrZero-- +SELECT intDivOrZero(4,2); +SELECT intDivOrZero(toDecimal32(4.4, 2), 2); +SELECT intDivOrZero(4, toDecimal32(2.2, 2)); +SELECT intDivOrZero(toDecimal32(4.4, 2), 2); +SELECT intDivOrZero(toDecimal32(4.4, 2), toDecimal32(2.2, 2)); +SELECT intDivOrZero(toDecimal64(4.4, 3), 2); +SELECT intDivOrZero(toDecimal64(4.4, 3), toDecimal32(2.2, 2)); +SELECT intDivOrZero(toDecimal128(4.4, 4), 2); +SELECT intDivOrZero(toDecimal128(4.4, 4), toDecimal32(2.2, 2)); +SELECT intDivOrZero(toDecimal256(4.4, 5), 2); +SELECT intDivOrZero(toDecimal256(4.4, 5), toDecimal32(2.2, 2)); +SELECT intDivOrZero(4, toDecimal64(2.2, 2)); +SELECT intDivOrZero(toDecimal32(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDivOrZero(4, toDecimal128(2.2, 3)); +SELECT intDivOrZero(toDecimal32(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDivOrZero(4, toDecimal256(2.2, 4)); +SELECT intDivOrZero(toDecimal32(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDivOrZero(toDecimal64(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDivOrZero(toDecimal128(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDivOrZero(toDecimal256(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDivOrZero(toDecimal64(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDivOrZero(toDecimal128(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDivOrZero(toDecimal256(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDivOrZero(toDecimal64(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDivOrZero(toDecimal128(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDivOrZero(toDecimal256(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDivOrZero(4.2, toDecimal32(2.2, 2)); +SELECT intDivOrZero(4.2, toDecimal64(2.2, 2)); +SELECT intDivOrZero(4.2, toDecimal128(2.2, 2)); +SELECT intDivOrZero(4.2, toDecimal256(2.2, 2)); +SELECT intDivOrZero(toDecimal32(4.4, 2), 2.2); +SELECT intDivOrZero(toDecimal64(4.4, 2), 2.2); +SELECT intDivOrZero(toDecimal128(4.4, 2), 2.2); +SELECT intDivOrZero(toDecimal256(4.4, 2), 2.2); diff --git a/tests/queries/0_stateless/02981_nested_bad_types.reference b/tests/queries/0_stateless/02981_nested_bad_types.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02981_nested_bad_types.sql b/tests/queries/0_stateless/02981_nested_bad_types.sql new file mode 100644 index 00000000000..87bc80693c8 --- /dev/null +++ b/tests/queries/0_stateless/02981_nested_bad_types.sql @@ -0,0 +1,58 @@ +set allow_suspicious_low_cardinality_types=0; +set allow_suspicious_fixed_string_types=0; +set allow_experimental_variant_type=0; + +select [42]::Array(LowCardinality(UInt64)); -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} +select [[[42]]]::Array(Array(Array(LowCardinality(UInt64)))); -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} +select map('a', 42)::Map(String, LowCardinality(UInt64)); -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} +select map('a', map('b', [42]))::Map(String, Map(String, Array(LowCardinality(UInt64)))); -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} +select tuple('a', 42)::Tuple(String, 
LowCardinality(UInt64)); -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} +select tuple('a', [map('b', 42)])::Tuple(String, Array(Map(String, LowCardinality(UInt64)))); -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} + +create table test (x Array(LowCardinality(UInt64))) engine=Memory; -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} +create table test (x Array(Array(LowCardinality(UInt64)))) engine=Memory; -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} +create table test (x Map(String, LowCardinality(UInt64))) engine=Memory; -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} +create table test (x Map(String, Map(String, LowCardinality(UInt64)))) engine=Memory; -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} +create table test (x Tuple(String, LowCardinality(UInt64))) engine=Memory; -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} +create table test (x Tuple(String, Array(Map(String, LowCardinality(UInt64))))) engine=Memory; -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} + + +select ['42']::Array(FixedString(1000000)); -- {serverError ILLEGAL_COLUMN} +select ['42']::Array(FixedString(1000000)); -- {serverError ILLEGAL_COLUMN} +select [[['42']]]::Array(Array(Array(FixedString(1000000)))); -- {serverError ILLEGAL_COLUMN} +select map('a', '42')::Map(String, FixedString(1000000)); -- {serverError ILLEGAL_COLUMN} +select map('a', map('b', ['42']))::Map(String, Map(String, Array(FixedString(1000000)))); -- {serverError ILLEGAL_COLUMN} +select tuple('a', '42')::Tuple(String, FixedString(1000000)); -- {serverError ILLEGAL_COLUMN} +select tuple('a', [map('b', '42')])::Tuple(String, Array(Map(String, FixedString(1000000)))); -- {serverError ILLEGAL_COLUMN} + +create table test (x Array(FixedString(1000000))) engine=Memory; -- {serverError ILLEGAL_COLUMN} +create table test (x Array(Array(FixedString(1000000)))) engine=Memory; -- {serverError ILLEGAL_COLUMN} +create table test (x Map(String, FixedString(1000000))) engine=Memory; -- {serverError ILLEGAL_COLUMN} +create table test (x Map(String, Map(String, FixedString(1000000)))) engine=Memory; -- {serverError ILLEGAL_COLUMN} +create table test (x Tuple(String, FixedString(1000000))) engine=Memory; -- {serverError ILLEGAL_COLUMN} +create table test (x Tuple(String, Array(Map(String, FixedString(1000000))))) engine=Memory; -- {serverError ILLEGAL_COLUMN} + +select [42]::Array(Variant(String, UInt64)); -- {serverError ILLEGAL_COLUMN} +select [[[42]]]::Array(Array(Array(Variant(String, UInt64)))); -- {serverError ILLEGAL_COLUMN} +select map('a', 42)::Map(String, Variant(String, UInt64)); -- {serverError ILLEGAL_COLUMN} +select map('a', map('b', [42]))::Map(String, Map(String, Array(Variant(String, UInt64)))); -- {serverError ILLEGAL_COLUMN} +select tuple('a', 42)::Tuple(String, Variant(String, UInt64)); -- {serverError ILLEGAL_COLUMN} +select tuple('a', [map('b', 42)])::Tuple(String, Array(Map(String, Variant(String, UInt64)))); -- {serverError ILLEGAL_COLUMN} + +create table test (x Array(Variant(String, UInt64))) engine=Memory; -- {serverError ILLEGAL_COLUMN} +create table test (x Array(Array(Variant(String, UInt64)))) engine=Memory; -- {serverError ILLEGAL_COLUMN} +create table test (x Map(String, Variant(String, UInt64))) engine=Memory; -- {serverError ILLEGAL_COLUMN} +create table test (x Map(String, Map(String, Variant(String, UInt64)))) engine=Memory; -- {serverError ILLEGAL_COLUMN} +create table test (x Tuple(String, Variant(String, UInt64))) engine=Memory; -- {serverError ILLEGAL_COLUMN} +create table test (x 
Tuple(String, Array(Map(String, Variant(String, UInt64))))) engine=Memory; -- {serverError ILLEGAL_COLUMN} + +set allow_experimental_variant_type=1; +select 42::Variant(String, LowCardinality(UInt64)) settings allow_experimental_variant_type=1; -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} +select tuple('a', [map('b', 42)])::Tuple(String, Array(Map(String, Variant(LowCardinality(UInt64), UInt8)))); -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} +create table test (x Variant(LowCardinality(UInt64), UInt8)) engine=Memory; -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} +create table test (x Tuple(String, Array(Map(String, Variant(LowCardinality(UInt64), UInt8))))) engine=Memory; -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} + +select '42'::Variant(UInt64, FixedString(1000000)); -- {serverError ILLEGAL_COLUMN} +select tuple('a', [map('b', '42')])::Tuple(String, Array(Map(String, Variant(UInt32, FixedString(1000000))))); -- {serverError ILLEGAL_COLUMN} +create table test (x Variant(UInt64, FixedString(1000000))) engine=Memory; -- {serverError ILLEGAL_COLUMN} +create table test (x Tuple(String, Array(Map(String, FixedString(1000000))))) engine=Memory; -- {serverError ILLEGAL_COLUMN} diff --git a/tests/queries/0_stateless/02981_vertical_merges_memory_usage.sql b/tests/queries/0_stateless/02981_vertical_merges_memory_usage.sql index 6ca594ebc7d..b784e734457 100644 --- a/tests/queries/0_stateless/02981_vertical_merges_memory_usage.sql +++ b/tests/queries/0_stateless/02981_vertical_merges_memory_usage.sql @@ -1,4 +1,4 @@ --- Tags: long +-- Tags: long, no-random-merge-tree-settings DROP TABLE IF EXISTS t_vertical_merge_memory; @@ -14,7 +14,9 @@ SETTINGS merge_max_block_size_bytes = '10M'; INSERT INTO t_vertical_merge_memory SELECT number, arrayMap(x -> repeat('a', 50), range(1000)) FROM numbers(3000); -INSERT INTO t_vertical_merge_memory SELECT number, arrayMap(x -> repeat('a', 50), range(1000)) FROM numbers(3000); +-- Why 3001? - Deduplication, which is off with normal MergeTree by default but on for ReplicatedMergeTree and SharedMergeTree. +-- We automatically replace MergeTree with SharedMergeTree in ClickHouse Cloud. 
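[Editorial note, not part of the patch: a brief illustration of the "Why 3001?" comment above. On ReplicatedMergeTree and SharedMergeTree, an inserted block whose checksum matches a recently inserted one is dropped by insert deduplication, so repeating an identical INSERT ... SELECT adds no rows there; making the second insert produce a different block (3001 rows instead of 3000) keeps the test meaningful on every engine. The table name and ZooKeeper path below are hypothetical.]
-- Sketch only, assuming default insert deduplication settings on a replicated table:
CREATE TABLE dedup_sketch (n UInt64)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/dedup_sketch', 'r1') ORDER BY n;
INSERT INTO dedup_sketch SELECT number FROM numbers(3000);
INSERT INTO dedup_sketch SELECT number FROM numbers(3000); -- identical block: deduplicated, adds nothing
INSERT INTO dedup_sketch SELECT number FROM numbers(3001); -- different block: inserted normally
SELECT count() FROM dedup_sketch; -- 6001 rather than 9001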
+INSERT INTO t_vertical_merge_memory SELECT number, arrayMap(x -> repeat('a', 50), range(1000)) FROM numbers(3001); OPTIMIZE TABLE t_vertical_merge_memory FINAL; diff --git a/tests/queries/0_stateless/02982_dont_infer_exponent_floats.reference b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.reference new file mode 100644 index 00000000000..b6d1ff865e5 --- /dev/null +++ b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.reference @@ -0,0 +1,2 @@ +c1 Nullable(String) +c1 Nullable(Float64) diff --git a/tests/queries/0_stateless/02982_dont_infer_exponent_floats.sql b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.sql new file mode 100644 index 00000000000..2a281e898f1 --- /dev/null +++ b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.sql @@ -0,0 +1,2 @@ +DESC format(CSV, '1E20\n1.1E20') settings input_format_try_infer_exponent_floats = 0; +DESC format(CSV, '1E20\n1.1E20') settings input_format_try_infer_exponent_floats = 1; diff --git a/tests/queries/0_stateless/02985_disable_async_inserts_for_dependent_mv_dedup.reference b/tests/queries/0_stateless/02985_disable_async_inserts_for_dependent_mv_dedup.reference deleted file mode 100644 index 4ff73b99975..00000000000 --- a/tests/queries/0_stateless/02985_disable_async_inserts_for_dependent_mv_dedup.reference +++ /dev/null @@ -1 +0,0 @@ -Values Ok 4 Parsed diff --git a/tests/queries/0_stateless/02985_disable_async_inserts_for_dependent_mv_dedup.sql b/tests/queries/0_stateless/02985_disable_async_inserts_for_dependent_mv_dedup.sql deleted file mode 100644 index 41b23374bfc..00000000000 --- a/tests/queries/0_stateless/02985_disable_async_inserts_for_dependent_mv_dedup.sql +++ /dev/null @@ -1,46 +0,0 @@ --- Tags: no-parallel - -SET async_insert = 1; -SET insert_deduplicate = 1; -SET deduplicate_blocks_in_dependent_materialized_views = 1; - -DROP TABLE IF EXISTS 02985_test; -CREATE TABLE 02985_test -( - d Date, - value UInt64 -) ENGINE = MergeTree ORDER BY tuple() SETTINGS non_replicated_deduplication_window = 1000; - -DROP VIEW IF EXISTS 02985_mv; -CREATE MATERIALIZED VIEW 02985_mv -ENGINE = SummingMergeTree ORDER BY d AS -SELECT - d, sum(value) s -FROM 02985_test GROUP BY d; - --- Inserts are synchronous. -INSERT INTO 02985_test (*) -VALUES ('2024-01-01', 1), ('2024-01-01', 2), ('2024-01-02', 1); - -SYSTEM FLUSH LOGS; - -SELECT format, status, rows, data_kind FROM system.asynchronous_insert_log -WHERE database = currentDatabase() AND table = '02985_test'; - -SET deduplicate_blocks_in_dependent_materialized_views = 0; - --- Set a large value for async_insert_busy_timeout_max_ms to avoid flushing the entry synchronously. 
-INSERT INTO 02985_test (*) -SETTINGS - async_insert_busy_timeout_min_ms=200, - async_insert_busy_timeout_max_ms=100000 -VALUES ('2024-01-01', 1), ('2024-01-01', 2), ('2024-01-02', 1), ('2024-01-02', 4); - -SYSTEM FLUSH LOGS; - -SELECT format, status, rows, data_kind -FROM system.asynchronous_insert_log -WHERE database = currentDatabase() AND table = '02985_test'; - -DROP VIEW IF EXISTS 02985_mv; -DROP TABLE IF EXISTS 02985_test; diff --git a/tests/queries/0_stateless/02985_if_over_big_int_decimal.reference b/tests/queries/0_stateless/02985_if_over_big_int_decimal.reference new file mode 100644 index 00000000000..1dfad945ee2 --- /dev/null +++ b/tests/queries/0_stateless/02985_if_over_big_int_decimal.reference @@ -0,0 +1,12 @@ +49500 +49500 +49500 +49500 +49500 +49500 +450000 +450000 +450000 +450000 +450000 +450000 diff --git a/tests/queries/0_stateless/02985_if_over_big_int_decimal.sql b/tests/queries/0_stateless/02985_if_over_big_int_decimal.sql new file mode 100644 index 00000000000..0295a64a092 --- /dev/null +++ b/tests/queries/0_stateless/02985_if_over_big_int_decimal.sql @@ -0,0 +1,14 @@ +select sumIf(number::Int128, number % 10 == 0) from numbers(1000); +select sumIf(number::UInt128, number % 10 == 0) from numbers(1000); +select sumIf(number::Int256, number % 10 == 0) from numbers(1000); +select sumIf(number::UInt256, number % 10 == 0) from numbers(1000); +select sumIf(number::Decimal128(3), number % 10 == 0) from numbers(1000); +select sumIf(number::Decimal256(3), number % 10 == 0) from numbers(1000); + +-- Test when the condition is neither 0 nor 1 +select sumIf(number::Int128, number % 10) from numbers(1000); +select sumIf(number::UInt128, number % 10) from numbers(1000); +select sumIf(number::Int256, number % 10) from numbers(1000); +select sumIf(number::UInt256, number % 10) from numbers(1000); +select sumIf(number::Decimal128(3), number % 10) from numbers(1000); +select sumIf(number::Decimal256(3), number % 10) from numbers(1000); diff --git a/tests/queries/0_stateless/02989_group_by_tuple.reference b/tests/queries/0_stateless/02989_group_by_tuple.reference new file mode 100644 index 00000000000..4539bbf2d22 --- /dev/null +++ b/tests/queries/0_stateless/02989_group_by_tuple.reference @@ -0,0 +1,3 @@ +0 +1 +2 diff --git a/tests/queries/0_stateless/02989_group_by_tuple.sql b/tests/queries/0_stateless/02989_group_by_tuple.sql new file mode 100644 index 00000000000..d0a205f5edc --- /dev/null +++ b/tests/queries/0_stateless/02989_group_by_tuple.sql @@ -0,0 +1 @@ +SELECT number FROM numbers(3) GROUP BY (number, number % 2) ORDER BY number; diff --git a/tests/queries/0_stateless/02989_replicated_merge_tree_invalid_metadata_version.reference b/tests/queries/0_stateless/02989_replicated_merge_tree_invalid_metadata_version.reference new file mode 100644 index 00000000000..128e3adcc0a --- /dev/null +++ b/tests/queries/0_stateless/02989_replicated_merge_tree_invalid_metadata_version.reference @@ -0,0 +1,14 @@ +Row 1: +────── +name: metadata +version: 1 +-- +Row 1: +────── +name: metadata_version +value: 1 +-- +id UInt64 +value String +insert_time DateTime +insert_time_updated DateTime diff --git a/tests/queries/0_stateless/02989_replicated_merge_tree_invalid_metadata_version.sql b/tests/queries/0_stateless/02989_replicated_merge_tree_invalid_metadata_version.sql new file mode 100644 index 00000000000..15633586aa8 --- /dev/null +++ b/tests/queries/0_stateless/02989_replicated_merge_tree_invalid_metadata_version.sql @@ -0,0 +1,40 @@ +-- Tags: zookeeper + +DROP TABLE IF EXISTS 
test_table_replicated; +CREATE TABLE test_table_replicated +( + id UInt64, + value String +) ENGINE=ReplicatedMergeTree('/clickhouse/tables/{database}/test_table_replicated', '1_replica') ORDER BY id; + +ALTER TABLE test_table_replicated ADD COLUMN insert_time DateTime; + +SELECT name, version FROM system.zookeeper +WHERE path = (SELECT zookeeper_path FROM system.replicas WHERE database = currentDatabase() AND table = 'test_table_replicated') +AND name = 'metadata' FORMAT Vertical; + +DROP TABLE IF EXISTS test_table_replicated_second; +CREATE TABLE test_table_replicated_second +( + id UInt64, + value String, + insert_time DateTime +) ENGINE=ReplicatedMergeTree('/clickhouse/tables/{database}/test_table_replicated', '2_replica') ORDER BY id; + +DROP TABLE test_table_replicated; + +SELECT '--'; + +SELECT name, value FROM system.zookeeper +WHERE path = (SELECT replica_path FROM system.replicas WHERE database = currentDatabase() AND table = 'test_table_replicated_second') +AND name = 'metadata_version' FORMAT Vertical; + +SYSTEM RESTART REPLICA test_table_replicated_second; + +ALTER TABLE test_table_replicated_second ADD COLUMN insert_time_updated DateTime; + +SELECT '--'; + +DESCRIBE test_table_replicated_second; + +DROP TABLE test_table_replicated_second; diff --git a/tests/queries/0_stateless/02989_system_tables_metadata_version.reference b/tests/queries/0_stateless/02989_system_tables_metadata_version.reference new file mode 100644 index 00000000000..73f6a1ad346 --- /dev/null +++ b/tests/queries/0_stateless/02989_system_tables_metadata_version.reference @@ -0,0 +1,9 @@ +test_temporary_table_02989 0 +-- +test_table 0 +-- +test_table_replicated 0 +-- +test_table_replicated 1 +-- +test_table_replicated 2 diff --git a/tests/queries/0_stateless/02989_system_tables_metadata_version.sql b/tests/queries/0_stateless/02989_system_tables_metadata_version.sql new file mode 100644 index 00000000000..9534b1f2e82 --- /dev/null +++ b/tests/queries/0_stateless/02989_system_tables_metadata_version.sql @@ -0,0 +1,50 @@ +-- Tags: zookeeper, no-parallel + +DROP TABLE IF EXISTS test_temporary_table_02989; +CREATE TEMPORARY TABLE test_temporary_table_02989 +( + id UInt64, + value String +) ENGINE=MergeTree ORDER BY id; + +SELECT name, metadata_version FROM system.tables WHERE name = 'test_temporary_table_02989' AND is_temporary; + +DROP TABLE test_temporary_table_02989; + +DROP TABLE IF EXISTS test_table; +CREATE TABLE test_table +( + id UInt64, + value String +) ENGINE=MergeTree ORDER BY id; + +SELECT '--'; + +SELECT name, metadata_version FROM system.tables WHERE database = currentDatabase() AND name = 'test_table'; + +DROP TABLE test_table; + +DROP TABLE IF EXISTS test_table_replicated; +CREATE TABLE test_table_replicated +( + id UInt64, + value String +) ENGINE=ReplicatedMergeTree('/clickhouse/tables/{database}/test_table_replicated', '1_replica') ORDER BY id; + +SELECT '--'; + +SELECT name, metadata_version FROM system.tables WHERE database = currentDatabase() AND name = 'test_table_replicated'; + +ALTER TABLE test_table_replicated ADD COLUMN insert_time DateTime; + +SELECT '--'; + +SELECT name, metadata_version FROM system.tables WHERE database = currentDatabase() AND name = 'test_table_replicated'; + +ALTER TABLE test_table_replicated ADD COLUMN insert_time_updated DateTime; + +SELECT '--'; + +SELECT name, metadata_version FROM system.tables WHERE database = currentDatabase() AND name = 'test_table_replicated'; + +DROP TABLE test_table_replicated; diff --git 
a/tests/queries/0_stateless/02990_arrayFold_nullable_lc.reference b/tests/queries/0_stateless/02990_arrayFold_nullable_lc.reference new file mode 100644 index 00000000000..5bd5d7bbd90 --- /dev/null +++ b/tests/queries/0_stateless/02990_arrayFold_nullable_lc.reference @@ -0,0 +1,16 @@ +23 +23 +23 +23 +3 +3 +\N +1 +\N +\N +\N +23 +23 +23 +\N +\N diff --git a/tests/queries/0_stateless/02990_arrayFold_nullable_lc.sql b/tests/queries/0_stateless/02990_arrayFold_nullable_lc.sql new file mode 100644 index 00000000000..280defdfbb4 --- /dev/null +++ b/tests/queries/0_stateless/02990_arrayFold_nullable_lc.sql @@ -0,0 +1,35 @@ +SET allow_suspicious_low_cardinality_types=1; + +SELECT arrayFold((acc, x) -> (acc + (x * 2)), [1, 2, 3, 4], toInt64(3)); +SELECT arrayFold((acc, x) -> (acc + (x * 2)), [1, 2, 3, 4], toInt64(toNullable(3))); +SELECT arrayFold((acc, x) -> (acc + (x * 2)), [1, 2, 3, 4], materialize(toInt64(toNullable(3)))); + +SELECT arrayFold((acc, x) -> (acc + (x * 2)), [1, 2, 3, 4]::Array(Nullable(Int64)), toInt64(3)); -- { serverError TYPE_MISMATCH } +SELECT arrayFold((acc, x) -> (acc + (x * 2)), [1, 2, 3, 4]::Array(Nullable(Int64)), toInt64(toNullable(3))); + +SELECT arrayFold((acc, x) -> (acc + (x * 2)), []::Array(Int64), toInt64(3)); +SELECT arrayFold((acc, x) -> (acc + (x * 2)), []::Array(Nullable(Int64)), toInt64(toNullable(3))); +SELECT arrayFold((acc, x) -> (acc + (x * 2)), []::Array(Nullable(Int64)), toInt64(NULL)); + +SELECT arrayFold((acc, x) -> x, materialize(CAST('[0, 1]', 'Array(Nullable(UInt8))')), toUInt8(toNullable(0))); +SELECT arrayFold((acc, x) -> x, materialize(CAST([NULL], 'Array(Nullable(UInt8))')), toUInt8(toNullable(0))); +SELECT arrayFold((acc, x) -> acc + x, materialize(CAST([NULL], 'Array(Nullable(UInt8))')), toUInt64(toNullable(0))); +SELECT arrayFold((acc, x) -> acc + x, materialize(CAST([1, 2, NULL], 'Array(Nullable(UInt8))')), toUInt64(toNullable(0))); + +SELECT arrayFold((acc, x) -> toNullable(acc + (x * 2)), [1, 2, 3, 4], toInt64(3)); -- { serverError TYPE_MISMATCH } +SELECT arrayFold((acc, x) -> toNullable(acc + (x * 2)), [1, 2, 3, 4], toNullable(toInt64(3))); + +SELECT arrayFold((acc, x) -> (acc + (x * 2)), [1, 2, 3, 4], toLowCardinality(toInt64(3))); -- { serverError TYPE_MISMATCH } +SELECT arrayFold((acc, x) -> toLowCardinality(acc + (x * 2)), [1, 2, 3, 4], toLowCardinality(toInt64(3))); +SELECT arrayFold((acc, x) -> (acc + (x * 2)), [1, 2, 3, 4]::Array(LowCardinality(Int64)), toInt64(toLowCardinality(3))); -- { serverError TYPE_MISMATCH } +SELECT arrayFold((acc, x) -> toLowCardinality(acc + (x * 2)), [1, 2, 3, 4]::Array(LowCardinality(Int64)), toInt64(toLowCardinality(3))); + +SELECT arrayFold((acc, x) -> acc + (x * 2), [1, 2, 3, 4]::Array(Nullable(Int64)), toInt64(toLowCardinality(3))); -- { serverError TYPE_MISMATCH } +SELECT arrayFold((acc, x) -> toLowCardinality(acc + (x * 2)), [1, 2, 3, 4]::Array(Nullable(Int64)), toInt64(toLowCardinality(3))); -- { serverError TYPE_MISMATCH } +SELECT arrayFold((acc, x) -> toLowCardinality(acc + (x * 2)), [1, 2, 3, 4]::Array(Nullable(Int64)), toInt64(toNullable(3))); -- { serverError TYPE_MISMATCH } + +SELECT arrayFold((acc, x) -> (acc + (x * 2)), [1, 2, 3, 4], NULL); +-- It's debatable which one of the following 2 queries should work, but considering the return type must match the +-- accumulator type it makes sense to be the second one +SELECT arrayFold((acc, x) -> (acc + (x * 2)), [1, 2, 3, 4], NULL::LowCardinality(Nullable(Int64))); -- { serverError TYPE_MISMATCH } +SELECT arrayFold((acc, x) -> (acc + (x * 
2))::LowCardinality(Nullable(Int64)), [1, 2, 3, 4], NULL::LowCardinality(Nullable(Int64))); diff --git a/tests/queries/0_stateless/02990_format_lambdas.reference b/tests/queries/0_stateless/02990_format_lambdas.reference new file mode 100644 index 00000000000..f898d6ffa0e --- /dev/null +++ b/tests/queries/0_stateless/02990_format_lambdas.reference @@ -0,0 +1,10 @@ +SELECT lambda(1, 1) +SELECT lambda(1, 1) +SELECT x -> 1 +SELECT x -> 1 +SELECT (x, y) -> 1 +SELECT (x, y) -> 1 +SELECT lambda(f(1), 1) +SELECT lambda(f(1), 1) +SELECT lambda(f(x), 1) +SELECT lambda(f(x), 1) diff --git a/tests/queries/0_stateless/02990_format_lambdas.sh b/tests/queries/0_stateless/02990_format_lambdas.sh new file mode 100755 index 00000000000..9dc5e0f0461 --- /dev/null +++ b/tests/queries/0_stateless/02990_format_lambdas.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +QUERY="SELECT lambda(1, 1)"; QUERY2=$(${CLICKHOUSE_FORMAT} --query "$QUERY"); echo "$QUERY2"; QUERY3=$(${CLICKHOUSE_FORMAT} --query "$QUERY2"); echo "$QUERY3"; +QUERY="SELECT lambda(x, 1)"; QUERY2=$(${CLICKHOUSE_FORMAT} --query "$QUERY"); echo "$QUERY2"; QUERY3=$(${CLICKHOUSE_FORMAT} --query "$QUERY2"); echo "$QUERY3"; +QUERY="SELECT lambda((x, y), 1)"; QUERY2=$(${CLICKHOUSE_FORMAT} --query "$QUERY"); echo "$QUERY2"; QUERY3=$(${CLICKHOUSE_FORMAT} --query "$QUERY2"); echo "$QUERY3"; +QUERY="SELECT lambda(f(1), 1)"; QUERY2=$(${CLICKHOUSE_FORMAT} --query "$QUERY"); echo "$QUERY2"; QUERY3=$(${CLICKHOUSE_FORMAT} --query "$QUERY2"); echo "$QUERY3"; +QUERY="SELECT lambda(f(x), 1)"; QUERY2=$(${CLICKHOUSE_FORMAT} --query "$QUERY"); echo "$QUERY2"; QUERY3=$(${CLICKHOUSE_FORMAT} --query "$QUERY2"); echo "$QUERY3"; diff --git a/tests/queries/0_stateless/02990_format_not_precedence.reference b/tests/queries/0_stateless/02990_format_not_precedence.reference new file mode 100644 index 00000000000..f44cf2fdb52 --- /dev/null +++ b/tests/queries/0_stateless/02990_format_not_precedence.reference @@ -0,0 +1,13 @@ +-- { echoOn } +SELECT NOT 0 + NOT 0; +0 +SELECT NOT (0 + (NOT 0)); +0 +SELECT (NOT 0) + (NOT 0); +2 +SELECT formatQuery('SELECT NOT 0 + NOT 0'); +SELECT NOT (0 + (NOT 0)) +SELECT formatQuery('SELECT NOT (0 + (NOT 0))'); +SELECT NOT (0 + (NOT 0)) +SELECT formatQuery('SELECT (NOT 0) + (NOT 0)'); +SELECT (NOT 0) + (NOT 0) diff --git a/tests/queries/0_stateless/02990_format_not_precedence.sql b/tests/queries/0_stateless/02990_format_not_precedence.sql new file mode 100644 index 00000000000..98ef2c9e781 --- /dev/null +++ b/tests/queries/0_stateless/02990_format_not_precedence.sql @@ -0,0 +1,7 @@ +-- { echoOn } +SELECT NOT 0 + NOT 0; +SELECT NOT (0 + (NOT 0)); +SELECT (NOT 0) + (NOT 0); +SELECT formatQuery('SELECT NOT 0 + NOT 0'); +SELECT formatQuery('SELECT NOT (0 + (NOT 0))'); +SELECT formatQuery('SELECT (NOT 0) + (NOT 0)'); diff --git a/tests/queries/0_stateless/02990_format_select_from_explain.reference b/tests/queries/0_stateless/02990_format_select_from_explain.reference new file mode 100644 index 00000000000..7c8dcef3824 --- /dev/null +++ b/tests/queries/0_stateless/02990_format_select_from_explain.reference @@ -0,0 +1,9 @@ +SELECT explain +FROM +( + SELECT * + FROM viewExplain('EXPLAIN AST', '', ( + SELECT * + FROM system.numbers + )) +) diff --git a/tests/queries/0_stateless/02990_format_select_from_explain.sh b/tests/queries/0_stateless/02990_format_select_from_explain.sh new file mode 100755 index 00000000000..4955b733788 --- 
/dev/null +++ b/tests/queries/0_stateless/02990_format_select_from_explain.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +${CLICKHOUSE_FORMAT} --query "SELECT explain FROM (EXPLAIN AST SELECT * FROM system.numbers)" diff --git a/tests/queries/0_stateless/02990_optimize_uniq_to_count_alias.reference b/tests/queries/0_stateless/02990_optimize_uniq_to_count_alias.reference new file mode 100644 index 00000000000..6ed281c757a --- /dev/null +++ b/tests/queries/0_stateless/02990_optimize_uniq_to_count_alias.reference @@ -0,0 +1,2 @@ +1 +1 diff --git a/tests/queries/0_stateless/02990_optimize_uniq_to_count_alias.sql b/tests/queries/0_stateless/02990_optimize_uniq_to_count_alias.sql new file mode 100644 index 00000000000..5ba0be39991 --- /dev/null +++ b/tests/queries/0_stateless/02990_optimize_uniq_to_count_alias.sql @@ -0,0 +1,37 @@ +--https://github.com/ClickHouse/ClickHouse/issues/59999 +DROP TABLE IF EXISTS tags; +CREATE TABLE tags (dev_tag String) ENGINE = Memory AS SELECT '1'; + +SELECT * +FROM +( + SELECT countDistinct(dev_tag) AS total_devtags + FROM + ( + SELECT dev_tag + FROM + ( + SELECT * + FROM tags + ) AS t + GROUP BY dev_tag + ) AS t +) SETTINGS optimize_uniq_to_count=0; + +SELECT * +FROM +( + SELECT countDistinct(dev_tag) AS total_devtags + FROM + ( + SELECT dev_tag + FROM + ( + SELECT * + FROM tags + ) AS t + GROUP BY dev_tag + ) AS t +) SETTINGS optimize_uniq_to_count=1; + +DROP TABLE IF EXISTS tags; diff --git a/tests/queries/0_stateless/02990_rmt_replica_path_uuid.reference b/tests/queries/0_stateless/02990_rmt_replica_path_uuid.reference new file mode 100644 index 00000000000..5521c015fcf --- /dev/null +++ b/tests/queries/0_stateless/02990_rmt_replica_path_uuid.reference @@ -0,0 +1,4 @@ +aaaaaaaa-1111-2222-3333-aaaaaaaaaaaa +/tables/default/aaaaaaaa-1111-2222-3333-aaaaaaaaaaaa/replicas/r1 +aaaaaaaa-1111-2222-3333-aaaaaaaaaaaa +/tables/default/aaaaaaaa-1111-2222-3333-aaaaaaaaaaaa/replicas/r1 diff --git a/tests/queries/0_stateless/02990_rmt_replica_path_uuid.sql b/tests/queries/0_stateless/02990_rmt_replica_path_uuid.sql new file mode 100644 index 00000000000..4fcdff2910f --- /dev/null +++ b/tests/queries/0_stateless/02990_rmt_replica_path_uuid.sql @@ -0,0 +1,23 @@ +-- Tags: no-parallel, no-ordinary-database, no-replicated-database +-- Tag no-parallel: static UUID +-- Tag no-ordinary-database: requires UUID +-- Tag no-replicated-database: executes with ON CLUSTER anyway + +-- Ignore "ATTACH TABLE query with full table definition is not recommended" +-- Ignore BAD_ARGUMENTS +SET send_logs_level='fatal'; + +DROP TABLE IF EXISTS x; + +ATTACH TABLE x UUID 'aaaaaaaa-1111-2222-3333-aaaaaaaaaaaa' (key Int) ENGINE = ReplicatedMergeTree('/tables/{database}/{uuid}', 'r1') ORDER BY tuple(); +SELECT uuid FROM system.tables WHERE database = currentDatabase() and table = 'x'; +SELECT replica_path FROM system.replicas WHERE database = currentDatabase() and table = 'x'; +DROP TABLE x; + +-- {uuid} macro forbidden for CREATE TABLE without explicit UUID +CREATE TABLE x (key Int) ENGINE = ReplicatedMergeTree('/tables/{database}/{uuid}', 'r1') ORDER BY tuple(); -- { serverError BAD_ARGUMENTS } + +CREATE TABLE x UUID 'aaaaaaaa-1111-2222-3333-aaaaaaaaaaaa' (key Int) ENGINE = ReplicatedMergeTree('/tables/{database}/{uuid}', 'r1') ORDER BY tuple(); +SELECT uuid FROM system.tables WHERE database = currentDatabase() and table = 'x'; +SELECT replica_path FROM system.replicas WHERE 
database = currentDatabase() and table = 'x'; +DROP TABLE x; diff --git a/tests/queries/0_stateless/02991_count_rewrite_analyzer.reference b/tests/queries/0_stateless/02991_count_rewrite_analyzer.reference new file mode 100644 index 00000000000..ccb266fc2b5 --- /dev/null +++ b/tests/queries/0_stateless/02991_count_rewrite_analyzer.reference @@ -0,0 +1,4 @@ +Nullable(UInt64) +UInt64 +Nullable(UInt64) +UInt64 diff --git a/tests/queries/0_stateless/02991_count_rewrite_analyzer.sql b/tests/queries/0_stateless/02991_count_rewrite_analyzer.sql new file mode 100644 index 00000000000..b11aeedd225 --- /dev/null +++ b/tests/queries/0_stateless/02991_count_rewrite_analyzer.sql @@ -0,0 +1,7 @@ +-- Regression test for https://github.com/ClickHouse/ClickHouse/issues/59919 +SET allow_experimental_analyzer=1; + +SELECT toTypeName(sum(toNullable('a') IN toNullable('a'))) AS x; +SELECT toTypeName(count(toNullable('a') IN toNullable('a'))) AS x; +SELECT toTypeName(sum(toFixedString('a', toLowCardinality(toNullable(1))) IN toFixedString('a', 1))) AS x; +SELECT toTypeName(count(toFixedString('a', toLowCardinality(toNullable(1))) IN toFixedString('a', 1))) AS x; diff --git a/tests/queries/0_stateless/02992_settings_overflow.reference b/tests/queries/0_stateless/02992_settings_overflow.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02992_settings_overflow.sql b/tests/queries/0_stateless/02992_settings_overflow.sql new file mode 100644 index 00000000000..d120c3400e5 --- /dev/null +++ b/tests/queries/0_stateless/02992_settings_overflow.sql @@ -0,0 +1 @@ +SET max_threads = -1; -- { serverError CANNOT_CONVERT_TYPE } diff --git a/tests/queries/0_stateless/02993_lazy_index_loading.reference b/tests/queries/0_stateless/02993_lazy_index_loading.reference new file mode 100644 index 00000000000..5bc329ae4eb --- /dev/null +++ b/tests/queries/0_stateless/02993_lazy_index_loading.reference @@ -0,0 +1,4 @@ +100000000 140000000 +0 0 +1 +100000000 100000000 diff --git a/tests/queries/0_stateless/02993_lazy_index_loading.sql b/tests/queries/0_stateless/02993_lazy_index_loading.sql new file mode 100644 index 00000000000..7de4af9ef0e --- /dev/null +++ b/tests/queries/0_stateless/02993_lazy_index_loading.sql @@ -0,0 +1,18 @@ +DROP TABLE IF EXISTS test; +CREATE TABLE test (s String) ENGINE = MergeTree ORDER BY s SETTINGS index_granularity = 1; + +INSERT INTO test SELECT randomString(1000) FROM numbers(100000); +SELECT round(primary_key_bytes_in_memory, -7), round(primary_key_bytes_in_memory_allocated, -7) FROM system.parts WHERE database = currentDatabase() AND table = 'test'; + +DETACH TABLE test; +SET max_memory_usage = '50M'; +ATTACH TABLE test; + +SELECT primary_key_bytes_in_memory, primary_key_bytes_in_memory_allocated FROM system.parts WHERE database = currentDatabase() AND table = 'test'; + +SET max_memory_usage = '200M'; +SELECT s != '' FROM test LIMIT 1; + +SELECT round(primary_key_bytes_in_memory, -7), round(primary_key_bytes_in_memory_allocated, -7) FROM system.parts WHERE database = currentDatabase() AND table = 'test'; + +DROP TABLE test; diff --git a/tests/queries/0_stateless/02993_values_escape_quote.reference b/tests/queries/0_stateless/02993_values_escape_quote.reference new file mode 100644 index 00000000000..29d6a133fec --- /dev/null +++ b/tests/queries/0_stateless/02993_values_escape_quote.reference @@ -0,0 +1,3 @@ +('foo')('foo\'bar')('foo\'\'bar') +output_format_values_escape_quote_with_quote=1 +('foo')('foo''bar')('foo''''bar') diff --git 
a/tests/queries/0_stateless/02993_values_escape_quote.sql b/tests/queries/0_stateless/02993_values_escape_quote.sql new file mode 100644 index 00000000000..e6fc5f1b280 --- /dev/null +++ b/tests/queries/0_stateless/02993_values_escape_quote.sql @@ -0,0 +1,12 @@ +select 'foo' format Values; +select 'foo\'bar' format Values; +select 'foo\'\'bar' format Values; + +select '\noutput_format_values_escape_quote_with_quote=1' format LineAsString; +set output_format_values_escape_quote_with_quote=1; + +select 'foo' format Values; +select 'foo\'bar' format Values; +select 'foo\'\'bar' format Values; +-- fix no newline at end of file +select '' format LineAsString; diff --git a/tests/queries/0_stateless/02994_cosineDistanceNullable.reference b/tests/queries/0_stateless/02994_cosineDistanceNullable.reference new file mode 100644 index 00000000000..e4fe1f97e7e --- /dev/null +++ b/tests/queries/0_stateless/02994_cosineDistanceNullable.reference @@ -0,0 +1,11 @@ +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N diff --git a/tests/queries/0_stateless/02994_cosineDistanceNullable.sql b/tests/queries/0_stateless/02994_cosineDistanceNullable.sql new file mode 100644 index 00000000000..a62216982f3 --- /dev/null +++ b/tests/queries/0_stateless/02994_cosineDistanceNullable.sql @@ -0,0 +1,3 @@ +-- https://github.com/ClickHouse/ClickHouse/issues/59596 +SELECT cosineDistance((1, 1), (toNullable(0.5), 0.1)); +SELECT cosineDistance((1, 1), (toNullable(0.5), 0.1)) from numbers(10); diff --git a/tests/queries/0_stateless/02995_baseline_23_12_1.tsv b/tests/queries/0_stateless/02995_baseline_23_12_1.tsv new file mode 100644 index 00000000000..4c0c9125b46 --- /dev/null +++ b/tests/queries/0_stateless/02995_baseline_23_12_1.tsv @@ -0,0 +1,940 @@ +add_http_cors_header 0 +additional_result_filter +additional_table_filters {} +aggregate_functions_null_for_empty 0 +aggregation_in_order_max_block_bytes 50000000 +aggregation_memory_efficient_merge_threads 0 +allow_aggregate_partitions_independently 0 +allow_asynchronous_read_from_io_pool_for_merge_tree 0 +allow_changing_replica_until_first_data_packet 0 +allow_create_index_without_type 0 +allow_custom_error_code_in_throwif 0 +allow_ddl 1 +allow_deprecated_database_ordinary 0 +allow_deprecated_syntax_for_merge_tree 0 +allow_distributed_ddl 1 +allow_drop_detached 0 +allow_execute_multiif_columnar 1 +allow_experimental_alter_materialized_view_structure 1 +allow_experimental_analyzer 0 +allow_experimental_annoy_index 0 +allow_experimental_bigint_types 1 +allow_experimental_codecs 0 +allow_experimental_database_atomic 1 +allow_experimental_database_materialized_mysql 0 +allow_experimental_database_materialized_postgresql 0 +allow_experimental_database_replicated 0 +allow_experimental_funnel_functions 0 +allow_experimental_geo_types 1 +allow_experimental_hash_functions 0 +allow_experimental_inverted_index 0 +allow_experimental_lightweight_delete 1 +allow_experimental_live_view 0 +allow_experimental_map_type 1 +allow_experimental_materialized_postgresql_table 0 +allow_experimental_nlp_functions 0 +allow_experimental_object_type 0 +allow_experimental_parallel_reading_from_replicas 0 +allow_experimental_projection_optimization 1 +allow_experimental_query_cache 1 +allow_experimental_query_deduplication 0 +allow_experimental_refreshable_materialized_view 0 +allow_experimental_s3queue 1 +allow_experimental_shared_merge_tree 0 +allow_experimental_statistic 0 +allow_experimental_undrop_table_query 1 +allow_experimental_usearch_index 0 +allow_experimental_window_functions 1 
+allow_experimental_window_view 0 +allow_hyperscan 1 +allow_introspection_functions 0 +allow_named_collection_override_by_default 1 +allow_non_metadata_alters 1 +allow_nonconst_timezone_arguments 0 +allow_nondeterministic_mutations 0 +allow_nondeterministic_optimize_skip_unused_shards 0 +allow_prefetched_read_pool_for_local_filesystem 0 +allow_prefetched_read_pool_for_remote_filesystem 1 +allow_push_predicate_when_subquery_contains_with 1 +allow_settings_after_format_in_insert 0 +allow_simdjson 1 +allow_statistic_optimize 0 +allow_suspicious_codecs 0 +allow_suspicious_fixed_string_types 0 +allow_suspicious_indices 0 +allow_suspicious_low_cardinality_types 0 +allow_suspicious_ttl_expressions 0 +allow_unrestricted_reads_from_keeper 0 +alter_move_to_space_execute_async 0 +alter_partition_verbose_result 0 +alter_sync 1 +analyze_index_with_space_filling_curves 1 +annoy_index_search_k_nodes -1 +any_join_distinct_right_table_keys 0 +apply_deleted_mask 1 +apply_mutations_on_fly 0 +asterisk_include_alias_columns 0 +asterisk_include_materialized_columns 0 +async_insert 0 +async_insert_busy_timeout_ms 200 +async_insert_cleanup_timeout_ms 1000 +async_insert_deduplicate 0 +async_insert_max_data_size 1000000 +async_insert_max_query_number 450 +async_insert_stale_timeout_ms 0 +async_insert_threads 16 +async_query_sending_for_remote 1 +async_socket_for_remote 1 +azure_create_new_file_on_insert 0 +azure_list_object_keys_size 1000 +azure_max_single_part_upload_size 104857600 +azure_max_single_read_retries 4 +azure_truncate_on_insert 0 +background_buffer_flush_schedule_pool_size 16 +background_common_pool_size 8 +background_distributed_schedule_pool_size 16 +background_fetches_pool_size 8 +background_merges_mutations_concurrency_ratio 2 +background_message_broker_schedule_pool_size 16 +background_move_pool_size 8 +background_pool_size 16 +background_schedule_pool_size 128 +backup_restore_batch_size_for_keeper_multi 1000 +backup_restore_batch_size_for_keeper_multiread 10000 +backup_restore_keeper_fault_injection_probability 0 +backup_restore_keeper_fault_injection_seed 0 +backup_restore_keeper_max_retries 20 +backup_restore_keeper_retry_initial_backoff_ms 100 +backup_restore_keeper_retry_max_backoff_ms 5000 +backup_restore_keeper_value_max_size 1048576 +backup_threads 16 +bool_false_representation false +bool_true_representation true +cache_warmer_threads 4 +calculate_text_stack_trace 1 +cancel_http_readonly_queries_on_client_close 0 +cast_ipv4_ipv6_default_on_conversion_error 0 +cast_keep_nullable 0 +check_query_single_value_result 1 +check_referential_table_dependencies 0 +check_table_dependencies 1 +checksum_on_read 1 +cloud_mode 0 +cloud_mode_engine 1 +cluster_for_parallel_replicas +collect_hash_table_stats_during_aggregation 1 +column_names_for_schema_inference +compatibility +compatibility_ignore_auto_increment_in_create_table 0 +compatibility_ignore_collation_in_create_table 1 +compile_aggregate_expressions 1 +compile_expressions 0 +compile_sort_description 1 +connect_timeout 10 +connect_timeout_with_failover_ms 1000 +connect_timeout_with_failover_secure_ms 1000 +connection_pool_max_wait_ms 0 +connections_with_failover_max_tries 3 +convert_query_to_cnf 0 +count_distinct_implementation uniqExact +count_distinct_optimization 0 +create_index_ignore_unique 0 +create_replicated_merge_tree_fault_injection_probability 0 +create_table_empty_primary_key_by_default 0 +cross_to_inner_join_rewrite 1 +data_type_default_nullable 0 +database_atomic_wait_for_drop_and_detach_synchronously 0 
+database_replicated_allow_only_replicated_engine 0 +database_replicated_allow_replicated_engine_arguments 1 +database_replicated_always_detach_permanently 0 +database_replicated_ddl_output 1 +database_replicated_enforce_synchronous_settings 0 +database_replicated_initial_query_timeout_sec 300 +date_time_input_format basic +date_time_output_format simple +date_time_overflow_behavior ignore +decimal_check_overflow 1 +deduplicate_blocks_in_dependent_materialized_views 0 +default_database_engine Atomic +default_max_bytes_in_join 1000000000 +default_table_engine None +default_temporary_table_engine Memory +describe_compact_output 0 +describe_extend_object_types 0 +describe_include_subcolumns 0 +describe_include_virtual_columns 0 +dialect clickhouse +dictionary_use_async_executor 0 +distinct_overflow_mode throw +distributed_aggregation_memory_efficient 1 +distributed_background_insert_batch 0 +distributed_background_insert_max_sleep_time_ms 30000 +distributed_background_insert_sleep_time_ms 100 +distributed_background_insert_split_batch_on_failure 0 +distributed_background_insert_timeout 0 +distributed_connections_pool_size 1024 +distributed_ddl_entry_format_version 5 +distributed_ddl_output_mode throw +distributed_ddl_task_timeout 180 +distributed_directory_monitor_batch_inserts 0 +distributed_directory_monitor_max_sleep_time_ms 30000 +distributed_directory_monitor_sleep_time_ms 100 +distributed_directory_monitor_split_batch_on_failure 0 +distributed_foreground_insert 0 +distributed_group_by_no_merge 0 +distributed_product_mode deny +distributed_push_down_limit 1 +distributed_replica_error_cap 1000 +distributed_replica_error_half_life 60 +distributed_replica_max_ignored_errors 0 +do_not_merge_across_partitions_select_final 0 +drain_timeout 3 +empty_result_for_aggregation_by_constant_keys_on_empty_set 1 +empty_result_for_aggregation_by_empty_set 0 +enable_debug_queries 0 +enable_deflate_qpl_codec 0 +enable_early_constant_folding 1 +enable_extended_results_for_datetime_functions 0 +enable_filesystem_cache 1 +enable_filesystem_cache_log 0 +enable_filesystem_cache_on_write_operations 0 +enable_filesystem_read_prefetches_log 0 +enable_global_with_statement 1 +enable_http_compression 0 +enable_job_stack_trace 0 +enable_lightweight_delete 1 +enable_memory_bound_merging_of_aggregation_results 1 +enable_multiple_prewhere_read_steps 1 +enable_optimize_predicate_expression 1 +enable_optimize_predicate_expression_to_final_subquery 1 +enable_order_by_all 1 +enable_positional_arguments 1 +enable_reads_from_query_cache 1 +enable_s3_requests_logging 0 +enable_scalar_subquery_optimization 1 +enable_sharing_sets_for_mutations 1 +enable_software_prefetch_in_aggregation 1 +enable_unaligned_array_join 0 +enable_url_encoding 1 +enable_writes_to_query_cache 1 +engine_file_allow_create_multiple_files 0 +engine_file_empty_if_not_exists 0 +engine_file_skip_empty_files 0 +engine_file_truncate_on_insert 0 +engine_url_skip_empty_files 0 +errors_output_format CSV +exact_rows_before_limit 0 +except_default_mode ALL +external_storage_connect_timeout_sec 10 +external_storage_max_read_bytes 0 +external_storage_max_read_rows 0 +external_storage_rw_timeout_sec 300 +external_table_functions_use_nulls 1 +external_table_strict_query 0 +extract_kvp_max_pairs_per_row 1000 +extremes 0 +fallback_to_stale_replicas_for_distributed_queries 1 +filesystem_cache_max_download_size 137438953472 +filesystem_cache_segments_batch_size 20 +filesystem_prefetch_max_memory_usage 1073741824 +filesystem_prefetch_min_bytes_for_single_read_task 2097152 
+filesystem_prefetch_step_bytes 0 +filesystem_prefetch_step_marks 0 +filesystem_prefetches_limit 200 +final 0 +flatten_nested 1 +force_aggregate_partitions_independently 0 +force_aggregation_in_order 0 +force_data_skipping_indices +force_grouping_standard_compatibility 1 +force_index_by_date 0 +force_optimize_projection 0 +force_optimize_projection_name +force_optimize_skip_unused_shards 0 +force_optimize_skip_unused_shards_nesting 0 +force_primary_key 0 +force_remove_data_recursively_on_drop 0 +format_avro_schema_registry_url +format_binary_max_array_size 1073741824 +format_binary_max_string_size 1073741824 +format_capn_proto_enum_comparising_mode by_values +format_capn_proto_use_autogenerated_schema 1 +format_csv_allow_double_quotes 1 +format_csv_allow_single_quotes 0 +format_csv_delimiter , +format_csv_null_representation \\N +format_custom_escaping_rule Escaped +format_custom_field_delimiter \t +format_custom_result_after_delimiter +format_custom_result_before_delimiter +format_custom_row_after_delimiter \n +format_custom_row_before_delimiter +format_custom_row_between_delimiter +format_display_secrets_in_show_and_select 0 +format_json_object_each_row_column_for_object_name +format_protobuf_use_autogenerated_schema 1 +format_regexp +format_regexp_escaping_rule Raw +format_regexp_skip_unmatched 0 +format_schema +format_template_resultset +format_template_row +format_template_rows_between_delimiter \n +format_tsv_null_representation \\N +formatdatetime_f_prints_single_zero 0 +formatdatetime_format_without_leading_zeros 0 +formatdatetime_parsedatetime_m_is_month_name 1 +fsync_metadata 1 +function_implementation +function_json_value_return_type_allow_complex 0 +function_json_value_return_type_allow_nullable 0 +function_range_max_elements_in_block 500000000 +function_sleep_max_microseconds_per_block 3000000 +glob_expansion_max_elements 1000 +grace_hash_join_initial_buckets 1 +grace_hash_join_max_buckets 1024 +group_by_overflow_mode throw +group_by_two_level_threshold 100000 +group_by_two_level_threshold_bytes 50000000 +group_by_use_nulls 0 +handle_kafka_error_mode default +handshake_timeout_ms 10000 +hdfs_create_new_file_on_insert 0 +hdfs_replication 0 +hdfs_skip_empty_files 0 +hdfs_truncate_on_insert 0 +hedged_connection_timeout_ms 50 +hsts_max_age 0 +http_connection_timeout 1 +http_headers_progress_interval_ms 100 +http_make_head_request 1 +http_max_chunk_size 107374182400 +http_max_field_name_size 131072 +http_max_field_value_size 131072 +http_max_fields 1000000 +http_max_multipart_form_data_size 1073741824 +http_max_request_param_data_size 10485760 +http_max_tries 10 +http_max_uri_size 1048576 +http_native_compression_disable_checksumming_on_decompress 0 +http_receive_timeout 30 +http_response_buffer_size 0 +http_retry_initial_backoff_ms 100 +http_retry_max_backoff_ms 10000 +http_send_timeout 30 +http_skip_not_found_url_for_globs 1 +http_wait_end_of_query 0 +http_write_exception_in_output_format 1 +http_zlib_compression_level 3 +idle_connection_timeout 3600 +ignore_cold_parts_seconds 0 +ignore_data_skipping_indices +ignore_on_cluster_for_replicated_access_entities_queries 0 +ignore_on_cluster_for_replicated_udf_queries 0 +implicit_transaction 0 +input_format_allow_errors_num 0 +input_format_allow_errors_ratio 0 +input_format_allow_seeks 1 +input_format_arrow_allow_missing_columns 1 +input_format_arrow_case_insensitive_column_matching 0 +input_format_arrow_import_nested 0 +input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference 0 +input_format_avro_allow_missing_fields 
0 +input_format_avro_null_as_default 0 +input_format_bson_skip_fields_with_unsupported_types_in_schema_inference 0 +input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference 0 +input_format_csv_allow_cr_end_of_line 0 +input_format_csv_allow_variable_number_of_columns 0 +input_format_csv_allow_whitespace_or_tab_as_delimiter 0 +input_format_csv_arrays_as_nested_csv 0 +input_format_csv_detect_header 1 +input_format_csv_empty_as_default 1 +input_format_csv_enum_as_number 0 +input_format_csv_skip_first_lines 0 +input_format_csv_skip_trailing_empty_lines 0 +input_format_csv_trim_whitespaces 1 +input_format_csv_try_infer_numbers_from_strings 0 +input_format_csv_use_best_effort_in_schema_inference 1 +input_format_csv_use_default_on_bad_values 0 +input_format_custom_allow_variable_number_of_columns 0 +input_format_custom_detect_header 1 +input_format_custom_skip_trailing_empty_lines 0 +input_format_defaults_for_omitted_fields 1 +input_format_hive_text_collection_items_delimiter  +input_format_hive_text_fields_delimiter  +input_format_hive_text_map_keys_delimiter  +input_format_import_nested_json 0 +input_format_ipv4_default_on_conversion_error 0 +input_format_ipv6_default_on_conversion_error 0 +input_format_json_compact_allow_variable_number_of_columns 0 +input_format_json_defaults_for_missing_elements_in_named_tuple 1 +input_format_json_ignore_unknown_keys_in_named_tuple 1 +input_format_json_infer_incomplete_types_as_strings 1 +input_format_json_named_tuples_as_objects 1 +input_format_json_read_arrays_as_strings 1 +input_format_json_read_bools_as_numbers 1 +input_format_json_read_numbers_as_strings 1 +input_format_json_read_objects_as_strings 1 +input_format_json_try_infer_named_tuples_from_objects 1 +input_format_json_try_infer_numbers_from_strings 0 +input_format_json_validate_types_from_metadata 1 +input_format_max_bytes_to_read_for_schema_inference 33554432 +input_format_max_rows_to_read_for_schema_inference 25000 +input_format_msgpack_number_of_columns 0 +input_format_mysql_dump_map_column_names 1 +input_format_mysql_dump_table_name +input_format_native_allow_types_conversion 1 +input_format_null_as_default 1 +input_format_orc_allow_missing_columns 1 +input_format_orc_case_insensitive_column_matching 0 +input_format_orc_filter_push_down 1 +input_format_orc_import_nested 0 +input_format_orc_row_batch_size 100000 +input_format_orc_skip_columns_with_unsupported_types_in_schema_inference 0 +input_format_orc_use_fast_decoder 1 +input_format_parallel_parsing 1 +input_format_parquet_allow_missing_columns 1 +input_format_parquet_case_insensitive_column_matching 0 +input_format_parquet_filter_push_down 1 +input_format_parquet_import_nested 0 +input_format_parquet_local_file_min_bytes_for_seek 8192 +input_format_parquet_max_block_size 8192 +input_format_parquet_preserve_order 0 +input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference 0 +input_format_protobuf_flatten_google_wrappers 0 +input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference 0 +input_format_record_errors_file_path +input_format_skip_unknown_fields 1 +input_format_try_infer_dates 1 +input_format_try_infer_datetimes 1 +input_format_try_infer_integers 1 +input_format_tsv_allow_variable_number_of_columns 0 +input_format_tsv_detect_header 1 +input_format_tsv_empty_as_default 0 +input_format_tsv_enum_as_number 0 +input_format_tsv_skip_first_lines 0 +input_format_tsv_skip_trailing_empty_lines 0 +input_format_tsv_use_best_effort_in_schema_inference 1 
+input_format_values_accurate_types_of_literals 1 +input_format_values_allow_data_after_semicolon 0 +input_format_values_deduce_templates_of_expressions 1 +input_format_values_interpret_expressions 1 +input_format_with_names_use_header 1 +input_format_with_types_use_header 1 +insert_allow_materialized_columns 0 +insert_deduplicate 1 +insert_deduplication_token +insert_distributed_one_random_shard 0 +insert_distributed_sync 0 +insert_distributed_timeout 0 +insert_keeper_fault_injection_probability 0 +insert_keeper_fault_injection_seed 0 +insert_keeper_max_retries 20 +insert_keeper_retry_initial_backoff_ms 100 +insert_keeper_retry_max_backoff_ms 10000 +insert_null_as_default 1 +insert_quorum 0 +insert_quorum_parallel 1 +insert_quorum_timeout 600000 +insert_shard_id 0 +interactive_delay 100000 +intersect_default_mode ALL +interval_output_format numeric +join_algorithm default +join_any_take_last_row 0 +join_default_strictness ALL +join_on_disk_max_files_to_merge 64 +join_overflow_mode throw +join_use_nulls 0 +joined_subquery_requires_alias 1 +kafka_disable_num_consumers_limit 0 +kafka_max_wait_ms 5000 +keeper_map_strict_mode 0 +legacy_column_name_of_tuple_literal 0 +limit 0 +live_view_heartbeat_interval 15 +load_balancing random +load_balancing_first_offset 0 +load_marks_asynchronously 0 +local_filesystem_read_method pread_threadpool +local_filesystem_read_prefetch 0 +lock_acquire_timeout 120 +log_comment +log_formatted_queries 0 +log_processors_profiles 0 +log_profile_events 1 +log_queries 1 +log_queries_cut_to_length 100000 +log_queries_min_query_duration_ms 0 +log_queries_min_type QUERY_START +log_queries_probability 1 +log_query_settings 1 +log_query_threads 0 +log_query_views 1 +low_cardinality_allow_in_native_format 1 +low_cardinality_max_dictionary_size 8192 +low_cardinality_use_single_dictionary_for_part 0 +materialize_ttl_after_modify 1 +materialized_views_ignore_errors 0 +max_alter_threads \'auto(16)\' +max_analyze_depth 5000 +max_ast_depth 1000 +max_ast_elements 50000 +max_backup_bandwidth 0 +max_block_size 65409 +max_bytes_before_external_group_by 0 +max_bytes_before_external_sort 0 +max_bytes_before_remerge_sort 1000000000 +max_bytes_in_distinct 0 +max_bytes_in_join 0 +max_bytes_in_set 0 +max_bytes_to_read 0 +max_bytes_to_read_leaf 0 +max_bytes_to_sort 0 +max_bytes_to_transfer 0 +max_columns_to_read 0 +max_compress_block_size 1048576 +max_concurrent_queries_for_all_users 0 +max_concurrent_queries_for_user 0 +max_distributed_connections 1024 +max_distributed_depth 5 +max_download_buffer_size 10485760 +max_download_threads 4 +max_entries_for_hash_table_stats 10000 +max_execution_speed 0 +max_execution_speed_bytes 0 +max_execution_time 0 +max_execution_time_leaf 0 +max_expanded_ast_elements 500000 +max_fetch_partition_retries_count 5 +max_final_threads \'auto(16)\' +max_http_get_redirects 0 +max_hyperscan_regexp_length 0 +max_hyperscan_regexp_total_length 0 +max_insert_block_size 1048449 +max_insert_delayed_streams_for_parallel_write 0 +max_insert_threads 0 +max_joined_block_size_rows 65409 +max_limit_for_ann_queries 1000000 +max_live_view_insert_blocks_before_refresh 64 +max_local_read_bandwidth 0 +max_local_write_bandwidth 0 +max_memory_usage 0 +max_memory_usage_for_all_queries 0 +max_memory_usage_for_user 0 +max_network_bandwidth 0 +max_network_bandwidth_for_all_users 0 +max_network_bandwidth_for_user 0 +max_network_bytes 0 +max_number_of_partitions_for_independent_aggregation 128 +max_parallel_replicas 1 +max_parser_depth 1000 +max_partition_size_to_drop 50000000000 
+max_partitions_per_insert_block 100 +max_partitions_to_read -1 +max_pipeline_depth 0 +max_query_size 262144 +max_read_buffer_size 1048576 +max_read_buffer_size_local_fs 131072 +max_read_buffer_size_remote_fs 0 +max_remote_read_network_bandwidth 0 +max_remote_read_network_bandwidth_for_server 0 +max_remote_write_network_bandwidth 0 +max_remote_write_network_bandwidth_for_server 0 +max_replica_delay_for_distributed_queries 300 +max_replicated_fetches_network_bandwidth_for_server 0 +max_replicated_sends_network_bandwidth_for_server 0 +max_result_bytes 0 +max_result_rows 0 +max_rows_in_distinct 0 +max_rows_in_join 0 +max_rows_in_set 0 +max_rows_in_set_to_optimize_join 100000 +max_rows_to_group_by 0 +max_rows_to_read 0 +max_rows_to_read_leaf 0 +max_rows_to_sort 0 +max_rows_to_transfer 0 +max_sessions_for_user 0 +max_size_to_preallocate_for_aggregation 100000000 +max_streams_for_merge_tree_reading 0 +max_streams_multiplier_for_merge_tables 5 +max_streams_to_max_threads_ratio 1 +max_subquery_depth 100 +max_table_size_to_drop 50000000000 +max_temporary_columns 0 +max_temporary_data_on_disk_size_for_query 0 +max_temporary_data_on_disk_size_for_user 0 +max_temporary_non_const_columns 0 +max_threads \'auto(16)\' +max_threads_for_annoy_index_creation 4 +max_threads_for_indexes 0 +max_untracked_memory 4194304 +memory_overcommit_ratio_denominator 1073741824 +memory_overcommit_ratio_denominator_for_user 1073741824 +memory_profiler_sample_max_allocation_size 0 +memory_profiler_sample_min_allocation_size 0 +memory_profiler_sample_probability 0 +memory_profiler_step 4194304 +memory_tracker_fault_probability 0 +memory_usage_overcommit_max_wait_microseconds 5000000 +merge_tree_clear_old_parts_interval_seconds 1 +merge_tree_clear_old_temporary_directories_interval_seconds 60 +merge_tree_coarse_index_granularity 8 +merge_tree_compact_parts_min_granules_to_multibuffer_read 16 +merge_tree_determine_task_size_by_prewhere_columns 1 +merge_tree_max_bytes_to_use_cache 2013265920 +merge_tree_max_rows_to_use_cache 1048576 +merge_tree_min_bytes_for_concurrent_read 251658240 +merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem 251658240 +merge_tree_min_bytes_for_seek 0 +merge_tree_min_bytes_per_task_for_remote_reading 4194304 +merge_tree_min_rows_for_concurrent_read 163840 +merge_tree_min_rows_for_concurrent_read_for_remote_filesystem 163840 +merge_tree_min_rows_for_seek 0 +merge_tree_use_const_size_tasks_for_remote_reading 1 +metrics_perf_events_enabled 0 +metrics_perf_events_list +min_bytes_to_use_direct_io 0 +min_bytes_to_use_mmap_io 0 +min_chunk_bytes_for_parallel_parsing 10485760 +min_compress_block_size 65536 +min_count_to_compile_aggregate_expression 3 +min_count_to_compile_expression 3 +min_count_to_compile_sort_description 3 +min_execution_speed 0 +min_execution_speed_bytes 0 +min_free_disk_space_for_temporary_data 0 +min_hit_rate_to_use_consecutive_keys_optimization 0.5 +min_insert_block_size_bytes 268402944 +min_insert_block_size_bytes_for_materialized_views 0 +min_insert_block_size_rows 1048449 +min_insert_block_size_rows_for_materialized_views 0 +move_all_conditions_to_prewhere 1 +move_primary_key_columns_to_end_of_prewhere 1 +multiple_joins_rewriter_version 0 +multiple_joins_try_to_keep_original_names 0 +mutations_execute_nondeterministic_on_initiator 0 +mutations_execute_subqueries_on_initiator 0 +mutations_max_literal_size_to_replace 16384 +mutations_sync 0 +mysql_datatypes_support_level +mysql_map_fixed_string_to_text_in_show_columns 0 +mysql_map_string_to_text_in_show_columns 0 
+mysql_max_rows_to_insert 65536 +network_compression_method LZ4 +network_zstd_compression_level 1 +normalize_function_names 1 +number_of_mutations_to_delay 0 +number_of_mutations_to_throw 0 +odbc_bridge_connection_pool_size 16 +odbc_bridge_use_connection_pooling 1 +odbc_max_field_size 0 +offset 0 +opentelemetry_start_trace_probability 0 +opentelemetry_trace_processors 0 +optimize_aggregation_in_order 0 +optimize_aggregators_of_group_by_keys 1 +optimize_append_index 0 +optimize_arithmetic_operations_in_aggregate_functions 1 +optimize_count_from_files 1 +optimize_distinct_in_order 1 +optimize_distributed_group_by_sharding_key 1 +optimize_duplicate_order_by_and_distinct 0 +optimize_functions_to_subcolumns 0 +optimize_fuse_sum_count_avg 0 +optimize_group_by_constant_keys 1 +optimize_group_by_function_keys 1 +optimize_if_chain_to_multiif 0 +optimize_if_transform_strings_to_enum 0 +optimize_injective_functions_inside_uniq 1 +optimize_min_equality_disjunction_chain_length 3 +optimize_min_inequality_conjunction_chain_length 3 +optimize_monotonous_functions_in_order_by 0 +optimize_move_functions_out_of_any 0 +optimize_move_to_prewhere 1 +optimize_move_to_prewhere_if_final 0 +optimize_multiif_to_if 1 +optimize_normalize_count_variants 1 +optimize_on_insert 1 +optimize_or_like_chain 0 +optimize_read_in_order 1 +optimize_read_in_window_order 1 +optimize_redundant_functions_in_order_by 1 +optimize_respect_aliases 1 +optimize_rewrite_aggregate_function_with_if 1 +optimize_rewrite_array_exists_to_has 0 +optimize_rewrite_sum_if_to_count_if 0 +optimize_skip_merged_partitions 0 +optimize_skip_unused_shards 0 +optimize_skip_unused_shards_limit 1000 +optimize_skip_unused_shards_nesting 0 +optimize_skip_unused_shards_rewrite_in 1 +optimize_sorting_by_input_stream_properties 1 +optimize_substitute_columns 0 +optimize_syntax_fuse_functions 0 +optimize_throw_if_noop 0 +optimize_trivial_approximate_count_query 0 +optimize_trivial_count_query 1 +optimize_trivial_insert_select 1 +optimize_uniq_to_count 1 +optimize_use_implicit_projections 1 +optimize_use_projections 1 +optimize_using_constraints 0 +os_thread_priority 0 +output_format_arrow_compression_method lz4_frame +output_format_arrow_fixed_string_as_fixed_byte_array 1 +output_format_arrow_low_cardinality_as_dictionary 0 +output_format_arrow_string_as_string 0 +output_format_avro_codec +output_format_avro_rows_in_file 1 +output_format_avro_string_column_pattern +output_format_avro_sync_interval 16384 +output_format_bson_string_as_string 0 +output_format_csv_crlf_end_of_line 0 +output_format_decimal_trailing_zeros 0 +output_format_enable_streaming 0 +output_format_json_array_of_rows 0 +output_format_json_escape_forward_slashes 1 +output_format_json_named_tuples_as_objects 1 +output_format_json_quote_64bit_floats 0 +output_format_json_quote_64bit_integers 1 +output_format_json_quote_decimals 0 +output_format_json_quote_denormals 0 +output_format_json_skip_null_value_in_named_tuples 0 +output_format_json_validate_utf8 0 +output_format_markdown_escape_special_characters 0 +output_format_msgpack_uuid_representation ext +output_format_orc_compression_method lz4 +output_format_orc_row_index_stride 10000 +output_format_orc_string_as_string 0 +output_format_parallel_formatting 1 +output_format_parquet_batch_size 1024 +output_format_parquet_compliant_nested_types 1 +output_format_parquet_compression_method lz4 +output_format_parquet_data_page_size 1048576 +output_format_parquet_fixed_string_as_fixed_byte_array 1 +output_format_parquet_parallel_encoding 1 
+output_format_parquet_row_group_size 1000000 +output_format_parquet_row_group_size_bytes 536870912 +output_format_parquet_string_as_string 0 +output_format_parquet_use_custom_encoder 0 +output_format_parquet_version 2.latest +output_format_pretty_color 1 +output_format_pretty_grid_charset UTF-8 +output_format_pretty_max_column_pad_width 250 +output_format_pretty_max_rows 10000 +output_format_pretty_max_value_width 10000 +output_format_pretty_row_numbers 0 +output_format_protobuf_nullables_with_google_wrappers 0 +output_format_schema +output_format_sql_insert_include_column_names 1 +output_format_sql_insert_max_batch_size 65409 +output_format_sql_insert_quote_names 1 +output_format_sql_insert_table_name table +output_format_sql_insert_use_replace 0 +output_format_tsv_crlf_end_of_line 0 +output_format_write_statistics 1 +parallel_distributed_insert_select 0 +parallel_replica_offset 0 +parallel_replicas_count 0 +parallel_replicas_custom_key +parallel_replicas_custom_key_filter_type default +parallel_replicas_for_non_replicated_merge_tree 0 +parallel_replicas_min_number_of_granules_to_enable 0 +parallel_replicas_min_number_of_rows_per_replica 0 +parallel_replicas_single_task_marks_count_multiplier 2 +parallel_view_processing 0 +parallelize_output_from_storages 1 +parsedatetime_parse_without_leading_zeros 1 +partial_merge_join_left_table_buffer_bytes 0 +partial_merge_join_optimizations 0 +partial_merge_join_rows_in_right_blocks 65536 +partial_result_on_first_cancel 0 +parts_to_delay_insert 0 +parts_to_throw_insert 0 +periodic_live_view_refresh 60 +poll_interval 10 +postgresql_connection_pool_auto_close_connection 0 +postgresql_connection_pool_size 16 +postgresql_connection_pool_wait_timeout 5000 +precise_float_parsing 0 +prefer_column_name_to_alias 0 +prefer_global_in_and_join 0 +prefer_localhost_replica 1 +prefer_warmed_unmerged_parts_seconds 0 +preferred_block_size_bytes 1000000 +preferred_max_column_in_block_size_bytes 0 +preferred_optimize_projection_name +prefetch_buffer_size 1048576 +print_pretty_type_names 0 +priority 0 +query_cache_compress_entries 1 +query_cache_max_entries 0 +query_cache_max_size_in_bytes 0 +query_cache_min_query_duration 0 +query_cache_min_query_runs 0 +query_cache_nondeterministic_function_handling throw +query_cache_share_between_users 0 +query_cache_squash_partial_results 1 +query_cache_store_results_of_queries_with_nondeterministic_functions 0 +query_cache_ttl 60 +query_plan_aggregation_in_order 1 +query_plan_enable_multithreading_after_window_functions 1 +query_plan_enable_optimizations 1 +query_plan_execute_functions_after_sorting 1 +query_plan_filter_push_down 1 +query_plan_lift_up_array_join 1 +query_plan_lift_up_union 1 +query_plan_max_optimizations_to_apply 10000 +query_plan_merge_expressions 1 +query_plan_optimize_primary_key 1 +query_plan_optimize_projection 1 +query_plan_push_down_limit 1 +query_plan_read_in_order 1 +query_plan_remove_redundant_distinct 1 +query_plan_remove_redundant_sorting 1 +query_plan_reuse_storage_ordering_for_window_functions 1 +query_plan_split_filter 1 +query_profiler_cpu_time_period_ns 1000000000 +query_profiler_real_time_period_ns 1000000000 +queue_max_wait_ms 0 +rabbitmq_max_wait_ms 5000 +read_backoff_max_throughput 1048576 +read_backoff_min_concurrency 1 +read_backoff_min_events 2 +read_backoff_min_interval_between_events_ms 1000 +read_backoff_min_latency_ms 1000 +read_from_filesystem_cache_if_exists_otherwise_bypass_cache 0 +read_in_order_two_level_merge_threshold 100 +read_overflow_mode throw +read_overflow_mode_leaf throw 
+read_priority 0 +readonly 0 +receive_data_timeout_ms 2000 +receive_timeout 300 +regexp_dict_allow_hyperscan 1 +regexp_dict_flag_case_insensitive 0 +regexp_dict_flag_dotall 0 +regexp_max_matches_per_row 1000 +reject_expensive_hyperscan_regexps 1 +remerge_sort_lowered_memory_bytes_ratio 2 +remote_filesystem_read_method threadpool +remote_filesystem_read_prefetch 1 +remote_fs_read_backoff_max_tries 5 +remote_fs_read_max_backoff_ms 10000 +remote_read_min_bytes_for_seek 4194304 +rename_files_after_processing +replace_running_query 0 +replace_running_query_max_wait_ms 5000 +replication_alter_columns_timeout 60 +replication_alter_partitions_sync 1 +replication_wait_for_inactive_replica_timeout 120 +restore_threads 16 +result_overflow_mode throw +rewrite_count_distinct_if_with_count_distinct_implementation 0 +s3_allow_parallel_part_upload 1 +s3_check_objects_after_upload 0 +s3_create_new_file_on_insert 0 +s3_disable_checksum 0 +s3_http_connection_pool_size 1000 +s3_list_object_keys_size 1000 +s3_max_connections 1024 +s3_max_get_burst 0 +s3_max_get_rps 0 +s3_max_inflight_parts_for_one_file 20 +s3_max_put_burst 0 +s3_max_put_rps 0 +s3_max_redirects 10 +s3_max_single_part_upload_size 33554432 +s3_max_single_read_retries 4 +s3_max_unexpected_write_error_retries 4 +s3_max_upload_part_size 5368709120 +s3_min_upload_part_size 16777216 +s3_request_timeout_ms 30000 +s3_retry_attempts 100 +s3_skip_empty_files 0 +s3_strict_upload_part_size 0 +s3_throw_on_zero_files_match 0 +s3_truncate_on_insert 0 +s3_upload_part_size_multiply_factor 2 +s3_upload_part_size_multiply_parts_count_threshold 500 +s3_use_adaptive_timeouts 1 +s3queue_default_zookeeper_path /clickhouse/s3queue/ +s3queue_enable_logging_to_s3queue_log 0 +schema_inference_cache_require_modification_time_for_url 1 +schema_inference_hints +schema_inference_make_columns_nullable 1 +schema_inference_mode default +schema_inference_use_cache_for_azure 1 +schema_inference_use_cache_for_file 1 +schema_inference_use_cache_for_hdfs 1 +schema_inference_use_cache_for_s3 1 +schema_inference_use_cache_for_url 1 +select_sequential_consistency 0 +send_logs_level fatal +send_logs_source_regexp +send_progress_in_http_headers 0 +send_timeout 300 +session_timezone +set_overflow_mode throw +short_circuit_function_evaluation enable +show_table_uuid_in_table_create_query_if_not_nil 0 +single_join_prefer_left_table 1 +skip_download_if_exceeds_query_cache 1 +skip_unavailable_shards 0 +sleep_after_receiving_query_ms 0 +sleep_in_send_data_ms 0 +sleep_in_send_tables_status_ms 0 +sort_overflow_mode throw +splitby_max_substrings_includes_remaining_string 0 +stop_refreshable_materialized_views_on_startup 0 +storage_file_read_method pread +storage_system_stack_trace_pipe_read_timeout_ms 100 +stream_flush_interval_ms 7500 +stream_like_engine_allow_direct_select 0 +stream_like_engine_insert_queue +stream_poll_timeout_ms 500 +system_events_show_zero_values 0 +table_function_remote_max_addresses 1000 +tcp_keep_alive_timeout 290 +temporary_files_codec LZ4 +temporary_live_view_timeout 1 +throw_if_no_data_to_insert 1 +throw_on_error_from_cache_on_write_operations 0 +throw_on_max_partitions_per_insert_block 1 +throw_on_unsupported_query_inside_transaction 1 +timeout_before_checking_execution_speed 10 +timeout_overflow_mode throw +timeout_overflow_mode_leaf throw +totals_auto_threshold 0.5 +totals_mode after_having_exclusive +trace_profile_events 0 +transfer_overflow_mode throw +transform_null_in 0 +union_default_mode +unknown_packet_in_send_data 0 +use_cache_for_count_from_files 1 
+use_client_time_zone 0 +use_compact_format_in_distributed_parts_names 1 +use_concurrency_control 1 +use_hedged_requests 1 +use_index_for_in_with_subqueries 1 +use_index_for_in_with_subqueries_max_values 0 +use_local_cache_for_remote_storage 1 +use_mysql_types_in_show_columns 0 +use_query_cache 0 +use_skip_indexes 1 +use_skip_indexes_if_final 0 +use_structure_from_insertion_table_in_table_functions 2 +use_uncompressed_cache 0 +use_with_fill_by_sorting_prefix 1 +validate_polygons 1 +wait_changes_become_visible_after_commit_mode wait_unknown +wait_for_async_insert 1 +wait_for_async_insert_timeout 120 +wait_for_window_view_fire_signal_timeout 10 +window_view_clean_interval 60 +window_view_heartbeat_interval 15 +workload default +zstd_window_log_max 0 diff --git a/tests/queries/0_stateless/02995_new_settings_history.reference b/tests/queries/0_stateless/02995_new_settings_history.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02995_new_settings_history.sh b/tests/queries/0_stateless/02995_new_settings_history.sh new file mode 100755 index 00000000000..8de98c55b6a --- /dev/null +++ b/tests/queries/0_stateless/02995_new_settings_history.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# Tags: no-tsan, no-asan, no-ubsan, no-msan, no-cpu-aarch64, no-random-settings +# Some settings can be different for builds with sanitizers or aarch64 + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# Note that this is a broad check. A per version check is done in the upgrade test +# Baseline generated with 23.12.1 +# clickhouse local --query "select name, default from system.settings order by name format TSV" > 02995_baseline_23_12_1.tsv +$CLICKHOUSE_LOCAL --query " + WITH old_settings AS + ( + SELECT * FROM file('${CUR_DIR}/02995_baseline_23_12_1.tsv', 'TSV', 'name String, default String') + ), + new_settings AS + ( + -- Ignore settings that depend on the machine config (max_threads and similar) + SELECT name, default FROM system.settings WHERE default NOT LIKE '%auto(%' + ) + SELECT * FROM + ( + SELECT 'PLEASE ADD THE NEW SETTING TO SettingsChangesHistory.h: ' || name || ' WAS ADDED', + FROM new_settings + WHERE (name NOT IN ( + SELECT name + FROM old_settings + )) AND (name NOT IN ( + SELECT arrayJoin(tupleElement(changes, 'name')) + FROM system.settings_changes + WHERE splitByChar('.', version())[1] >= '24' + )) + UNION ALL + ( + SELECT 'PLEASE ADD THE SETTING VALUE CHANGE TO SettingsChangesHistory.h: ' || name || ' WAS CHANGED FROM ' || old_settings.default || ' TO ' || new_settings.default, + FROM new_settings + LEFT JOIN old_settings ON new_settings.name = old_settings.name + WHERE (new_settings.default != old_settings.default) AND (name NOT IN ( + SELECT arrayJoin(tupleElement(changes, 'name')) + FROM system.settings_changes + WHERE splitByChar('.', version())[1] >= '24' + )) + ) + ) +" diff --git a/tests/queries/0_stateless/02996_analyzer_prewhere_projection.reference b/tests/queries/0_stateless/02996_analyzer_prewhere_projection.reference new file mode 100644 index 00000000000..72749c905a3 --- /dev/null +++ b/tests/queries/0_stateless/02996_analyzer_prewhere_projection.reference @@ -0,0 +1 @@ +1 1 1 diff --git a/tests/queries/0_stateless/02996_analyzer_prewhere_projection.sql b/tests/queries/0_stateless/02996_analyzer_prewhere_projection.sql new file mode 100644 index 00000000000..9d676001010 --- /dev/null +++ 
b/tests/queries/0_stateless/02996_analyzer_prewhere_projection.sql @@ -0,0 +1,7 @@ +SET allow_suspicious_low_cardinality_types=1; + +CREATE TABLE t__fuzz_0 (`i` LowCardinality(Int32), `j` Int32, `k` Int32, PROJECTION p (SELECT * ORDER BY j)) ENGINE = MergeTree ORDER BY i SETTINGS index_granularity = 1; +INSERT INTO t__fuzz_0 Select number, number, number FROM numbers(100); + +SELECT * FROM t__fuzz_0 PREWHERE 7 AND (i < 2147483647) AND (j IN (2147483646, -2, 1)) +SETTINGS allow_experimental_analyzer = true; diff --git a/tests/queries/0_stateless/02996_nullable_arrayReduce.reference b/tests/queries/0_stateless/02996_nullable_arrayReduce.reference new file mode 100644 index 00000000000..96afb8546ef --- /dev/null +++ b/tests/queries/0_stateless/02996_nullable_arrayReduce.reference @@ -0,0 +1,15 @@ +-- { echoOn } +SELECT arrayReduce('sum', []::Array(UInt8)) as a, toTypeName(a); +0 UInt64 +SELECT arrayReduce('sumOrNull', []::Array(UInt8)) as a, toTypeName(a); +\N Nullable(UInt64) +SELECT arrayReduce('sum', [NULL]::Array(Nullable(UInt8))) as a, toTypeName(a); +\N Nullable(UInt64) +SELECT arrayReduce('sum', [NULL, 10]::Array(Nullable(UInt8))) as a, toTypeName(a); +10 Nullable(UInt64) +SELECT arrayReduce('any_respect_nulls', [NULL, 10]::Array(Nullable(UInt8))) as a, toTypeName(a); +\N Nullable(UInt8) +SELECT arrayReduce('any_respect_nulls', [10, NULL]::Array(Nullable(UInt8))) as a, toTypeName(a); +10 Nullable(UInt8) +SELECT arrayReduce('median', [toLowCardinality(toNullable(8))]) as t, toTypeName(t); +8 Nullable(Float64) diff --git a/tests/queries/0_stateless/02996_nullable_arrayReduce.sql b/tests/queries/0_stateless/02996_nullable_arrayReduce.sql new file mode 100644 index 00000000000..8f69296dbe5 --- /dev/null +++ b/tests/queries/0_stateless/02996_nullable_arrayReduce.sql @@ -0,0 +1,17 @@ +-- https://github.com/ClickHouse/ClickHouse/issues/59600 +SELECT arrayReduce(toNullable('stddevSampOrNull'), [1]); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT arrayReduce(toNullable('median'), [toDecimal32OrNull(toFixedString('1', 1), 2), 8]); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT toFixedString('--- Int Empty ---', toLowCardinality(17)), arrayReduce(toNullable('avgOrNull'), [1]); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT arrayReduce('any', toNullable(3)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT arrayReduce(toLowCardinality('median'), [toLowCardinality(toNullable(8))]); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } + +-- { echoOn } +SELECT arrayReduce('sum', []::Array(UInt8)) as a, toTypeName(a); +SELECT arrayReduce('sumOrNull', []::Array(UInt8)) as a, toTypeName(a); +SELECT arrayReduce('sum', [NULL]::Array(Nullable(UInt8))) as a, toTypeName(a); +SELECT arrayReduce('sum', [NULL, 10]::Array(Nullable(UInt8))) as a, toTypeName(a); +SELECT arrayReduce('any_respect_nulls', [NULL, 10]::Array(Nullable(UInt8))) as a, toTypeName(a); +SELECT arrayReduce('any_respect_nulls', [10, NULL]::Array(Nullable(UInt8))) as a, toTypeName(a); + +SELECT arrayReduce('median', [toLowCardinality(toNullable(8))]) as t, toTypeName(t); +-- { echoOff } diff --git a/tests/queries/0_stateless/02997_projections_formatting.reference b/tests/queries/0_stateless/02997_projections_formatting.reference new file mode 100644 index 00000000000..6a60da1089a --- /dev/null +++ b/tests/queries/0_stateless/02997_projections_formatting.reference @@ -0,0 +1,26 @@ +CREATE TEMPORARY TABLE t_proj +( + `t` DateTime, + `id` UInt64, + PROJECTION p + ( + SELECT + id, + t + ORDER BY toStartOfDay(t) + ) +) +ENGINE = MergeTree 
+ORDER BY id +CREATE TEMPORARY TABLE t_proj2 +( + `a` UInt32, + `b` UInt32, + PROJECTION p + ( + SELECT a + ORDER BY b * 2 + ) +) +ENGINE = MergeTree +ORDER BY a diff --git a/tests/queries/0_stateless/02997_projections_formatting.sql b/tests/queries/0_stateless/02997_projections_formatting.sql new file mode 100644 index 00000000000..b593c2576b1 --- /dev/null +++ b/tests/queries/0_stateless/02997_projections_formatting.sql @@ -0,0 +1,5 @@ +CREATE TEMPORARY TABLE t_proj (t DateTime, id UInt64, PROJECTION p (SELECT id, t ORDER BY toStartOfDay(t))) ENGINE = MergeTree ORDER BY id; +SHOW CREATE TEMPORARY TABLE t_proj FORMAT TSVRaw; + +CREATE TEMPORARY TABLE t_proj2 (a UInt32, b UInt32, PROJECTION p (SELECT a ORDER BY b * 2)) ENGINE = MergeTree ORDER BY a; +SHOW CREATE TEMPORARY TABLE t_proj2 FORMAT TSVRaw; diff --git a/tests/queries/1_stateful/00037_uniq_state_merge1.sql b/tests/queries/1_stateful/00037_uniq_state_merge1.sql index c941a14b571..6abaad7297f 100644 --- a/tests/queries/1_stateful/00037_uniq_state_merge1.sql +++ b/tests/queries/1_stateful/00037_uniq_state_merge1.sql @@ -1 +1,2 @@ +SET max_bytes_before_external_group_by = '1G'; SELECT k, any(u) AS u, uniqMerge(us) AS us FROM (SELECT domain(URL) AS k, uniq(UserID) AS u, uniqState(UserID) AS us FROM test.hits GROUP BY k) GROUP BY k ORDER BY u DESC, k ASC LIMIT 100 diff --git a/tests/queries/1_stateful/00038_uniq_state_merge2.sql b/tests/queries/1_stateful/00038_uniq_state_merge2.sql index 677458daeda..f97395943a1 100644 --- a/tests/queries/1_stateful/00038_uniq_state_merge2.sql +++ b/tests/queries/1_stateful/00038_uniq_state_merge2.sql @@ -1 +1,2 @@ +SET max_bytes_before_external_group_by = '1G'; SELECT topLevelDomain(concat('http://', k)) AS tld, sum(u) AS u, uniqMerge(us) AS us FROM (SELECT domain(URL) AS k, uniq(UserID) AS u, uniqState(UserID) AS us FROM test.hits GROUP BY k) GROUP BY tld ORDER BY u DESC, tld ASC LIMIT 100 diff --git a/tests/queries/1_stateful/00098_primary_key_memory_allocated.reference b/tests/queries/1_stateful/00098_primary_key_memory_allocated.reference new file mode 100644 index 00000000000..72749c905a3 --- /dev/null +++ b/tests/queries/1_stateful/00098_primary_key_memory_allocated.reference @@ -0,0 +1 @@ +1 1 1 diff --git a/tests/queries/1_stateful/00098_primary_key_memory_allocated.sql b/tests/queries/1_stateful/00098_primary_key_memory_allocated.sql new file mode 100644 index 00000000000..7371678a0f6 --- /dev/null +++ b/tests/queries/1_stateful/00098_primary_key_memory_allocated.sql @@ -0,0 +1 @@ +SELECT primary_key_bytes_in_memory < 16000, primary_key_bytes_in_memory_allocated < 16000, primary_key_bytes_in_memory_allocated / primary_key_bytes_in_memory < 1.1 FROM system.parts WHERE database = 'test' AND table = 'hits'; diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 51aa8222a89..f2e5a744a21 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -190,6 +190,7 @@ CustomSeparatedWithNamesAndTypes DBAs DBMSs DBeaver +DD DDLWORKER DDLWorker DDLWorkerThreads @@ -215,7 +216,6 @@ DatabaseOrdinaryThreadsActive DateTime DateTimes DbCL -DD Decrypted Deduplicate Deduplication @@ -840,6 +840,7 @@ Sematext SendExternalTables SendScalars ShareAlike +Shortkeys SimHash Simhash SimpleAggregateFunction @@ -952,8 +953,8 @@ TotalRowsOfMergeTreeTables TotalTemporaryFiles Tradeoff Transactional -TwoColumnList Tukey +TwoColumnList UBSan UDFs UInt @@ -990,6 +991,7 @@ VIEWs Vadim Valgrind 
Vectorized +VersionBadge VersionInteger VersionedCollapsingMergeTree VideoContainer @@ -1325,6 +1327,7 @@ cosineDistance countDigits countEqual countMatches +countMatchesCaseInsensitive countSubstrings covarPop covarSamp @@ -1688,6 +1691,7 @@ hudi hyperscan hypot hyvor +iTerm icosahedron icudata idempotency @@ -2132,6 +2136,7 @@ py qryn quantile quantileBFloat +quantileDD quantileDeterministic quantileExact quantileExactExclusive @@ -2146,6 +2151,7 @@ quantileTDigestWeighted quantileTiming quantileTimingWeighted quantilebfloat +quantileddsketch quantiledeterministic quantileexact quantileexactweighted @@ -2158,8 +2164,6 @@ quantiletdigest quantiletdigestweighted quantiletiming quantiletimingweighted -quantileddsketch -quantileDD quartile queryID queryString @@ -2292,8 +2296,8 @@ seektable sequenceCount sequenceMatch sequenceNextNode -seriesOutliersDetectTukey seriesDecomposeSTL +seriesOutliersDetectTukey seriesPeriodDetectFFT serverTimeZone serverTimezone @@ -2307,6 +2311,7 @@ shardNum sharded sharding shortcircuit +shortkeys shoutout simdjson simpleLinearRegression @@ -2730,6 +2735,3 @@ znode znodes zookeeperSessionUptime zstd -iTerm -shortkeys -Shortkeys diff --git a/utils/check-style/check-style b/utils/check-style/check-style index daee2e7fb00..6c12970c4bb 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -76,6 +76,7 @@ EXTERN_TYPES_EXCLUDES=( ProfileEvents::getProfileEvents ProfileEvents::ThreadIdToCountersSnapshot ProfileEvents::LOCAL_NAME + ProfileEvents::keeper_profile_events ProfileEvents::CountersIncrement CurrentMetrics::add @@ -87,6 +88,7 @@ EXTERN_TYPES_EXCLUDES=( CurrentMetrics::Metric CurrentMetrics::values CurrentMetrics::Value + CurrentMetrics::keeper_metrics ErrorCodes::ErrorCode ErrorCodes::getName @@ -106,7 +108,7 @@ for extern_type in ${!EXTERN_TYPES[@]}; do find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | { # NOTE: the check is pretty dumb and distinguish only by the type_of_extern, # and this matches with zkutil::CreateMode - grep -v 'src/Common/ZooKeeper/Types.h' + grep -v -e 'src/Common/ZooKeeper/Types.h' -e 'src/Coordination/KeeperConstants.cpp' } | { grep -vP $EXCLUDE_DIRS | xargs grep -l -P "extern const $type_of_extern $allowed_chars" } | while read file; do @@ -274,6 +276,11 @@ find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | grep -vP $EXCLUDE_DIRS | xargs grep -F '!!!' | grep -P '.' && echo "Too many exclamation marks (looks dirty, unconfident)." +# Exclamation mark in a message +find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | + grep -vP $EXCLUDE_DIRS | + xargs grep -F '!",' | grep -P '.' && echo "No need for an exclamation mark (looks dirty, unconfident)." + # Trailing whitespaces find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | grep -vP $EXCLUDE_DIRS | @@ -435,3 +442,9 @@ ls -1d $ROOT_PATH/contrib/*-cmake | xargs -I@ find @ -name 'CMakeLists.txt' -or # DOS/Windows newlines find $ROOT_PATH/{base,src,programs,utils,docs} -name '*.md' -or -name '*.h' -or -name '*.cpp' -or -name '*.js' -or -name '*.py' -or -name '*.html' | xargs grep -l -P '\r$' && echo "^ Files contain DOS/Windows newlines (\r\n instead of \n)." + +# Wrong spelling of abbreviations, e.g. SQL is right, Sql is wrong. XMLHttpRequest is very wrong. 
+find $ROOT_PATH/{src,base,programs,utils} -name '*.h' -or -name '*.cpp' | + grep -vP $EXCLUDE_DIRS | + xargs grep -P 'Sql|Html|Xml|Cpu|Tcp|Udp|Http|Db|Json|Yaml' | grep -v -P 'RabbitMQ|Azure|Aws|aws|Avro|IO/S3' && + echo "Abbreviations such as SQL, XML, HTTP, should be in all caps. For example, SQL is right, Sql is wrong. XMLHttpRequest is very wrong." diff --git a/utils/keeper-data-dumper/main.cpp b/utils/keeper-data-dumper/main.cpp index 351a4ab90bc..21626665a42 100644 --- a/utils/keeper-data-dumper/main.cpp +++ b/utils/keeper-data-dumper/main.cpp @@ -63,11 +63,11 @@ int main(int argc, char *argv[]) ResponsesQueue queue(std::numeric_limits::max()); SnapshotsQueue snapshots_queue{1}; CoordinationSettingsPtr settings = std::make_shared(); - KeeperContextPtr keeper_context = std::make_shared(true); + KeeperContextPtr keeper_context = std::make_shared(true, settings); keeper_context->setLogDisk(std::make_shared("LogDisk", argv[2])); keeper_context->setSnapshotDisk(std::make_shared("SnapshotDisk", argv[1])); - auto state_machine = std::make_shared(queue, snapshots_queue, settings, keeper_context, nullptr); + auto state_machine = std::make_shared(queue, snapshots_queue, keeper_context, nullptr); state_machine->init(); size_t last_commited_index = state_machine->last_commit_index(); diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 312a33ac2d6..23fc0032056 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,3 +1,4 @@ +v24.1.5.6-stable 2024-02-14 v24.1.4.20-stable 2024-02-14 v24.1.3.31-stable 2024-02-09 v24.1.2.5-stable 2024-02-02