diff --git a/.clang-tidy b/.clang-tidy index 7241c372319..f8622039f29 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -41,6 +41,8 @@ Checks: '*, -clang-analyzer-security.insecureAPI.strcpy, -cppcoreguidelines-avoid-c-arrays, + -cppcoreguidelines-avoid-const-or-ref-data-members, + -cppcoreguidelines-avoid-do-while, -cppcoreguidelines-avoid-goto, -cppcoreguidelines-avoid-magic-numbers, -cppcoreguidelines-avoid-non-const-global-variables, @@ -128,6 +130,7 @@ Checks: '*, -portability-simd-intrinsics, -readability-braces-around-statements, + -readability-convert-member-functions-to-static, -readability-else-after-return, -readability-function-cognitive-complexity, -readability-function-size, diff --git a/.github/workflows/backport_branches.yml b/.github/workflows/backport_branches.yml index 7cdf11fec0f..867cca9d037 100644 --- a/.github/workflows/backport_branches.yml +++ b/.github/workflows/backport_branches.yml @@ -9,8 +9,22 @@ on: # yamllint disable-line rule:truthy branches: - 'backport/**' jobs: + CheckLabels: + runs-on: [self-hosted, style-checker] + # Run the first check always, even if the CI is cancelled + if: ${{ always() }} + steps: + - name: Check out repository code + uses: ClickHouse/checkout@v1 + with: + clear-repository: true + - name: Labels check + run: | + cd "$GITHUB_WORKSPACE/tests/ci" + python3 run_check.py PythonUnitTests: runs-on: [self-hosted, style-checker] + needs: CheckLabels steps: - name: Check out repository code uses: ClickHouse/checkout@v1 @@ -22,6 +36,7 @@ jobs: python3 -m unittest discover -s . -p '*_test.py' DockerHubPushAarch64: runs-on: [self-hosted, style-checker-aarch64] + needs: CheckLabels steps: - name: Check out repository code uses: ClickHouse/checkout@v1 @@ -38,6 +53,7 @@ jobs: path: ${{ runner.temp }}/docker_images_check/changed_images_aarch64.json DockerHubPushAmd64: runs-on: [self-hosted, style-checker] + needs: CheckLabels steps: - name: Check out repository code uses: ClickHouse/checkout@v1 diff --git a/CMakeLists.txt b/CMakeLists.txt index 59b38e7763f..5550a19b699 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,6 +121,7 @@ if (ENABLE_COLORED_BUILD AND CMAKE_GENERATOR STREQUAL "Ninja") set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdiagnostics-color=always") # ... such manually setting of flags can be removed once CMake supports a variable to # activate colors in *all* build systems: https://gitlab.kitware.com/cmake/cmake/-/issues/15502 + # --> available since CMake 3.24: https://stackoverflow.com/a/73349744 endif () include (cmake/check_flags.cmake) @@ -134,24 +135,15 @@ if (COMPILER_CLANG) set(COMPILER_FLAGS "${COMPILER_FLAGS} -gdwarf-aranges") endif () - if (HAS_USE_CTOR_HOMING) - # For more info see https://blog.llvm.org/posts/2021-04-05-constructor-homing-for-debug-info/ - if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG" OR CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -fuse-ctor-homing") - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Xclang -fuse-ctor-homing") - endif() + # See https://blog.llvm.org/posts/2021-04-05-constructor-homing-for-debug-info/ + if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG" OR CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -fuse-ctor-homing") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Xclang -fuse-ctor-homing") endif() no_warning(enum-constexpr-conversion) # breaks Protobuf in clang-16 endif () -# If compiler has support for -Wreserved-identifier. 
It is difficult to detect by clang version, -# because there are two different branches of clang: clang and AppleClang. -# (AppleClang is not supported by ClickHouse, but some developers have misfortune to use it). -if (HAS_RESERVED_IDENTIFIER) - add_compile_definitions (HAS_RESERVED_IDENTIFIER) -endif () - option(ENABLE_TESTS "Provide unit_test_dbms target with Google.Test unit tests" ON) option(ENABLE_EXAMPLES "Build all example programs in 'examples' subdirectories" OFF) option(ENABLE_BENCHMARKS "Build all benchmark programs in 'benchmarks' subdirectories" OFF) @@ -188,7 +180,6 @@ if (NOT CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE") # Can be lld or ld-lld or lld-13 or /path/to/lld. if (LINKER_NAME MATCHES "lld" AND OS_LINUX) set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gdb-index") - set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--gdb-index") message (STATUS "Adding .gdb-index via --gdb-index linker option.") endif () endif() @@ -297,6 +288,7 @@ set (CMAKE_C_STANDARD_REQUIRED ON) if (COMPILER_GCC OR COMPILER_CLANG) # Enable C++14 sized global deallocation functions. It should be enabled by setting -std=c++14 but I'm not sure. + # See https://reviews.llvm.org/D112921 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsized-deallocation") endif () @@ -315,11 +307,7 @@ if (ARCH_AMD64) set(BRANCHES_WITHIN_32B_BOUNDARIES "-Wa,${BRANCHES_WITHIN_32B_BOUNDARIES}") endif() - include(CheckCXXCompilerFlag) - check_cxx_compiler_flag("${BRANCHES_WITHIN_32B_BOUNDARIES}" HAS_BRANCHES_WITHIN_32B_BOUNDARIES) - if (HAS_BRANCHES_WITHIN_32B_BOUNDARIES) - set(COMPILER_FLAGS "${COMPILER_FLAGS} ${BRANCHES_WITHIN_32B_BOUNDARIES}") - endif() + set(COMPILER_FLAGS "${COMPILER_FLAGS} ${BRANCHES_WITHIN_32B_BOUNDARIES}") endif() if (COMPILER_GCC) @@ -361,7 +349,17 @@ set (CMAKE_ASM_FLAGS_DEBUG "${CMAKE_ASM_FLAGS_DEBUG} -O0 ${DEBUG_I if (COMPILER_CLANG) if (OS_DARWIN) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-U,_inside_main") + + # The LLVM MachO linker (ld64.lld, used in native builds) generates by default unwind info in 'compact' format which the internal + # unwinder doesn't support and the server will not come up ('invalid compact unwind encoding'). Disable it. You will see warning + # during the build "ld64.lld: warning: Option `-no_compact_unwind' is undocumented. Should lld implement it?". Yes, ld64.lld does + # not document the option, likely for compat with Apple's system ld after which ld64.lld is modeled after and which also does not + # document it. + if (NOT CMAKE_CROSSCOMPILING) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-no_compact_unwind") + endif () endif() # Display absolute paths in error messages. Otherwise KDevelop fails to navigate to correct file and opens a new file instead. @@ -431,6 +429,7 @@ option(WERROR "Enable -Werror compiler option" ON) if (WERROR) # Don't pollute CMAKE_CXX_FLAGS with -Werror as it will break some CMake checks. # Instead, adopt modern cmake usage requirement. 
+ # TODO: Set CMAKE_COMPILE_WARNING_AS_ERROR (cmake 3.24) target_compile_options(global-group INTERFACE "-Werror") endif () diff --git a/PreLoad.cmake b/PreLoad.cmake index 0e1ee70fc8f..b456c724cc6 100644 --- a/PreLoad.cmake +++ b/PreLoad.cmake @@ -19,8 +19,8 @@ endif() if (NOT "$ENV{CFLAGS}" STREQUAL "" OR NOT "$ENV{CXXFLAGS}" STREQUAL "" OR NOT "$ENV{LDFLAGS}" STREQUAL "" - OR CMAKE_C_FLAGS OR CMAKE_CXX_FLAGS OR CMAKE_EXE_LINKER_FLAGS OR CMAKE_SHARED_LINKER_FLAGS OR CMAKE_MODULE_LINKER_FLAGS - OR CMAKE_C_FLAGS_INIT OR CMAKE_CXX_FLAGS_INIT OR CMAKE_EXE_LINKER_FLAGS_INIT OR CMAKE_SHARED_LINKER_FLAGS_INIT OR CMAKE_MODULE_LINKER_FLAGS_INIT) + OR CMAKE_C_FLAGS OR CMAKE_CXX_FLAGS OR CMAKE_EXE_LINKER_FLAGS OR CMAKE_MODULE_LINKER_FLAGS + OR CMAKE_C_FLAGS_INIT OR CMAKE_CXX_FLAGS_INIT OR CMAKE_EXE_LINKER_FLAGS_INIT OR CMAKE_MODULE_LINKER_FLAGS_INIT) # if $ENV message("CFLAGS: $ENV{CFLAGS}") @@ -36,7 +36,6 @@ if (NOT "$ENV{CFLAGS}" STREQUAL "" message("CMAKE_C_FLAGS_INIT: ${CMAKE_C_FLAGS_INIT}") message("CMAKE_CXX_FLAGS_INIT: ${CMAKE_CXX_FLAGS_INIT}") message("CMAKE_EXE_LINKER_FLAGS_INIT: ${CMAKE_EXE_LINKER_FLAGS_INIT}") - message("CMAKE_SHARED_LINKER_FLAGS_INIT: ${CMAKE_SHARED_LINKER_FLAGS_INIT}") message("CMAKE_MODULE_LINKER_FLAGS_INIT: ${CMAKE_MODULE_LINKER_FLAGS_INIT}") message(FATAL_ERROR " diff --git a/README.md b/README.md index 17b4df154a9..61d840ecd34 100644 --- a/README.md +++ b/README.md @@ -21,11 +21,10 @@ curl https://clickhouse.com/ | sh * [Contacts](https://clickhouse.com/company/contact) can help to get your questions answered if there are any. ## Upcoming Events -* [**v23.2 Release Webinar**](https://clickhouse.com/company/events/v23-2-release-webinar?utm_source=github&utm_medium=social&utm_campaign=release-webinar-2023-02) - Feb 23 - 23.2 is rapidly approaching. Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release. -* [**ClickHouse Meetup in Amsterdam**](https://www.meetup.com/clickhouse-netherlands-user-group/events/291485868/) - Mar 9 - The first ClickHouse Amsterdam Meetup of 2023 is here! 🎉 Join us for short lightning talks and long discussions. Food, drinks & good times on us. -* [**ClickHouse Meetup in SF Bay Area**](https://www.meetup.com/clickhouse-silicon-valley-meetup-group/events/291490121/) - Mar 14 - A night to meet with ClickHouse team in the San Francisco area! Food and drink are a given...but networking is the primary focus. -* [**ClickHouse Meetup in Austin**](https://www.meetup.com/clickhouse-austin-user-group/events/291486654/) - Mar 16 - The first ClickHouse Meetup in Austin is happening soon! Interested in speaking, let us know! +* [**ClickHouse Meetup in Austin**](https://www.meetup.com/clickhouse-austin-user-group/events/291486654/) - Mar 30 - The first ClickHouse Meetup in Austin is happening soon! Interested in speaking, let us know! +* [**v23.3 Release Webinar**](https://clickhouse.com/company/events/v23-3-release-webinar?utm_source=github&utm_medium=social&utm_campaign=release-webinar-2023-02) - Mar 30 - 23.3 is rapidly approaching. Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release. ## Recent Recordings * **FOSDEM 2023**: In the "Fast and Streaming Data" room Alexey gave a talk entitled "Building Analytical Apps With ClickHouse" that looks at the landscape of data tools, an interesting data set, and how you can interact with data quickly. 
Check out the recording on **[YouTube](https://www.youtube.com/watch?v=JlcI2Vfz_uk)**. -* **Recording available**: [**v23.1 Release Webinar**](https://www.youtube.com/watch?v=zYSZXBnTMSE) 23.1 is the ClickHouse New Year release. Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release. Inverted indices, query cache, and so -- very -- much more. +* **Recording available**: [**v23.2 Release Webinar**](https://www.youtube.com/watch?v=2o0vRMMIrkY) NTILE Window Function support, Partition Key for GROUP By, io_uring, Apache Iceberg support, Dynamic Disks, integrations updates! Watch it now! +* **All release webinar recordings**: [YouTube playlist](https://www.youtube.com/playlist?list=PL0Z2YDlm0b3jAlSy1JxyP8zluvXaN3nxU) diff --git a/base/base/coverage.cpp b/base/base/coverage.cpp index 043f97f9593..1027638be3d 100644 --- a/base/base/coverage.cpp +++ b/base/base/coverage.cpp @@ -2,6 +2,8 @@ #if WITH_COVERAGE +#pragma GCC diagnostic ignored "-Wreserved-identifier" + # include # include diff --git a/base/base/hex.h b/base/base/hex.h index e0c57f9dd42..b8cf95db893 100644 --- a/base/base/hex.h +++ b/base/base/hex.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include "types.h" diff --git a/base/base/phdr_cache.cpp b/base/base/phdr_cache.cpp index c3d7fed2d3f..7d37f01b560 100644 --- a/base/base/phdr_cache.cpp +++ b/base/base/phdr_cache.cpp @@ -1,6 +1,4 @@ -#ifdef HAS_RESERVED_IDENTIFIER #pragma clang diagnostic ignored "-Wreserved-identifier" -#endif /// This code was based on the code by Fedor Korotkiy https://www.linkedin.com/in/fedor-korotkiy-659a1838/ diff --git a/base/base/unit.h b/base/base/unit.h index 1fb530be1f0..0fc314af479 100644 --- a/base/base/unit.h +++ b/base/base/unit.h @@ -5,10 +5,8 @@ constexpr size_t KiB = 1024; constexpr size_t MiB = 1024 * KiB; constexpr size_t GiB = 1024 * MiB; -#ifdef HAS_RESERVED_IDENTIFIER -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wreserved-identifier" -#endif +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreserved-identifier" // NOLINTBEGIN(google-runtime-int) constexpr size_t operator"" _KiB(unsigned long long val) { return val * KiB; } @@ -16,6 +14,4 @@ constexpr size_t operator"" _MiB(unsigned long long val) { return val * MiB; } constexpr size_t operator"" _GiB(unsigned long long val) { return val * GiB; } // NOLINTEND(google-runtime-int) -#ifdef HAS_RESERVED_IDENTIFIER -# pragma clang diagnostic pop -#endif +#pragma clang diagnostic pop diff --git a/base/base/wide_integer_impl.h b/base/base/wide_integer_impl.h index 4a54c0fb2a4..30d08be2e4f 100644 --- a/base/base/wide_integer_impl.h +++ b/base/base/wide_integer_impl.h @@ -732,9 +732,10 @@ public: if (std::numeric_limits::is_signed && (is_negative(lhs) != is_negative(rhs))) return is_negative(rhs); + integer t = rhs; for (unsigned i = 0; i < item_count; ++i) { - base_type rhs_item = get_item(rhs, big(i)); + base_type rhs_item = get_item(t, big(i)); if (lhs.items[big(i)] != rhs_item) return lhs.items[big(i)] > rhs_item; @@ -757,9 +758,10 @@ public: if (std::numeric_limits::is_signed && (is_negative(lhs) != is_negative(rhs))) return is_negative(lhs); + integer t = rhs; for (unsigned i = 0; i < item_count; ++i) { - base_type rhs_item = get_item(rhs, big(i)); + base_type rhs_item = get_item(t, big(i)); if (lhs.items[big(i)] != rhs_item) return lhs.items[big(i)] < rhs_item; @@ -779,9 +781,10 @@ public: { if constexpr (should_keep_size()) { + integer t = rhs; for (unsigned i = 0; i < item_count; 
++i) { - base_type rhs_item = get_item(rhs, any(i)); + base_type rhs_item = get_item(t, any(i)); if (lhs.items[any(i)] != rhs_item) return false; diff --git a/base/base/wide_integer_to_string.h b/base/base/wide_integer_to_string.h index 160bf599516..c2cbe8d82e3 100644 --- a/base/base/wide_integer_to_string.h +++ b/base/base/wide_integer_to_string.h @@ -64,6 +64,6 @@ struct fmt::formatter> template auto format(const wide::integer & value, FormatContext & ctx) { - return format_to(ctx.out(), "{}", to_string(value)); + return fmt::format_to(ctx.out(), "{}", to_string(value)); } }; diff --git a/base/poco/Util/src/XMLConfiguration.cpp b/base/poco/Util/src/XMLConfiguration.cpp index 974361044d7..e0d363cc870 100644 --- a/base/poco/Util/src/XMLConfiguration.cpp +++ b/base/poco/Util/src/XMLConfiguration.cpp @@ -27,7 +27,7 @@ #include "Poco/Exception.h" #include "Poco/NumberParser.h" #include "Poco/NumberFormatter.h" -#include +#include namespace Poco { diff --git a/base/readpassphrase/readpassphrase.c b/base/readpassphrase/readpassphrase.c index a84ec43767c..fbd582ffe79 100644 --- a/base/readpassphrase/readpassphrase.c +++ b/base/readpassphrase/readpassphrase.c @@ -27,9 +27,7 @@ #define _PATH_TTY "/dev/tty" #endif -#ifdef HAS_RESERVED_IDENTIFIER #pragma clang diagnostic ignored "-Wreserved-identifier" -#endif #include #include diff --git a/cmake/check_flags.cmake b/cmake/check_flags.cmake index 518f9ecf8de..294f135e8ee 100644 --- a/cmake/check_flags.cmake +++ b/cmake/check_flags.cmake @@ -1,7 +1,5 @@ include (CheckCXXCompilerFlag) include (CheckCCompilerFlag) -check_cxx_compiler_flag("-Wreserved-identifier" HAS_RESERVED_IDENTIFIER) -check_cxx_compiler_flag("-Wsuggest-destructor-override" HAS_SUGGEST_DESTRUCTOR_OVERRIDE) -check_cxx_compiler_flag("-Wsuggest-override" HAS_SUGGEST_OVERRIDE) -check_cxx_compiler_flag("-Xclang -fuse-ctor-homing" HAS_USE_CTOR_HOMING) +# Set/unset variable based on existence of compiler flags. Example: +# check_cxx_compiler_flag("-Wreserved-identifier" HAS_RESERVED_IDENTIFIER) diff --git a/cmake/clang_tidy.cmake b/cmake/clang_tidy.cmake index ceaafdaa9aa..96c295b6bb9 100644 --- a/cmake/clang_tidy.cmake +++ b/cmake/clang_tidy.cmake @@ -5,14 +5,14 @@ if (ENABLE_CLANG_TIDY) find_program (CLANG_TIDY_CACHE_PATH NAMES "clang-tidy-cache") if (CLANG_TIDY_CACHE_PATH) - find_program (_CLANG_TIDY_PATH NAMES "clang-tidy-15" "clang-tidy-14" "clang-tidy-13" "clang-tidy-12" "clang-tidy") + find_program (_CLANG_TIDY_PATH NAMES "clang-tidy-16" "clang-tidy-15" "clang-tidy-14" "clang-tidy") # Why do we use ';' here? # It's a cmake black magic: https://cmake.org/cmake/help/latest/prop_tgt/LANG_CLANG_TIDY.html#prop_tgt:%3CLANG%3E_CLANG_TIDY # The CLANG_TIDY_PATH is passed to CMAKE_CXX_CLANG_TIDY, which follows CXX_CLANG_TIDY syntax. 
set (CLANG_TIDY_PATH "${CLANG_TIDY_CACHE_PATH};${_CLANG_TIDY_PATH}" CACHE STRING "A combined command to run clang-tidy with caching wrapper") else () - find_program (CLANG_TIDY_PATH NAMES "clang-tidy-15" "clang-tidy-14" "clang-tidy-13" "clang-tidy-12" "clang-tidy") + find_program (CLANG_TIDY_PATH NAMES "clang-tidy-16" "clang-tidy-15" "clang-tidy-14" "clang-tidy") endif () if (CLANG_TIDY_PATH) diff --git a/cmake/linux/toolchain-riscv64.cmake b/cmake/linux/toolchain-riscv64.cmake index 49a036c2972..ea57c3b2c42 100644 --- a/cmake/linux/toolchain-riscv64.cmake +++ b/cmake/linux/toolchain-riscv64.cmake @@ -22,7 +22,6 @@ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}") set (CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}") set (CMAKE_EXE_LINKER_FLAGS_INIT "-fuse-ld=bfd") -set (CMAKE_SHARED_LINKER_FLAGS_INIT "-fuse-ld=bfd") # Currently, lld does not work with the error: # ld.lld: error: section size decrease is too large diff --git a/cmake/linux/toolchain-x86_64.cmake b/cmake/linux/toolchain-x86_64.cmake index e73d779284a..55b9df79f70 100644 --- a/cmake/linux/toolchain-x86_64.cmake +++ b/cmake/linux/toolchain-x86_64.cmake @@ -30,7 +30,6 @@ set (CMAKE_SYSROOT "${TOOLCHAIN_PATH}/x86_64-linux-gnu/libc") set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}") set (CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}") -set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}") set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}") set (CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}") diff --git a/cmake/tools.cmake b/cmake/tools.cmake index 4e1954f27f7..974b0bd1d3d 100644 --- a/cmake/tools.cmake +++ b/cmake/tools.cmake @@ -57,52 +57,46 @@ if (LINKER_NAME MATCHES "gold") message (FATAL_ERROR "Linking with gold is unsupported. Please use lld.") endif () -# s390x doesnt support lld -if (NOT ARCH_S390X) - if (NOT LINKER_NAME) - if (COMPILER_GCC) - find_program (LLD_PATH NAMES "ld.lld") - elseif (COMPILER_CLANG) - # llvm lld is a generic driver. - # Invoke ld.lld (Unix), ld64.lld (macOS), lld-link (Windows), wasm-ld (WebAssembly) instead - if (OS_LINUX) +if (NOT LINKER_NAME) + if (COMPILER_GCC) + find_program (LLD_PATH NAMES "ld.lld") + elseif (COMPILER_CLANG) + # llvm lld is a generic driver. + # Invoke ld.lld (Unix), ld64.lld (macOS), lld-link (Windows), wasm-ld (WebAssembly) instead + if (OS_LINUX) + if (NOT ARCH_S390X) # s390x doesnt support lld find_program (LLD_PATH NAMES "ld.lld-${COMPILER_VERSION_MAJOR}" "ld.lld") - elseif (OS_DARWIN) - find_program (LLD_PATH NAMES "ld64.lld-${COMPILER_VERSION_MAJOR}" "ld64.lld") + endif () + elseif (OS_DARWIN) + find_program (LLD_PATH NAMES "ld64.lld-${COMPILER_VERSION_MAJOR}" "ld64.lld") + endif () + endif () + if (OS_LINUX OR OS_DARWIN) + if (LLD_PATH) + if (COMPILER_GCC) + # GCC driver requires one of supported linker names like "lld". + set (LINKER_NAME "lld") + else () + # Clang driver simply allows full linker path. + set (LINKER_NAME ${LLD_PATH}) endif () endif () endif() endif() -if ((OS_LINUX OR OS_DARWIN) AND NOT LINKER_NAME) - if (LLD_PATH) - if (COMPILER_GCC) - # GCC driver requires one of supported linker names like "lld". - set (LINKER_NAME "lld") - else () - # Clang driver simply allows full linker path. 
- set (LINKER_NAME ${LLD_PATH}) - endif () - endif () -endif () -# TODO: allow different linker on != OS_LINUX - if (LINKER_NAME) + find_program (LLD_PATH NAMES ${LINKER_NAME}) + if (NOT LLD_PATH) + message (FATAL_ERROR "Using linker ${LINKER_NAME} but can't find its path.") + endif () if (COMPILER_CLANG) - find_program (LLD_PATH NAMES ${LINKER_NAME}) - if (NOT LLD_PATH) - message (FATAL_ERROR "Using linker ${LINKER_NAME} but can't find its path.") - endif () - - # This a temporary quirk to emit .debug_aranges with ThinLTO + # This a temporary quirk to emit .debug_aranges with ThinLTO, can be removed after upgrade to clang-16 set (LLD_WRAPPER "${CMAKE_CURRENT_BINARY_DIR}/ld.lld") configure_file ("${CMAKE_CURRENT_SOURCE_DIR}/cmake/ld.lld.in" "${LLD_WRAPPER}" @ONLY) set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --ld-path=${LLD_WRAPPER}") - set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_WRAPPER}") else () set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}") - set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}") endif () endif () diff --git a/contrib/croaring b/contrib/croaring index 2c867e9f9c9..f40ed52bcdd 160000 --- a/contrib/croaring +++ b/contrib/croaring @@ -1 +1 @@ -Subproject commit 2c867e9f9c9e2a3a7032791f94c4c7ae3013f6e0 +Subproject commit f40ed52bcdd635840a79877cef4857315dba817c diff --git a/contrib/croaring-cmake/CMakeLists.txt b/contrib/croaring-cmake/CMakeLists.txt index 0bb7d0bd221..794c0426b96 100644 --- a/contrib/croaring-cmake/CMakeLists.txt +++ b/contrib/croaring-cmake/CMakeLists.txt @@ -17,7 +17,8 @@ set(SRCS "${LIBRARY_DIR}/src/containers/run.c" "${LIBRARY_DIR}/src/roaring.c" "${LIBRARY_DIR}/src/roaring_priority_queue.c" - "${LIBRARY_DIR}/src/roaring_array.c") + "${LIBRARY_DIR}/src/roaring_array.c" + "${LIBRARY_DIR}/src/memory.c") add_library(_roaring ${SRCS}) diff --git a/contrib/llvm-project b/contrib/llvm-project index a8bf69e9cd3..4bfaeb31dd0 160000 --- a/contrib/llvm-project +++ b/contrib/llvm-project @@ -1 +1 @@ -Subproject commit a8bf69e9cd39a23140a2b633c172d201484172da +Subproject commit 4bfaeb31dd0ef13f025221f93c138974a3e0a22a diff --git a/contrib/murmurhash/src/MurmurHash2.cpp b/contrib/murmurhash/src/MurmurHash2.cpp index 1c4469b0a02..0bd0a352dc4 100644 --- a/contrib/murmurhash/src/MurmurHash2.cpp +++ b/contrib/murmurhash/src/MurmurHash2.cpp @@ -31,6 +31,40 @@ #define BIG_CONSTANT(x) (x##LLU) #endif // !defined(_MSC_VER) +// +//----------------------------------------------------------------------------- +// Block read - on little-endian machines this is a single load, +// while on big-endian or unknown machines the byte accesses should +// still get optimized into the most efficient instruction. 
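[Editorial aside, not part of the patch] The comment above describes the portable block read that the new getblock() helpers implement: on big-endian or unknown hosts the bytes are assembled explicitly in little-endian order, so the hash value stays identical across architectures instead of depending on how a raw pointer load happens to be laid out. A rough stand-alone sketch of that idea in plain Python (no project code assumed):

    import struct

    def getblock32(buf, i):
        # Assemble the i-th 32-bit block byte by byte, least significant
        # byte first -- the portable path of the patched getblock().
        c = buf[4 * i : 4 * i + 4]
        return c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24)

    data = bytes(range(16))
    for i in range(4):
        # '<I' is an explicit little-endian 32-bit load; the byte-wise
        # assembly must agree with it on any host byte order.
        assert getblock32(data, i) == struct.unpack_from("<I", data, 4 * i)[0]

The MurmurHash3 hunk further down applies the same treatment to getblock32/getblock64 and stores the two 64-bit halves of the 128-bit result in swapped order on big-endian hosts for the same cross-platform consistency.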
+static inline uint32_t getblock ( const uint32_t * p ) +{ +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + return *p; +#else + const uint8_t *c = (const uint8_t *)p; + return (uint32_t)c[0] | + (uint32_t)c[1] << 8 | + (uint32_t)c[2] << 16 | + (uint32_t)c[3] << 24; +#endif +} + +static inline uint64_t getblock ( const uint64_t * p ) +{ +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + return *p; +#else + const uint8_t *c = (const uint8_t *)p; + return (uint64_t)c[0] | + (uint64_t)c[1] << 8 | + (uint64_t)c[2] << 16 | + (uint64_t)c[3] << 24 | + (uint64_t)c[4] << 32 | + (uint64_t)c[5] << 40 | + (uint64_t)c[6] << 48 | + (uint64_t)c[7] << 56; +#endif +} //----------------------------------------------------------------------------- @@ -52,7 +86,7 @@ uint32_t MurmurHash2 ( const void * key, size_t len, uint32_t seed ) while(len >= 4) { - uint32_t k = *(uint32_t*)data; + uint32_t k = getblock((const uint32_t *)data); k *= m; k ^= k >> r; @@ -105,7 +139,7 @@ uint64_t MurmurHash64A ( const void * key, size_t len, uint64_t seed ) while(data != end) { - uint64_t k = *data++; + uint64_t k = getblock(data++); k *= m; k ^= k >> r; @@ -151,12 +185,12 @@ uint64_t MurmurHash64B ( const void * key, size_t len, uint64_t seed ) while(len >= 8) { - uint32_t k1 = *data++; + uint32_t k1 = getblock(data++); k1 *= m; k1 ^= k1 >> r; k1 *= m; h1 *= m; h1 ^= k1; len -= 4; - uint32_t k2 = *data++; + uint32_t k2 = getblock(data++); k2 *= m; k2 ^= k2 >> r; k2 *= m; h2 *= m; h2 ^= k2; len -= 4; @@ -164,7 +198,7 @@ uint64_t MurmurHash64B ( const void * key, size_t len, uint64_t seed ) if(len >= 4) { - uint32_t k1 = *data++; + uint32_t k1 = getblock(data++); k1 *= m; k1 ^= k1 >> r; k1 *= m; h1 *= m; h1 ^= k1; len -= 4; @@ -215,7 +249,7 @@ uint32_t MurmurHash2A ( const void * key, size_t len, uint32_t seed ) while(len >= 4) { - uint32_t k = *(uint32_t*)data; + uint32_t k = getblock((const uint32_t *)data); mmix(h,k); @@ -278,7 +312,7 @@ public: while(len >= 4) { - uint32_t k = *(uint32_t*)data; + uint32_t k = getblock((const uint32_t *)data); mmix(m_hash,k); @@ -427,7 +461,7 @@ uint32_t MurmurHashAligned2 ( const void * key, size_t len, uint32_t seed ) while(len >= 4) { - d = *(uint32_t *)data; + d = getblock((const uint32_t *)data); t = (t >> sr) | (d << sl); uint32_t k = t; @@ -492,7 +526,7 @@ uint32_t MurmurHashAligned2 ( const void * key, size_t len, uint32_t seed ) { while(len >= 4) { - uint32_t k = *(uint32_t *)data; + uint32_t k = getblock((const uint32_t *)data); MIX(h,k,m); diff --git a/contrib/murmurhash/src/MurmurHash3.cpp b/contrib/murmurhash/src/MurmurHash3.cpp index cf5158e97ad..6573c470be3 100644 --- a/contrib/murmurhash/src/MurmurHash3.cpp +++ b/contrib/murmurhash/src/MurmurHash3.cpp @@ -55,14 +55,32 @@ inline uint64_t rotl64 ( uint64_t x, int8_t r ) FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i ) { - uint32_t res; - memcpy(&res, p + i, sizeof(res)); - return res; +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + return p[i]; +#else + const uint8_t *c = (const uint8_t *)&p[i]; + return (uint32_t)c[0] | + (uint32_t)c[1] << 8 | + (uint32_t)c[2] << 16 | + (uint32_t)c[3] << 24; +#endif } FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) { +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) return p[i]; +#else + const uint8_t *c = (const uint8_t *)&p[i]; + return (uint64_t)c[0] | + (uint64_t)c[1] << 8 | + (uint64_t)c[2] << 16 | + (uint64_t)c[3] << 24 | + (uint64_t)c[4] << 32 | + 
(uint64_t)c[5] << 40 | + (uint64_t)c[6] << 48 | + (uint64_t)c[7] << 56; +#endif } //----------------------------------------------------------------------------- @@ -329,9 +347,13 @@ void MurmurHash3_x64_128 ( const void * key, const size_t len, h1 += h2; h2 += h1; - +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) ((uint64_t*)out)[0] = h1; ((uint64_t*)out)[1] = h2; +#else + ((uint64_t*)out)[0] = h2; + ((uint64_t*)out)[1] = h1; +#endif } //----------------------------------------------------------------------------- diff --git a/contrib/qpl-cmake/benchmark_sample/client_scripts/allin1_ssb.sh b/contrib/qpl-cmake/benchmark_sample/client_scripts/allin1_ssb.sh new file mode 100644 index 00000000000..31017b565b6 --- /dev/null +++ b/contrib/qpl-cmake/benchmark_sample/client_scripts/allin1_ssb.sh @@ -0,0 +1,530 @@ +#!/bin/bash +ckhost="localhost" +ckport=("9000" "9001" "9002" "9003") +WORKING_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/.." +OUTPUT_DIR="${WORKING_DIR}/output" +LOG_DIR="${OUTPUT_DIR}/log" +RAWDATA_DIR="${WORKING_DIR}/rawdata_dir" +database_dir="${WORKING_DIR}/database_dir" +CLIENT_SCRIPTS_DIR="${WORKING_DIR}/client_scripts" +LOG_PACK_FILE="$(date +%Y-%m-%d-%H-%M-%S)" +QUERY_FILE="queries_ssb.sql" +SERVER_BIND_CMD[0]="numactl -m 0 -N 0" +SERVER_BIND_CMD[1]="numactl -m 0 -N 0" +SERVER_BIND_CMD[2]="numactl -m 1 -N 1" +SERVER_BIND_CMD[3]="numactl -m 1 -N 1" +CLIENT_BIND_CMD="" +SSB_GEN_FACTOR=20 +TABLE_NAME="lineorder_flat" +TALBE_ROWS="119994608" +CODEC_CONFIG="lz4 deflate zstd" + +# define instance number +inst_num=$1 +if [ ! -n "$1" ]; then + echo "Please clarify instance number from 1,2,3 or 4" + exit 1 +else + echo "Benchmarking with instance number:$1" +fi + +if [ ! -d "$OUTPUT_DIR" ]; then +mkdir $OUTPUT_DIR +fi +if [ ! -d "$LOG_DIR" ]; then +mkdir $LOG_DIR +fi +if [ ! 
-d "$RAWDATA_DIR" ]; then +mkdir $RAWDATA_DIR +fi + +# define different directories +dir_server=("" "_s2" "_s3" "_s4") +ckreadSql=" + CREATE TABLE customer + ( + C_CUSTKEY UInt32, + C_NAME String, + C_ADDRESS String, + C_CITY LowCardinality(String), + C_NATION LowCardinality(String), + C_REGION LowCardinality(String), + C_PHONE String, + C_MKTSEGMENT LowCardinality(String) + ) + ENGINE = MergeTree ORDER BY (C_CUSTKEY); + + CREATE TABLE lineorder + ( + LO_ORDERKEY UInt32, + LO_LINENUMBER UInt8, + LO_CUSTKEY UInt32, + LO_PARTKEY UInt32, + LO_SUPPKEY UInt32, + LO_ORDERDATE Date, + LO_ORDERPRIORITY LowCardinality(String), + LO_SHIPPRIORITY UInt8, + LO_QUANTITY UInt8, + LO_EXTENDEDPRICE UInt32, + LO_ORDTOTALPRICE UInt32, + LO_DISCOUNT UInt8, + LO_REVENUE UInt32, + LO_SUPPLYCOST UInt32, + LO_TAX UInt8, + LO_COMMITDATE Date, + LO_SHIPMODE LowCardinality(String) + ) + ENGINE = MergeTree PARTITION BY toYear(LO_ORDERDATE) ORDER BY (LO_ORDERDATE, LO_ORDERKEY); + + CREATE TABLE part + ( + P_PARTKEY UInt32, + P_NAME String, + P_MFGR LowCardinality(String), + P_CATEGORY LowCardinality(String), + P_BRAND LowCardinality(String), + P_COLOR LowCardinality(String), + P_TYPE LowCardinality(String), + P_SIZE UInt8, + P_CONTAINER LowCardinality(String) + ) + ENGINE = MergeTree ORDER BY P_PARTKEY; + + CREATE TABLE supplier + ( + S_SUPPKEY UInt32, + S_NAME String, + S_ADDRESS String, + S_CITY LowCardinality(String), + S_NATION LowCardinality(String), + S_REGION LowCardinality(String), + S_PHONE String + ) + ENGINE = MergeTree ORDER BY S_SUPPKEY; +" +supplier_table=" + CREATE TABLE supplier + ( + S_SUPPKEY UInt32, + S_NAME String, + S_ADDRESS String, + S_CITY LowCardinality(String), + S_NATION LowCardinality(String), + S_REGION LowCardinality(String), + S_PHONE String + ) + ENGINE = MergeTree ORDER BY S_SUPPKEY; +" +part_table=" + CREATE TABLE part + ( + P_PARTKEY UInt32, + P_NAME String, + P_MFGR LowCardinality(String), + P_CATEGORY LowCardinality(String), + P_BRAND LowCardinality(String), + P_COLOR LowCardinality(String), + P_TYPE LowCardinality(String), + P_SIZE UInt8, + P_CONTAINER LowCardinality(String) + ) + ENGINE = MergeTree ORDER BY P_PARTKEY; +" +lineorder_table=" + CREATE TABLE lineorder + ( + LO_ORDERKEY UInt32, + LO_LINENUMBER UInt8, + LO_CUSTKEY UInt32, + LO_PARTKEY UInt32, + LO_SUPPKEY UInt32, + LO_ORDERDATE Date, + LO_ORDERPRIORITY LowCardinality(String), + LO_SHIPPRIORITY UInt8, + LO_QUANTITY UInt8, + LO_EXTENDEDPRICE UInt32, + LO_ORDTOTALPRICE UInt32, + LO_DISCOUNT UInt8, + LO_REVENUE UInt32, + LO_SUPPLYCOST UInt32, + LO_TAX UInt8, + LO_COMMITDATE Date, + LO_SHIPMODE LowCardinality(String) + ) + ENGINE = MergeTree PARTITION BY toYear(LO_ORDERDATE) ORDER BY (LO_ORDERDATE, LO_ORDERKEY); +" +customer_table=" + CREATE TABLE customer + ( + C_CUSTKEY UInt32, + C_NAME String, + C_ADDRESS String, + C_CITY LowCardinality(String), + C_NATION LowCardinality(String), + C_REGION LowCardinality(String), + C_PHONE String, + C_MKTSEGMENT LowCardinality(String) + ) + ENGINE = MergeTree ORDER BY (C_CUSTKEY); +" + +lineorder_flat_table=" + SET max_memory_usage = 20000000000; + CREATE TABLE lineorder_flat + ENGINE = MergeTree + PARTITION BY toYear(LO_ORDERDATE) + ORDER BY (LO_ORDERDATE, LO_ORDERKEY) AS + SELECT + l.LO_ORDERKEY AS LO_ORDERKEY, + l.LO_LINENUMBER AS LO_LINENUMBER, + l.LO_CUSTKEY AS LO_CUSTKEY, + l.LO_PARTKEY AS LO_PARTKEY, + l.LO_SUPPKEY AS LO_SUPPKEY, + l.LO_ORDERDATE AS LO_ORDERDATE, + l.LO_ORDERPRIORITY AS LO_ORDERPRIORITY, + l.LO_SHIPPRIORITY AS LO_SHIPPRIORITY, + l.LO_QUANTITY AS LO_QUANTITY, + 
l.LO_EXTENDEDPRICE AS LO_EXTENDEDPRICE, + l.LO_ORDTOTALPRICE AS LO_ORDTOTALPRICE, + l.LO_DISCOUNT AS LO_DISCOUNT, + l.LO_REVENUE AS LO_REVENUE, + l.LO_SUPPLYCOST AS LO_SUPPLYCOST, + l.LO_TAX AS LO_TAX, + l.LO_COMMITDATE AS LO_COMMITDATE, + l.LO_SHIPMODE AS LO_SHIPMODE, + c.C_NAME AS C_NAME, + c.C_ADDRESS AS C_ADDRESS, + c.C_CITY AS C_CITY, + c.C_NATION AS C_NATION, + c.C_REGION AS C_REGION, + c.C_PHONE AS C_PHONE, + c.C_MKTSEGMENT AS C_MKTSEGMENT, + s.S_NAME AS S_NAME, + s.S_ADDRESS AS S_ADDRESS, + s.S_CITY AS S_CITY, + s.S_NATION AS S_NATION, + s.S_REGION AS S_REGION, + s.S_PHONE AS S_PHONE, + p.P_NAME AS P_NAME, + p.P_MFGR AS P_MFGR, + p.P_CATEGORY AS P_CATEGORY, + p.P_BRAND AS P_BRAND, + p.P_COLOR AS P_COLOR, + p.P_TYPE AS P_TYPE, + p.P_SIZE AS P_SIZE, + p.P_CONTAINER AS P_CONTAINER + FROM lineorder AS l + INNER JOIN customer AS c ON c.C_CUSTKEY = l.LO_CUSTKEY + INNER JOIN supplier AS s ON s.S_SUPPKEY = l.LO_SUPPKEY + INNER JOIN part AS p ON p.P_PARTKEY = l.LO_PARTKEY; + show settings ilike 'max_memory_usage'; +" + +function insert_data(){ + echo "insert_data:$1" + create_table_prefix="clickhouse client --host ${ckhost} --port $2 --multiquery -q" + insert_data_prefix="clickhouse client --query " + case $1 in + all) + clickhouse client --host ${ckhost} --port $2 --multiquery -q"$ckreadSql" && { + ${insert_data_prefix} "INSERT INTO customer FORMAT CSV" < ${RAWDATA_DIR}/ssb-dbgen/customer.tbl --port=$2 + ${insert_data_prefix} "INSERT INTO part FORMAT CSV" < ${RAWDATA_DIR}/ssb-dbgen/part.tbl --port=$2 + ${insert_data_prefix} "INSERT INTO supplier FORMAT CSV" < ${RAWDATA_DIR}/ssb-dbgen/supplier.tbl --port=$2 + ${insert_data_prefix} "INSERT INTO lineorder FORMAT CSV" < ${RAWDATA_DIR}/ssb-dbgen/lineorder.tbl --port=$2 + } + ${create_table_prefix}"${lineorder_flat_table}" + ;; + customer) + echo ${create_table_prefix}\"${customer_table}\" + ${create_table_prefix}"${customer_table}" && { + echo "${insert_data_prefix} \"INSERT INTO $1 FORMAT CSV\" < ${RAWDATA_DIR}/ssb-dbgen/$1.tbl --port=$2" + ${insert_data_prefix} "INSERT INTO $1 FORMAT CSV" < ${RAWDATA_DIR}/ssb-dbgen/$1.tbl --port=$2 + } + ;; + part) + echo ${create_table_prefix}\"${part_table}\" + ${create_table_prefix}"${part_table}" && { + echo "${insert_data_prefix} \"INSERT INTO $1 FORMAT CSV\" < ${RAWDATA_DIR}/ssb-dbgen/$1.tbl --port=$2" + ${insert_data_prefix} "INSERT INTO $1 FORMAT CSV" < ${RAWDATA_DIR}/ssb-dbgen/$1.tbl --port=$2 + } + ;; + supplier) + echo ${create_table_prefix}"${supplier_table}" + ${create_table_prefix}"${supplier_table}" && { + echo "${insert_data_prefix} \"INSERT INTO $1 FORMAT CSV\" < ${RAWDATA_DIR}/ssb-dbgen/$1.tbl --port=$2" + ${insert_data_prefix} "INSERT INTO $1 FORMAT CSV" < ${RAWDATA_DIR}/ssb-dbgen/$1.tbl --port=$2 + } + ;; + lineorder) + echo ${create_table_prefix}"${lineorder_table}" + ${create_table_prefix}"${lineorder_table}" && { + echo "${insert_data_prefix} \"INSERT INTO $1 FORMAT CSV\" < ${RAWDATA_DIR}/ssb-dbgen/$1.tbl --port=$2" + ${insert_data_prefix} "INSERT INTO $1 FORMAT CSV" < ${RAWDATA_DIR}/ssb-dbgen/$1.tbl --port=$2 + } + ;; + lineorder_flat) + echo ${create_table_prefix}"${lineorder_flat_table}" + ${create_table_prefix}"${lineorder_flat_table}" + return 0 + ;; + *) + exit 0 + ;; + + esac +} + +function check_sql(){ + select_sql="select * from "$1" limit 1" + clickhouse client --host ${ckhost} --port $2 --multiquery -q"${select_sql}" +} + +function check_table(){ + checknum=0 + source_tables="customer part supplier lineorder lineorder_flat" + test_tables=${1:-${source_tables}} + echo 
"Checking table data required in server..." + for i in $(seq 0 $[inst_num-1]) + do + for j in `echo ${test_tables}` + do + check_sql $j ${ckport[i]} &> /dev/null || { + let checknum+=1 && insert_data "$j" ${ckport[i]} + } + done + done + + for i in $(seq 0 $[inst_num-1]) + do + echo "clickhouse client --host ${ckhost} --port ${ckport[i]} -m -q\"select count() from ${TABLE_NAME};\"" + var=$(clickhouse client --host ${ckhost} --port ${ckport[i]} -m -q"select count() from ${TABLE_NAME};") + if [ $var -eq $TALBE_ROWS ];then + echo "Instance_${i} Table data integrity check OK -> Rows:$var" + else + echo "Instance_${i} Table data integrity check Failed -> Rows:$var" + exit 1 + fi + done + if [ $checknum -gt 0 ];then + echo "Need sleep 10s after first table data insertion...$checknum" + sleep 10 + fi +} + +function check_instance(){ +instance_alive=0 +for i in {1..10} +do + sleep 1 + netstat -nltp | grep ${1} > /dev/null + if [ $? -ne 1 ];then + instance_alive=1 + break + fi + +done + +if [ $instance_alive -eq 0 ];then + echo "check_instance -> clickhouse server instance faild to launch due to 10s timeout!" + exit 1 +else + echo "check_instance -> clickhouse server instance launch successfully!" +fi +} + +function start_clickhouse_for_insertion(){ + echo "start_clickhouse_for_insertion" + for i in $(seq 0 $[inst_num-1]) + do + echo "cd ${database_dir}/$1${dir_server[i]}" + echo "${SERVER_BIND_CMD[i]} clickhouse server -C config_${1}${dir_server[i]}.xml >&${LOG_DIR}/${1}_${i}_server_log& > /dev/null" + + cd ${database_dir}/$1${dir_server[i]} + ${SERVER_BIND_CMD[i]} clickhouse server -C config_${1}${dir_server[i]}.xml >&${LOG_DIR}/${1}_${i}_server_log& > /dev/null + check_instance ${ckport[i]} + done +} + +function start_clickhouse_for_stressing(){ + echo "start_clickhouse_for_stressing" + for i in $(seq 0 $[inst_num-1]) + do + echo "cd ${database_dir}/$1${dir_server[i]}" + echo "${SERVER_BIND_CMD[i]} clickhouse server -C config_${1}${dir_server[i]}.xml >&/dev/null&" + + cd ${database_dir}/$1${dir_server[i]} + ${SERVER_BIND_CMD[i]} clickhouse server -C config_${1}${dir_server[i]}.xml >&/dev/null& + check_instance ${ckport[i]} + done +} +yum -y install git make gcc sudo net-tools &> /dev/null +pip3 install clickhouse_driver numpy &> /dev/null +test -d ${RAWDATA_DIR}/ssb-dbgen || git clone https://github.com/vadimtk/ssb-dbgen.git ${RAWDATA_DIR}/ssb-dbgen && cd ${RAWDATA_DIR}/ssb-dbgen + +if [ ! 
-f ${RAWDATA_DIR}/ssb-dbgen/dbgen ];then + make && { + test -f ${RAWDATA_DIR}/ssb-dbgen/customer.tbl || echo y |./dbgen -s ${SSB_GEN_FACTOR} -T c + test -f ${RAWDATA_DIR}/ssb-dbgen/part.tbl || echo y | ./dbgen -s ${SSB_GEN_FACTOR} -T p + test -f ${RAWDATA_DIR}/ssb-dbgen/supplier.tbl || echo y | ./dbgen -s ${SSB_GEN_FACTOR} -T s + test -f ${RAWDATA_DIR}/ssb-dbgen/date.tbl || echo y | ./dbgen -s ${SSB_GEN_FACTOR} -T d + test -f ${RAWDATA_DIR}/ssb-dbgen/lineorder.tbl || echo y | ./dbgen -s ${SSB_GEN_FACTOR} -T l + } +else + test -f ${RAWDATA_DIR}/ssb-dbgen/customer.tbl || echo y | ./dbgen -s ${SSB_GEN_FACTOR} -T c + test -f ${RAWDATA_DIR}/ssb-dbgen/part.tbl || echo y | ./dbgen -s ${SSB_GEN_FACTOR} -T p + test -f ${RAWDATA_DIR}/ssb-dbgen/supplier.tbl || echo y | ./dbgen -s ${SSB_GEN_FACTOR} -T s + test -f ${RAWDATA_DIR}/ssb-dbgen/date.tbl || echo y | ./dbgen -s ${SSB_GEN_FACTOR} -T d + test -f ${RAWDATA_DIR}/ssb-dbgen/lineorder.tbl || echo y | ./dbgen -s ${SSB_GEN_FACTOR} -T l + +fi + +filenum=`find ${RAWDATA_DIR}/ssb-dbgen/ -name "*.tbl" | wc -l` + +if [ $filenum -ne 5 ];then + echo "generate ssb data file *.tbl faild" + exit 1 +fi + +function kill_instance(){ +instance_alive=1 +for i in {1..2} +do + pkill clickhouse && sleep 5 + instance_alive=0 + for i in $(seq 0 $[inst_num-1]) + do + netstat -nltp | grep ${ckport[i]} > /dev/null + if [ $? -ne 1 ];then + instance_alive=1 + break; + fi + done + if [ $instance_alive -eq 0 ];then + break; + fi +done +if [ $instance_alive -eq 0 ];then + echo "kill_instance OK!" +else + echo "kill_instance Failed -> clickhouse server instance still alive due to 10s timeout" + exit 1 +fi +} + +function run_test(){ +is_xml=0 +for i in $(seq 0 $[inst_num-1]) +do + if [ -f ${database_dir}/${1}${dir_server[i]}/config_${1}${dir_server[i]}.xml ]; then + is_xml=$[is_xml+1] + fi +done +if [ $is_xml -eq $inst_num ];then + echo "Benchmark with $inst_num instance" + start_clickhouse_for_insertion ${1} + + for i in $(seq 0 $[inst_num-1]) + do + clickhouse client --host ${ckhost} --port ${ckport[i]} -m -q"show databases;" >/dev/null + done + + if [ $? -eq 0 ];then + check_table + fi + kill_instance + + if [ $1 == "deflate" ];then + test -f ${LOG_DIR}/${1}_server_log && deflatemsg=`cat ${LOG_DIR}/${1}_server_log | grep DeflateJobHWPool` + if [ -n "$deflatemsg" ];then + echo ------------------------------------------------------ + echo $deflatemsg + echo ------------------------------------------------------ + fi + fi + echo "Check table data required in server_${1} -> Done! " + + start_clickhouse_for_stressing ${1} + for i in $(seq 0 $[inst_num-1]) + do + clickhouse client --host ${ckhost} --port ${ckport[i]} -m -q"show databases;" >/dev/null + done + if [ $? -eq 0 ];then + test -d ${CLIENT_SCRIPTS_DIR} && cd ${CLIENT_SCRIPTS_DIR} + echo "Client stressing... " + echo "${CLIENT_BIND_CMD} python3 client_stressing_test.py ${QUERY_FILE} $inst_num &> ${LOG_DIR}/${1}.log" + ${CLIENT_BIND_CMD} python3 client_stressing_test.py ${QUERY_FILE} $inst_num &> ${LOG_DIR}/${1}.log + echo "Completed client stressing, checking log... 
" + finish_log=`grep "Finished" ${LOG_DIR}/${1}.log | wc -l` + if [ $finish_log -eq 1 ] ;then + kill_instance + test -f ${LOG_DIR}/${1}.log && echo "${1}.log ===> ${LOG_DIR}/${1}.log" + else + kill_instance + echo "No find 'Finished' in client log -> Performance test may fail" + exit 1 + + fi + + else + echo "${1} clickhouse server start fail" + exit 1 + fi +else + echo "clickhouse server start fail -> Please check xml files required in ${database_dir} for each instance" + exit 1 + +fi +} +function clear_log(){ + if [ -d "$LOG_DIR" ]; then + cd ${LOG_DIR} && rm -rf * + fi +} + +function gather_log_for_codec(){ + cd ${OUTPUT_DIR} && mkdir -p ${LOG_PACK_FILE}/${1} + cp -rf ${LOG_DIR} ${OUTPUT_DIR}/${LOG_PACK_FILE}/${1} +} + +function pack_log(){ + if [ -e "${OUTPUT_DIR}/run.log" ]; then + cp ${OUTPUT_DIR}/run.log ${OUTPUT_DIR}/${LOG_PACK_FILE}/ + fi + echo "Please check all log information in ${OUTPUT_DIR}/${LOG_PACK_FILE}" +} + +function setup_check(){ + + iax_dev_num=`accel-config list | grep iax | wc -l` + if [ $iax_dev_num -eq 0 ] ;then + iax_dev_num=`accel-config list | grep iax | wc -l` + if [ $iax_dev_num -eq 0 ] ;then + echo "No IAA devices available -> Please check IAA hardware setup manually!" + exit 1 + else + echo "IAA enabled devices number:$iax_dev_num" + fi + else + echo "IAA enabled devices number:$iax_dev_num" + fi + libaccel_version=`accel-config -v` + clickhouser_version=`clickhouse server --version` + kernel_dxd_log=`dmesg | grep dxd` + echo "libaccel_version:$libaccel_version" + echo "clickhouser_version:$clickhouser_version" + echo -e "idxd section in kernel log:\n$kernel_dxd_log" +} + +setup_check +export CLICKHOUSE_WATCHDOG_ENABLE=0 +for i in ${CODEC_CONFIG[@]} +do + clear_log + codec=${i} + echo "run test------------$codec" + run_test $codec + gather_log_for_codec $codec +done + +pack_log +echo "Done." 
\ No newline at end of file diff --git a/contrib/qpl-cmake/benchmark_sample/client_scripts/client_stressing_test.py b/contrib/qpl-cmake/benchmark_sample/client_scripts/client_stressing_test.py new file mode 100644 index 00000000000..f12381a198c --- /dev/null +++ b/contrib/qpl-cmake/benchmark_sample/client_scripts/client_stressing_test.py @@ -0,0 +1,278 @@ +from operator import eq +import os +import random +import time +import sys +from clickhouse_driver import Client +import numpy as np +import subprocess +import multiprocessing +from multiprocessing import Manager + +warmup_runs = 10 +calculated_runs = 10 +seconds = 30 +max_instances_number = 8 +retest_number = 3 +retest_tolerance = 10 + + +def checkInt(str): + try: + int(str) + return True + except ValueError: + return False + + +def setup_client(index): + if index < 4: + port_idx = index + else: + port_idx = index + 4 + client = Client( + host="localhost", + database="default", + user="default", + password="", + port="900%d" % port_idx, + ) + union_mode_query = "SET union_default_mode='DISTINCT'" + client.execute(union_mode_query) + return client + + +def warm_client(clientN, clientL, query, loop): + for c_idx in range(clientN): + for _ in range(loop): + clientL[c_idx].execute(query) + + +def read_queries(queries_list): + queries = list() + queries_id = list() + with open(queries_list, "r") as f: + for line in f: + line = line.rstrip() + line = line.split("$") + queries_id.append(line[0]) + queries.append(line[1]) + return queries_id, queries + + +def run_task(client, cname, query, loop, query_latency): + start_time = time.time() + for i in range(loop): + client.execute(query) + query_latency.append(client.last_query.elapsed) + + end_time = time.time() + p95 = np.percentile(query_latency, 95) + print( + "CLIENT: {0} end. 
-> P95: %f, qps: %f".format(cname) + % (p95, loop / (end_time - start_time)) + ) + + +def run_multi_clients(clientN, clientList, query, loop): + client_pids = {} + start_time = time.time() + manager = multiprocessing.Manager() + query_latency_list0 = manager.list() + query_latency_list1 = manager.list() + query_latency_list2 = manager.list() + query_latency_list3 = manager.list() + query_latency_list4 = manager.list() + query_latency_list5 = manager.list() + query_latency_list6 = manager.list() + query_latency_list7 = manager.list() + + for c_idx in range(clientN): + client_name = "Role_%d" % c_idx + if c_idx == 0: + client_pids[c_idx] = multiprocessing.Process( + target=run_task, + args=(clientList[c_idx], client_name, query, loop, query_latency_list0), + ) + elif c_idx == 1: + client_pids[c_idx] = multiprocessing.Process( + target=run_task, + args=(clientList[c_idx], client_name, query, loop, query_latency_list1), + ) + elif c_idx == 2: + client_pids[c_idx] = multiprocessing.Process( + target=run_task, + args=(clientList[c_idx], client_name, query, loop, query_latency_list2), + ) + elif c_idx == 3: + client_pids[c_idx] = multiprocessing.Process( + target=run_task, + args=(clientList[c_idx], client_name, query, loop, query_latency_list3), + ) + elif c_idx == 4: + client_pids[c_idx] = multiprocessing.Process( + target=run_task, + args=(clientList[c_idx], client_name, query, loop, query_latency_list4), + ) + elif c_idx == 5: + client_pids[c_idx] = multiprocessing.Process( + target=run_task, + args=(clientList[c_idx], client_name, query, loop, query_latency_list5), + ) + elif c_idx == 6: + client_pids[c_idx] = multiprocessing.Process( + target=run_task, + args=(clientList[c_idx], client_name, query, loop, query_latency_list6), + ) + elif c_idx == 7: + client_pids[c_idx] = multiprocessing.Process( + target=run_task, + args=(clientList[c_idx], client_name, query, loop, query_latency_list7), + ) + else: + print("ERROR: CLIENT number dismatch!!") + exit() + print("CLIENT: %s start" % client_name) + client_pids[c_idx].start() + + for c_idx in range(clientN): + client_pids[c_idx].join() + end_time = time.time() + totalT = end_time - start_time + + query_latencyTotal = list() + for item in query_latency_list0: + query_latencyTotal.append(item) + for item in query_latency_list1: + query_latencyTotal.append(item) + for item in query_latency_list2: + query_latencyTotal.append(item) + for item in query_latency_list3: + query_latencyTotal.append(item) + for item in query_latency_list4: + query_latencyTotal.append(item) + for item in query_latency_list5: + query_latencyTotal.append(item) + for item in query_latency_list6: + query_latencyTotal.append(item) + for item in query_latency_list7: + query_latencyTotal.append(item) + + totalP95 = np.percentile(query_latencyTotal, 95) * 1000 + return totalT, totalP95 + + +def run_task_caculated(client, cname, query, loop): + query_latency = list() + start_time = time.time() + for i in range(loop): + client.execute(query) + query_latency.append(client.last_query.elapsed) + end_time = time.time() + p95 = np.percentile(query_latency, 95) + + +def run_multi_clients_caculated(clientN, clientList, query, loop): + client_pids = {} + start_time = time.time() + for c_idx in range(clientN): + client_name = "Role_%d" % c_idx + client_pids[c_idx] = multiprocessing.Process( + target=run_task_caculated, + args=(clientList[c_idx], client_name, query, loop), + ) + client_pids[c_idx].start() + for c_idx in range(clientN): + client_pids[c_idx].join() + end_time = time.time() + 
totalT = end_time - start_time + return totalT + + +if __name__ == "__main__": + client_number = 1 + queries = list() + queries_id = list() + + if len(sys.argv) != 3: + print( + "usage: python3 client_stressing_test.py [queries_file_path] [client_number]" + ) + sys.exit() + else: + queries_list = sys.argv[1] + client_number = int(sys.argv[2]) + print( + "queries_file_path: %s, client_number: %d" % (queries_list, client_number) + ) + if not os.path.isfile(queries_list) or not os.access(queries_list, os.R_OK): + print("please check the right path for queries file") + sys.exit() + if ( + not checkInt(sys.argv[2]) + or int(sys.argv[2]) > max_instances_number + or int(sys.argv[2]) < 1 + ): + print("client_number should be in [1~%d]" % max_instances_number) + sys.exit() + + client_list = {} + queries_id, queries = read_queries(queries_list) + + for c_idx in range(client_number): + client_list[c_idx] = setup_client(c_idx) + # clear cache + os.system("sync; echo 3 > /proc/sys/vm/drop_caches") + + print("###Polit Run Begin") + for i in queries: + warm_client(client_number, client_list, i, 1) + print("###Polit Run End -> Start stressing....") + + query_index = 0 + for q in queries: + print( + "\n###START -> Index: %d, ID: %s, Query: %s" + % (query_index, queries_id[query_index], q) + ) + warm_client(client_number, client_list, q, warmup_runs) + print("###Warm Done!") + for j in range(0, retest_number): + totalT = run_multi_clients_caculated( + client_number, client_list, q, calculated_runs + ) + curr_loop = int(seconds * calculated_runs / totalT) + 1 + print( + "###Calculation Done! -> loopN: %d, expected seconds:%d" + % (curr_loop, seconds) + ) + + print("###Stress Running! -> %d iterations......" % curr_loop) + + totalT, totalP95 = run_multi_clients( + client_number, client_list, q, curr_loop + ) + + if totalT > (seconds - retest_tolerance) and totalT < ( + seconds + retest_tolerance + ): + break + else: + print( + "###totalT:%d is far way from expected seconds:%d. Run again ->j:%d!" + % (totalT, seconds, j) + ) + + print( + "###Completed! 
-> ID: %s, clientN: %d, totalT: %.2f s, latencyAVG: %.2f ms, P95: %.2f ms, QPS_Final: %.2f" + % ( + queries_id[query_index], + client_number, + totalT, + totalT * 1000 / (curr_loop * client_number), + totalP95, + ((curr_loop * client_number) / totalT), + ) + ) + query_index += 1 + print("###Finished!") diff --git a/contrib/qpl-cmake/benchmark_sample/client_scripts/queries_ssb.sql b/contrib/qpl-cmake/benchmark_sample/client_scripts/queries_ssb.sql new file mode 100644 index 00000000000..abf2df6503a --- /dev/null +++ b/contrib/qpl-cmake/benchmark_sample/client_scripts/queries_ssb.sql @@ -0,0 +1,10 @@ +Q1.1$SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toYear(LO_ORDERDATE) = 1993 AND LO_DISCOUNT BETWEEN 1 AND 3 AND LO_QUANTITY < 25; +Q2.1$SELECT sum(LO_REVENUE),toYear(LO_ORDERDATE) AS year,P_BRAND FROM lineorder_flat WHERE P_CATEGORY = 'MFGR#12' AND S_REGION = 'AMERICA' GROUP BY year,P_BRAND ORDER BY year,P_BRAND; +Q2.2$SELECT sum(LO_REVENUE),toYear(LO_ORDERDATE) AS year,P_BRAND FROM lineorder_flat WHERE P_BRAND >= 'MFGR#2221' AND P_BRAND <= 'MFGR#2228' AND S_REGION = 'ASIA' GROUP BY year,P_BRAND ORDER BY year,P_BRAND; +Q2.3$SELECT sum(LO_REVENUE),toYear(LO_ORDERDATE) AS year,P_BRAND FROM lineorder_flat WHERE P_BRAND = 'MFGR#2239' AND S_REGION = 'EUROPE' GROUP BY year,P_BRAND ORDER BY year,P_BRAND; +Q3.1$SELECT C_NATION,S_NATION,toYear(LO_ORDERDATE) AS year,sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE C_REGION = 'ASIA' AND S_REGION = 'ASIA' AND year >= 1992 AND year <= 1997 GROUP BY C_NATION,S_NATION,year ORDER BY year ASC,revenue DESC; +Q3.2$SELECT C_CITY,S_CITY,toYear(LO_ORDERDATE) AS year,sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE C_NATION = 'UNITED STATES' AND S_NATION = 'UNITED STATES' AND year >= 1992 AND year <= 1997 GROUP BY C_CITY,S_CITY,year ORDER BY year ASC,revenue DESC; +Q3.3$SELECT C_CITY,S_CITY,toYear(LO_ORDERDATE) AS year,sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND year >= 1992 AND year <= 1997 GROUP BY C_CITY,S_CITY,year ORDER BY year ASC,revenue DESC; +Q4.1$SELECT toYear(LO_ORDERDATE) AS year,C_NATION,sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') GROUP BY year,C_NATION ORDER BY year ASC,C_NATION ASC; +Q4.2$SELECT toYear(LO_ORDERDATE) AS year,S_NATION,P_CATEGORY,sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (year = 1997 OR year = 1998) AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') GROUP BY year,S_NATION,P_CATEGORY ORDER BY year ASC,S_NATION ASC,P_CATEGORY ASC; +Q4.3$SELECT toYear(LO_ORDERDATE) AS year,S_CITY,P_BRAND,sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE S_NATION = 'UNITED STATES' AND (year = 1997 OR year = 1998) AND P_CATEGORY = 'MFGR#14' GROUP BY year,S_CITY,P_BRAND ORDER BY year ASC,S_CITY ASC,P_BRAND ASC; diff --git a/contrib/qpl-cmake/benchmark_sample/client_scripts/run_ssb.sh b/contrib/qpl-cmake/benchmark_sample/client_scripts/run_ssb.sh new file mode 100644 index 00000000000..6067b1058f2 --- /dev/null +++ b/contrib/qpl-cmake/benchmark_sample/client_scripts/run_ssb.sh @@ -0,0 +1,6 @@ +WORKING_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/.." +if [ ! 
-d "${WORKING_DIR}/output" ]; then +mkdir ${WORKING_DIR}/output +fi +bash allin1_ssb.sh 2 > ${WORKING_DIR}/output/run.log +echo "Please check log in: ${WORKING_DIR}/output/run.log" \ No newline at end of file diff --git a/contrib/qpl-cmake/benchmark_sample/database_dir/deflate/config_deflate.xml b/contrib/qpl-cmake/benchmark_sample/database_dir/deflate/config_deflate.xml new file mode 100644 index 00000000000..ab77a9cdcbe --- /dev/null +++ b/contrib/qpl-cmake/benchmark_sample/database_dir/deflate/config_deflate.xml @@ -0,0 +1,49 @@ + + + + + trace + true + + + 8123 + 9000 + 9004 + + ./ + + 8589934592 + 5368709120 + true + + + + deflate_qpl + + + + + + + + + ::/0 + + + default + default + 1 + + + + + + + + + + + diff --git a/contrib/qpl-cmake/benchmark_sample/database_dir/deflate_s2/config_deflate_s2.xml b/contrib/qpl-cmake/benchmark_sample/database_dir/deflate_s2/config_deflate_s2.xml new file mode 100644 index 00000000000..b71456486f5 --- /dev/null +++ b/contrib/qpl-cmake/benchmark_sample/database_dir/deflate_s2/config_deflate_s2.xml @@ -0,0 +1,49 @@ + + + + + trace + true + + + 8124 + 9001 + 9005 + + ./ + + 8589934592 + 5368709120 + true + + + + deflate_qpl + + + + + + + + + ::/0 + + + default + default + 1 + + + + + + + + + + + diff --git a/contrib/qpl-cmake/benchmark_sample/database_dir/lz4/config_lz4.xml b/contrib/qpl-cmake/benchmark_sample/database_dir/lz4/config_lz4.xml new file mode 100644 index 00000000000..f4dc59b60aa --- /dev/null +++ b/contrib/qpl-cmake/benchmark_sample/database_dir/lz4/config_lz4.xml @@ -0,0 +1,49 @@ + + + + + trace + true + + + 8123 + 9000 + 9004 + + ./ + + 8589934592 + 5368709120 + true + + + + lz4 + + + + + + + + + ::/0 + + + default + default + 1 + + + + + + + + + + + diff --git a/contrib/qpl-cmake/benchmark_sample/database_dir/lz4_s2/config_lz4_s2.xml b/contrib/qpl-cmake/benchmark_sample/database_dir/lz4_s2/config_lz4_s2.xml new file mode 100644 index 00000000000..357db8942d7 --- /dev/null +++ b/contrib/qpl-cmake/benchmark_sample/database_dir/lz4_s2/config_lz4_s2.xml @@ -0,0 +1,49 @@ + + + + + trace + true + + + 8124 + 9001 + 9005 + + ./ + + 8589934592 + 5368709120 + true + + + + lz4 + + + + + + + + + ::/0 + + + default + default + 1 + + + + + + + + + + + diff --git a/contrib/qpl-cmake/benchmark_sample/database_dir/zstd/config_zstd.xml b/contrib/qpl-cmake/benchmark_sample/database_dir/zstd/config_zstd.xml new file mode 100644 index 00000000000..1c4c738edaf --- /dev/null +++ b/contrib/qpl-cmake/benchmark_sample/database_dir/zstd/config_zstd.xml @@ -0,0 +1,49 @@ + + + + + trace + true + + + 8123 + 9000 + 9004 + + ./ + + 8589934592 + 5368709120 + true + + + + zstd + + + + + + + + + ::/0 + + + default + default + 1 + + + + + + + + + + + diff --git a/contrib/qpl-cmake/benchmark_sample/database_dir/zstd_s2/config_zstd_s2.xml b/contrib/qpl-cmake/benchmark_sample/database_dir/zstd_s2/config_zstd_s2.xml new file mode 100644 index 00000000000..f3db01b7739 --- /dev/null +++ b/contrib/qpl-cmake/benchmark_sample/database_dir/zstd_s2/config_zstd_s2.xml @@ -0,0 +1,49 @@ + + + + + trace + true + + + 8124 + 9001 + 9005 + + ./ + + 8589934592 + 5368709120 + true + + + + zstd + + + + + + + + + ::/0 + + + default + default + 1 + + + + + + + + + + + diff --git a/contrib/vectorscan b/contrib/vectorscan index f6250ae3e5a..b4bba94b1a2 160000 --- a/contrib/vectorscan +++ b/contrib/vectorscan @@ -1 +1 @@ -Subproject commit f6250ae3e5a3085000239313ad0689cc1e00cdc2 +Subproject commit b4bba94b1a250603b0b198e0394946e32f6c3f30 diff --git a/docker/test/performance-comparison/download.sh 
b/docker/test/performance-comparison/download.sh index e3df98c7da1..aee11030068 100755 --- a/docker/test/performance-comparison/download.sh +++ b/docker/test/performance-comparison/download.sh @@ -3,7 +3,9 @@ set -ex set -o pipefail trap "exit" INT TERM trap 'kill $(jobs -pr) ||:' EXIT +S3_URL=${S3_URL:="https://clickhouse-builds.s3.amazonaws.com"} BUILD_NAME=${BUILD_NAME:-package_release} +export S3_URL BUILD_NAME mkdir db0 ||: mkdir left ||: @@ -28,8 +30,9 @@ function download # Historically there were various paths for the performance test package. # Test all of them. declare -a urls_to_try=( - "https://s3.amazonaws.com/clickhouse-builds/$left_pr/$left_sha/$BUILD_NAME/performance.tar.zst" - "https://s3.amazonaws.com/clickhouse-builds/$left_pr/$left_sha/$BUILD_NAME/performance.tgz" + "$S3_URL/PRs/$left_pr/$left_sha/$BUILD_NAME/performance.tar.zst" + "$S3_URL/$left_pr/$left_sha/$BUILD_NAME/performance.tar.zst" + "$S3_URL/$left_pr/$left_sha/$BUILD_NAME/performance.tgz" ) for path in "${urls_to_try[@]}" diff --git a/docker/test/performance-comparison/entrypoint.sh b/docker/test/performance-comparison/entrypoint.sh index 75b25412ac4..74571777be0 100755 --- a/docker/test/performance-comparison/entrypoint.sh +++ b/docker/test/performance-comparison/entrypoint.sh @@ -6,11 +6,7 @@ export CHPC_CHECK_START_TIMESTAMP S3_URL=${S3_URL:="https://clickhouse-builds.s3.amazonaws.com"} BUILD_NAME=${BUILD_NAME:-package_release} - -COMMON_BUILD_PREFIX="/clickhouse_build_check" -if [[ $S3_URL == *"s3.amazonaws.com"* ]]; then - COMMON_BUILD_PREFIX="" -fi +export S3_URL BUILD_NAME # Sometimes AWS responde with DNS error and it's impossible to retry it with # current curl version options. @@ -66,8 +62,9 @@ function find_reference_sha # test all of them. unset found declare -a urls_to_try=( - "https://s3.amazonaws.com/clickhouse-builds/0/$REF_SHA/$BUILD_NAME/performance.tar.zst" - "https://s3.amazonaws.com/clickhouse-builds/0/$REF_SHA/$BUILD_NAME/performance.tgz" + "$S3_URL/PRs/0/$REF_SHA/$BUILD_NAME/performance.tar.zst" + "$S3_URL/0/$REF_SHA/$BUILD_NAME/performance.tar.zst" + "$S3_URL/0/$REF_SHA/$BUILD_NAME/performance.tgz" ) for path in "${urls_to_try[@]}" do @@ -92,10 +89,15 @@ chmod 777 workspace output cd workspace # Download the package for the version we are going to test. 
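[Editorial aside, not part of the patch] The download.sh hunk above and the entrypoint.sh hunk that follows replace the single hard-coded s3.amazonaws.com path with a list of candidate URLs: the new "$S3_URL/PRs/..." layout is tried first, then the legacy layout. A rough Python sketch of that first-hit-wins probing; the helper name and the placeholder PR/commit values are made up for illustration:

    import requests

    def first_available(prefixes, suffix, timeout=10):
        # Probe candidate URLs in order and return the first one that exists,
        # mirroring the fallback loop added to the shell scripts.
        for prefix in prefixes:
            url = f"{prefix}/{suffix}"
            try:
                if requests.head(url, allow_redirects=True, timeout=timeout).ok:
                    return url
            except requests.RequestException:
                continue
        return None

    s3_url = "https://clickhouse-builds.s3.amazonaws.com"
    suffix = "0/0000000000000000000000000000000000000000/package_release/performance.tar.zst"  # placeholder
    print(first_available([f"{s3_url}/PRs", s3_url], suffix))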
-if curl_with_retry "$S3_URL/$PR_TO_TEST/$SHA_TO_TEST$COMMON_BUILD_PREFIX/$BUILD_NAME/performance.tar.zst" -then - right_path="$S3_URL/$PR_TO_TEST/$SHA_TO_TEST$COMMON_BUILD_PREFIX/$BUILD_NAME/performance.tar.zst" -fi +# A temporary solution for migrating into PRs directory +for prefix in "$S3_URL/PRs" "$S3_URL"; +do + if curl_with_retry "$prefix/$PR_TO_TEST/$SHA_TO_TEST/$BUILD_NAME/performance.tar.zst" + then + right_path="$prefix/$PR_TO_TEST/$SHA_TO_TEST/$BUILD_NAME/performance.tar.zst" + break + fi +done mkdir right wget -nv -nd -c "$right_path" -O- | tar -C right --no-same-owner --strip-components=1 --zstd --extract --verbose diff --git a/docker/test/performance-comparison/perf.py b/docker/test/performance-comparison/perf.py index 65bf49c2914..7a4e6386d0d 100755 --- a/docker/test/performance-comparison/perf.py +++ b/docker/test/performance-comparison/perf.py @@ -26,6 +26,7 @@ logging.basicConfig( total_start_seconds = time.perf_counter() stage_start_seconds = total_start_seconds + # Thread executor that does not hides exception that happens during function # execution, and rethrows it after join() class SafeThread(Thread): @@ -158,6 +159,7 @@ for e in subst_elems: available_parameters[name] = values + # Takes parallel lists of templates, substitutes them with all combos of # parameters. The set of parameters is determined based on the first list. # Note: keep the order of queries -- sometimes we have DROP IF EXISTS diff --git a/docker/test/performance-comparison/report.py b/docker/test/performance-comparison/report.py index 782cf29863c..214f2d550b4 100755 --- a/docker/test/performance-comparison/report.py +++ b/docker/test/performance-comparison/report.py @@ -670,7 +670,6 @@ if args.report == "main": ) elif args.report == "all-queries": - print((header_template.format())) add_tested_commits() diff --git a/docker/test/stateful/run.sh b/docker/test/stateful/run.sh index e7a400b8216..c973b6c6ec6 100755 --- a/docker/test/stateful/run.sh +++ b/docker/test/stateful/run.sh @@ -128,7 +128,7 @@ function run_tests() set +e if [[ -n "$USE_PARALLEL_REPLICAS" ]] && [[ "$USE_PARALLEL_REPLICAS" -eq 1 ]]; then - clickhouse-test --client="clickhouse-client --use_hedged_requests=0 --allow_experimental_parallel_reading_from_replicas=1 \ + clickhouse-test --client="clickhouse-client --use_hedged_requests=0 --allow_experimental_parallel_reading_from_replicas=1 --parallel_replicas_for_non_replicated_merge_tree=1 \ --max_parallel_replicas=100 --cluster_for_parallel_replicas='parallel_replicas'" \ -j 2 --testname --shard --zookeeper --check-zookeeper-session --no-stateless --no-parallel-replicas --hung-check --print-time "${ADDITIONAL_OPTIONS[@]}" \ "$SKIP_TESTS_OPTION" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt diff --git a/docker/test/stateful/s3downloader b/docker/test/stateful/s3downloader index b1302877d6a..96f2aa96dd5 100755 --- a/docker/test/stateful/s3downloader +++ b/docker/test/stateful/s3downloader @@ -10,31 +10,38 @@ import requests import tempfile -DEFAULT_URL = 'https://clickhouse-datasets.s3.amazonaws.com' +DEFAULT_URL = "https://clickhouse-datasets.s3.amazonaws.com" AVAILABLE_DATASETS = { - 'hits': 'hits_v1.tar', - 'visits': 'visits_v1.tar', + "hits": "hits_v1.tar", + "visits": "visits_v1.tar", } RETRIES_COUNT = 5 + def _get_temp_file_name(): - return os.path.join(tempfile._get_default_tempdir(), next(tempfile._get_candidate_names())) + return os.path.join( + tempfile._get_default_tempdir(), next(tempfile._get_candidate_names()) + ) + def build_url(base_url, dataset): - 
return os.path.join(base_url, dataset, 'partitions', AVAILABLE_DATASETS[dataset]) + return os.path.join(base_url, dataset, "partitions", AVAILABLE_DATASETS[dataset]) + def dowload_with_progress(url, path): logging.info("Downloading from %s to temp path %s", url, path) for i in range(RETRIES_COUNT): try: - with open(path, 'wb') as f: + with open(path, "wb") as f: response = requests.get(url, stream=True) response.raise_for_status() - total_length = response.headers.get('content-length') + total_length = response.headers.get("content-length") if total_length is None or int(total_length) == 0: - logging.info("No content-length, will download file without progress") + logging.info( + "No content-length, will download file without progress" + ) f.write(response.content) else: dl = 0 @@ -46,7 +53,11 @@ def dowload_with_progress(url, path): if sys.stdout.isatty(): done = int(50 * dl / total_length) percent = int(100 * float(dl) / total_length) - sys.stdout.write("\r[{}{}] {}%".format('=' * done, ' ' * (50-done), percent)) + sys.stdout.write( + "\r[{}{}] {}%".format( + "=" * done, " " * (50 - done), percent + ) + ) sys.stdout.flush() break except Exception as ex: @@ -56,14 +67,21 @@ def dowload_with_progress(url, path): if os.path.exists(path): os.remove(path) else: - raise Exception("Cannot download dataset from {}, all retries exceeded".format(url)) + raise Exception( + "Cannot download dataset from {}, all retries exceeded".format(url) + ) sys.stdout.write("\n") logging.info("Downloading finished") + def unpack_to_clickhouse_directory(tar_path, clickhouse_path): - logging.info("Will unpack data from temp path %s to clickhouse db %s", tar_path, clickhouse_path) - with tarfile.open(tar_path, 'r') as comp_file: + logging.info( + "Will unpack data from temp path %s to clickhouse db %s", + tar_path, + clickhouse_path, + ) + with tarfile.open(tar_path, "r") as comp_file: comp_file.extractall(path=clickhouse_path) logging.info("Unpack finished") @@ -72,15 +90,21 @@ if __name__ == "__main__": logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser( - description="Simple tool for dowloading datasets for clickhouse from S3") + description="Simple tool for dowloading datasets for clickhouse from S3" + ) - parser.add_argument('--dataset-names', required=True, nargs='+', choices=list(AVAILABLE_DATASETS.keys())) - parser.add_argument('--url-prefix', default=DEFAULT_URL) - parser.add_argument('--clickhouse-data-path', default='/var/lib/clickhouse/') + parser.add_argument( + "--dataset-names", + required=True, + nargs="+", + choices=list(AVAILABLE_DATASETS.keys()), + ) + parser.add_argument("--url-prefix", default=DEFAULT_URL) + parser.add_argument("--clickhouse-data-path", default="/var/lib/clickhouse/") args = parser.parse_args() datasets = args.dataset_names - logging.info("Will fetch following datasets: %s", ', '.join(datasets)) + logging.info("Will fetch following datasets: %s", ", ".join(datasets)) for dataset in datasets: logging.info("Processing %s", dataset) temp_archive_path = _get_temp_file_name() @@ -92,10 +116,11 @@ if __name__ == "__main__": logging.info("Some exception occured %s", str(ex)) raise finally: - logging.info("Will remove downloaded file %s from filesystem if it exists", temp_archive_path) + logging.info( + "Will remove downloaded file %s from filesystem if it exists", + temp_archive_path, + ) if os.path.exists(temp_archive_path): os.remove(temp_archive_path) logging.info("Processing of %s finished", dataset) logging.info("Fetch finished, enjoy your tables!") - - diff 
--git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index ade59224035..e509809c028 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -170,6 +170,7 @@ if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]] fi rg -Fa "" /var/log/clickhouse-server/clickhouse-server.log ||: +rg -A50 -Fa "============" /var/log/clickhouse-server/stderr.log ||: zstd --threads=0 < /var/log/clickhouse-server/clickhouse-server.log > /test_output/clickhouse-server.log.zst & # Compress tables. diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index 314e9c2acfd..bfad2c9a7c5 100644 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -41,6 +41,9 @@ if [ "$is_tsan_build" -eq "0" ]; then export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000 export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000 export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000 + + export THREAD_FUZZER_EXPLICIT_SLEEP_PROBABILITY=0.01 + export THREAD_FUZZER_EXPLICIT_MEMORY_EXCEPTION_PROBABILITY=0.01 fi export ZOOKEEPER_FAULT_INJECTION=1 diff --git a/docker/test/style/Dockerfile b/docker/test/style/Dockerfile index e8c5e17024c..746cc7bb2d5 100644 --- a/docker/test/style/Dockerfile +++ b/docker/test/style/Dockerfile @@ -11,13 +11,14 @@ RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \ aspell \ curl \ git \ + file \ libxml2-utils \ moreutils \ python3-fuzzywuzzy \ python3-pip \ shellcheck \ yamllint \ - && pip3 install black==22.8.0 boto3 codespell==2.2.1 dohq-artifactory mypy PyGithub unidiff pylint==2.6.2 \ + && pip3 install black==23.1.0 boto3 codespell==2.2.1 dohq-artifactory mypy PyGithub unidiff pylint==2.6.2 \ && apt-get clean \ && rm -rf /root/.cache/pip diff --git a/docs/en/development/build.md b/docs/en/development/build.md index d52b018a5a7..804aa8a3dc5 100644 --- a/docs/en/development/build.md +++ b/docs/en/development/build.md @@ -159,4 +159,3 @@ The CI checks build the binaries on each commit to [ClickHouse](https://github.c 1. Find the type of package for your operating system that you need and download the files. ![build artifact check](images/find-build-artifact.png) - diff --git a/docs/en/development/building_and_benchmarking_deflate_qpl.md b/docs/en/development/building_and_benchmarking_deflate_qpl.md new file mode 100644 index 00000000000..60d6b1c7b76 --- /dev/null +++ b/docs/en/development/building_and_benchmarking_deflate_qpl.md @@ -0,0 +1,283 @@ +--- +slug: /en/development/building_and_benchmarking_deflate_qpl +sidebar_position: 73 +sidebar_label: Building and Benchmarking DEFLATE_QPL +description: How to build Clickhouse and run benchmark with DEFLATE_QPL Codec +--- +# Build Clickhouse with DEFLATE_QPL +- Make sure your target machine meet the QPL required [Prerequisites](https://intel.github.io/qpl/documentation/get_started_docs/installation.html#prerequisites) +- Pass the following flag to CMake when building ClickHouse, depending on the capabilities of your target machine: +``` bash +cmake -DENABLE_AVX2=1 -DENABLE_QPL=1 .. +``` +or +``` bash +cmake -DENABLE_AVX512=1 -DENABLE_QPL=1 .. 
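+# Optionally (not required by the build), you can check which of the two flags applies
+# by inspecting the CPU flags on the target machine first, for example:
+#   grep -m1 -o -E 'avx512f|avx2' /proc/cpuinfo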
+``` +- For generic build requirements, please refer to the ClickHouse [build instructions](/docs/en/development/build.md) + +# Run Benchmark with DEFLATE_QPL +## Files list +The folder `benchmark_sample` under [qpl-cmake](https://github.com/ClickHouse/ClickHouse/tree/master/contrib/qpl-cmake) gives an example of running the benchmark with Python scripts: + +`client_scripts` contains the Python scripts for running a typical benchmark, for example: +- `client_stressing_test.py`: The Python script for query stress testing with [1~4] server instances. +- `queries_ssb.sql`: The file listing all queries for the [Star Schema Benchmark](https://clickhouse.com/docs/en/getting-started/example-datasets/star-schema/) +- `allin1_ssb.sh`: This shell script executes the whole benchmark workflow automatically. + +`database_files` stores the database files according to the lz4/deflate/zstd codec. + +## Run benchmark automatically for Star Schema: +``` bash +$ cd ./benchmark_sample/client_scripts +$ sh run_ssb.sh +``` +After it completes, please check all the results in the folder `./output/`. + +In case of failure, please run the benchmark manually as described in the sections below. + +## Definition +[CLICKHOUSE_EXE] means the path to the ClickHouse executable. + +## Environment +- CPU: Sapphire Rapids +- OS requirements: refer to [System Requirements for QPL](https://intel.github.io/qpl/documentation/get_started_docs/installation.html#system-requirements) +- IAA setup: refer to [Accelerator Configuration](https://intel.github.io/qpl/documentation/get_started_docs/installation.html#accelerator-configuration) +- Install Python modules: +``` bash +pip3 install clickhouse_driver numpy +``` +[Self-check for IAA] +``` bash +$ accel-config list | grep -P 'iax|state' +``` +The expected output looks like this: +``` bash + "dev":"iax1", + "state":"enabled", + "state":"enabled", +``` +If there is no output, IAA is not ready to work. Please check the IAA setup again. + +## Generate raw data +``` bash +$ cd ./benchmark_sample +$ mkdir rawdata_dir && cd rawdata_dir +``` +Use [`dbgen`](https://clickhouse.com/docs/en/getting-started/example-datasets/star-schema) to generate 100 million rows of data with the parameter: +-s 20 + +The `*.tbl` files are expected to be output under `./benchmark_sample/rawdata_dir/ssb-dbgen`. + +## Database setup +Set up a database with the LZ4 codec: + +``` bash +$ cd ./database_dir/lz4 +$ [CLICKHOUSE_EXE] server -C config_lz4.xml >&/dev/null& +$ [CLICKHOUSE_EXE] client +``` +Here you should see the message `Connected to ClickHouse server` in the console, which means the client successfully set up a connection with the server. + +Complete the three steps below, as described in the [Star Schema Benchmark](https://clickhouse.com/docs/en/getting-started/example-datasets/star-schema): +- Creating tables in ClickHouse +- Inserting data. Use `./benchmark_sample/rawdata_dir/ssb-dbgen/*.tbl` as the input data.
+- Converting “star schema” to de-normalized “flat schema” + +Set up a database with the IAA Deflate codec: + +``` bash +$ cd ./database_dir/deflate +$ [CLICKHOUSE_EXE] server -C config_deflate.xml >&/dev/null& +$ [CLICKHOUSE_EXE] client +``` +Complete the same three steps as for LZ4 above. + +Set up a database with the ZSTD codec: + +``` bash +$ cd ./database_dir/zstd +$ [CLICKHOUSE_EXE] server -C config_zstd.xml >&/dev/null& +$ [CLICKHOUSE_EXE] client +``` +Complete the same three steps as for LZ4 above. + +[Self-check] +For each codec (lz4/zstd/deflate), please execute the query below to make sure the databases were created successfully: +```sql +select count() from lineorder_flat +``` +You are expected to see the following output: +```sql +┌───count()─┐ +│ 119994608 │ +└───────────┘ +``` +[Self-check for IAA Deflate codec] +The first time you execute an insertion or query from the client, the ClickHouse server console is expected to print this log: +```text +Hardware-assisted DeflateQpl codec is ready! +``` +If you never see it, but instead see the log below: +```text +Initialization of hardware-assisted DeflateQpl codec failed +``` +then the IAA devices are not ready, and you need to check the IAA setup again. + +## Benchmark with single instance +- Before starting the benchmark, please disable C6 and set the CPU frequency governor to `performance`: +``` bash +$ cpupower idle-set -d 3 +$ cpupower frequency-set -g performance +``` +- To eliminate the impact of memory access across sockets, we use `numactl` to bind the server to one socket and the client to another socket. +- Single instance means a single server connected to a single client. + +Now run the benchmark for LZ4/Deflate/ZSTD respectively: + +LZ4: +``` bash +$ cd ./database_dir/lz4 +$ numactl -m 0 -N 0 [CLICKHOUSE_EXE] server -C config_lz4.xml >&/dev/null& +$ cd ./client_scripts +$ numactl -m 1 -N 1 python3 client_stressing_test.py queries_ssb.sql 1 > lz4.log +``` + +IAA Deflate: +``` bash +$ cd ./database_dir/deflate +$ numactl -m 0 -N 0 [CLICKHOUSE_EXE] server -C config_deflate.xml >&/dev/null& +$ cd ./client_scripts +$ numactl -m 1 -N 1 python3 client_stressing_test.py queries_ssb.sql 1 > deflate.log +``` +ZSTD: +``` bash +$ cd ./database_dir/zstd +$ numactl -m 0 -N 0 [CLICKHOUSE_EXE] server -C config_zstd.xml >&/dev/null& +$ cd ./client_scripts +$ numactl -m 1 -N 1 python3 client_stressing_test.py queries_ssb.sql 1 > zstd.log +``` + +Three logs should now be output as expected: +```text +lz4.log +deflate.log +zstd.log +``` + +How to check performance metrics: + +We focus on QPS. Please search for the keyword `QPS_Final` and collect the statistics. + +## Benchmark with multi-instances +- To reduce the impact of being memory bound with too many threads, we recommend running the benchmark with multiple instances. +- Multi-instance means multiple (2 or 4) servers, each connected to its own client. +- The cores of one socket need to be divided equally and assigned to the servers. +- For multi-instances, you must create a new folder for each codec and insert the dataset following the same steps as for a single instance. + +There are 2 differences: +- On the client side, you need to launch clickhouse with the assigned port during table creation and data insertion. +- On the server side, you need to launch clickhouse with the specific xml config file in which the port has been assigned. All customized xml config files for multi-instances have been provided under ./server_config. + +Here we assume there are 60 cores per socket and take 2 instances as an example.
+Launch the server for the first instance. +LZ4: +``` bash +$ cd ./database_dir/lz4 +$ numactl -C 0-29,120-149 [CLICKHOUSE_EXE] server -C config_lz4.xml >&/dev/null& +``` +ZSTD: +``` bash +$ cd ./database_dir/zstd +$ numactl -C 0-29,120-149 [CLICKHOUSE_EXE] server -C config_zstd.xml >&/dev/null& +``` +IAA Deflate: +``` bash +$ cd ./database_dir/deflate +$ numactl -C 0-29,120-149 [CLICKHOUSE_EXE] server -C config_deflate.xml >&/dev/null& +``` +[Launch the server for the second instance] + +LZ4: +``` bash +$ cd ./database_dir && mkdir lz4_s2 && cd lz4_s2 +$ cp ../../server_config/config_lz4_s2.xml ./ +$ numactl -C 30-59,150-179 [CLICKHOUSE_EXE] server -C config_lz4_s2.xml >&/dev/null& +``` +ZSTD: +``` bash +$ cd ./database_dir && mkdir zstd_s2 && cd zstd_s2 +$ cp ../../server_config/config_zstd_s2.xml ./ +$ numactl -C 30-59,150-179 [CLICKHOUSE_EXE] server -C config_zstd_s2.xml >&/dev/null& +``` +IAA Deflate: +``` bash +$ cd ./database_dir && mkdir deflate_s2 && cd deflate_s2 +$ cp ../../server_config/config_deflate_s2.xml ./ +$ numactl -C 30-59,150-179 [CLICKHOUSE_EXE] server -C config_deflate_s2.xml >&/dev/null& +``` + +Creating tables and inserting data for the second instance: + +Creating tables: +``` bash +$ [CLICKHOUSE_EXE] client -m --port=9001 +``` +Inserting data: +``` bash +$ [CLICKHOUSE_EXE] client --query "INSERT INTO [TBL_FILE_NAME] FORMAT CSV" < [TBL_FILE_NAME].tbl --port=9001 +``` +- [TBL_FILE_NAME] represents the name of a file matching the pattern *.tbl under `./benchmark_sample/rawdata_dir/ssb-dbgen`. +- `--port=9001` stands for the port assigned to the server instance, which is also defined in config_lz4_s2.xml/config_zstd_s2.xml/config_deflate_s2.xml. For even more instances, you need to replace it with 9002/9003, which stand for the s3/s4 instances respectively. If you don't assign it, the port defaults to 9000, which is already used by the first instance. + +Benchmarking with 2 instances: + +LZ4: +``` bash +$ cd ./database_dir/lz4 +$ numactl -C 0-29,120-149 [CLICKHOUSE_EXE] server -C config_lz4.xml >&/dev/null& +$ cd ./database_dir/lz4_s2 +$ numactl -C 30-59,150-179 [CLICKHOUSE_EXE] server -C config_lz4_s2.xml >&/dev/null& +$ cd ./client_scripts +$ numactl -m 1 -N 1 python3 client_stressing_test.py queries_ssb.sql 2 > lz4_2insts.log +``` +ZSTD: +``` bash +$ cd ./database_dir/zstd +$ numactl -C 0-29,120-149 [CLICKHOUSE_EXE] server -C config_zstd.xml >&/dev/null& +$ cd ./database_dir/zstd_s2 +$ numactl -C 30-59,150-179 [CLICKHOUSE_EXE] server -C config_zstd_s2.xml >&/dev/null& +$ cd ./client_scripts +$ numactl -m 1 -N 1 python3 client_stressing_test.py queries_ssb.sql 2 > zstd_2insts.log +``` +IAA Deflate: +``` bash +$ cd ./database_dir/deflate +$ numactl -C 0-29,120-149 [CLICKHOUSE_EXE] server -C config_deflate.xml >&/dev/null& +$ cd ./database_dir/deflate_s2 +$ numactl -C 30-59,150-179 [CLICKHOUSE_EXE] server -C config_deflate_s2.xml >&/dev/null& +$ cd ./client_scripts +$ numactl -m 1 -N 1 python3 client_stressing_test.py queries_ssb.sql 2 > deflate_2insts.log +``` +Here the last argument `2` of client_stressing_test.py stands for the number of instances. For more instances, you need to replace it with 3 or 4. This script supports up to 4 instances. + +Three logs should now be output as expected: +``` text +lz4_2insts.log +deflate_2insts.log +zstd_2insts.log +``` +How to check performance metrics: + +We focus on QPS. Please search for the keyword `QPS_Final` and collect the statistics. + +The benchmark setup for 4 instances is similar to the 2-instance setup above; a sketch is shown below.
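+As a minimal sketch only - assuming the same 60-core-per-socket, hyper-threaded machine as above, and assuming the hypothetical `config_lz4_s3.xml`/`config_lz4_s4.xml` files from ./server_config (listening on the 9002/9003 ports mentioned above) have been copied into prepared `lz4_s3`/`lz4_s4` folders - the LZ4 case with 4 instances might look like this:
+``` bash
+# split socket 0 (cores 0-59 and their hyper-threading siblings 120-179) into four 15-core slices
+$ cd ./database_dir/lz4
+$ numactl -C 0-14,120-134 [CLICKHOUSE_EXE] server -C config_lz4.xml >&/dev/null&
+$ cd ../lz4_s2
+$ numactl -C 15-29,135-149 [CLICKHOUSE_EXE] server -C config_lz4_s2.xml >&/dev/null&
+$ cd ../lz4_s3
+$ numactl -C 30-44,150-164 [CLICKHOUSE_EXE] server -C config_lz4_s3.xml >&/dev/null&
+$ cd ../lz4_s4
+$ numactl -C 45-59,165-179 [CLICKHOUSE_EXE] server -C config_lz4_s4.xml >&/dev/null&
+$ cd ../../client_scripts
+$ numactl -m 1 -N 1 python3 client_stressing_test.py queries_ssb.sql 4 > lz4_4insts.log
+```
+The ZSTD and IAA Deflate cases would follow the same pattern with their respective config files.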
+We recommend using the 2-instance benchmark data as the final report for review. + +## Tips +Each time before launching a new clickhouse server, please make sure no background clickhouse process is running; check for and kill any old one: +``` bash +$ ps -aux| grep clickhouse +$ kill -9 [PID] +``` +By comparing the query list in ./client_scripts/queries_ssb.sql with the official [Star Schema Benchmark](https://clickhouse.com/docs/en/getting-started/example-datasets/star-schema), you will find that 3 queries are not included: Q1.2/Q1.3/Q3.4. This is because CPU utilization is very low (<10%) for these queries, which means they cannot demonstrate performance differences. diff --git a/docs/en/development/tests.md b/docs/en/development/tests.md index eb424ee7cbe..98dbe5f8d57 100644 --- a/docs/en/development/tests.md +++ b/docs/en/development/tests.md @@ -71,7 +71,7 @@ SELECT 1 | `global` | Same as `shard`. Prefer `shard` || | `zookeeper` | Test requires Zookeeper or ClickHouse Keeper to run | Test uses `ReplicatedMergeTree` | | `replica` | Same as `zookeeper`. Prefer `zookeeper` || -| `no-fasttest`| Test is not run under [Fast test](continuous-integration#fast-test) | Test uses `MySQL` table engine which is disabled in Fast test| +| `no-fasttest`| Test is not run under [Fast test](continuous-integration.md#fast-test) | Test uses `MySQL` table engine which is disabled in Fast test| | `no-[asan, tsan, msan, ubsan]` | Disables tests in build with [sanitizers](#sanitizers) | Test is run under QEMU which doesn't work with sanitizers | | `no-replicated-database` ||| | `no-ordinary-database` ||| diff --git a/docs/en/engines/_category_.yml b/docs/en/engines/_category_.yml index 8c6ba12c6f1..2aa5df72955 100644 --- a/docs/en/engines/_category_.yml +++ b/docs/en/engines/_category_.yml @@ -4,5 +4,4 @@ collapsible: true collapsed: true link: type: generated-index - title: Database & Table Engines slug: /en/engines diff --git a/docs/en/engines/table-engines/integrations/mysql.md b/docs/en/engines/table-engines/integrations/mysql.md index e00347c3163..4b285ee80a5 100644 --- a/docs/en/engines/table-engines/integrations/mysql.md +++ b/docs/en/engines/table-engines/integrations/mysql.md @@ -180,4 +180,4 @@ Default value: `300`.
## See Also {#see-also} - [The mysql table function](../../../sql-reference/table-functions/mysql.md) -- [Using MySQL as a dictionary source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-mysql) +- [Using MySQL as a dictionary source](../../../sql-reference/dictionaries/index.md#dictionary-sources#dicts-external_dicts_dict_sources-mysql) diff --git a/docs/en/engines/table-engines/integrations/odbc.md b/docs/en/engines/table-engines/integrations/odbc.md index aabc37442f9..8aac9dc3af0 100644 --- a/docs/en/engines/table-engines/integrations/odbc.md +++ b/docs/en/engines/table-engines/integrations/odbc.md @@ -126,5 +126,5 @@ SELECT * FROM odbc_t ## See Also {#see-also} -- [ODBC dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-odbc) +- [ODBC dictionaries](../../../sql-reference/dictionaries/index.md#dictionary-sources#dicts-external_dicts_dict_sources-odbc) - [ODBC table function](../../../sql-reference/table-functions/odbc.md) diff --git a/docs/en/engines/table-engines/integrations/postgresql.md b/docs/en/engines/table-engines/integrations/postgresql.md index b73d28c8508..18e884f3bcc 100644 --- a/docs/en/engines/table-engines/integrations/postgresql.md +++ b/docs/en/engines/table-engines/integrations/postgresql.md @@ -174,7 +174,7 @@ CREATE TABLE pg_table_schema_with_dots (a UInt32) **See Also** - [The `postgresql` table function](../../../sql-reference/table-functions/postgresql.md) -- [Using PostgreSQL as a dictionary source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) +- [Using PostgreSQL as a dictionary source](../../../sql-reference/dictionaries/index.md#dictionary-sources#dicts-external_dicts_dict_sources-postgresql) ## Related content - Blog: [ClickHouse and PostgreSQL - a match made in data heaven - part 1](https://clickhouse.com/blog/migrating-data-between-clickhouse-postgres) diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md index 723425429a5..dd843945e10 100644 --- a/docs/en/engines/table-engines/integrations/s3.md +++ b/docs/en/engines/table-engines/integrations/s3.md @@ -150,6 +150,7 @@ The following settings can be specified in configuration file for given endpoint - `use_environment_credentials` — If set to `true`, S3 client will try to obtain credentials from environment variables and [Amazon EC2](https://en.wikipedia.org/wiki/Amazon_Elastic_Compute_Cloud) metadata for given endpoint. Optional, default value is `false`. - `region` — Specifies S3 region name. Optional. - `use_insecure_imds_request` — If set to `true`, S3 client will use insecure IMDS request while obtaining credentials from Amazon EC2 metadata. Optional, default value is `false`. +- `expiration_window_seconds` — Grace period for checking if expiration-based credentials have expired. Optional, default value is `120`. - `header` — Adds specified HTTP header to a request to given endpoint. Optional, can be specified multiple times. - `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set. Optional. - `max_single_read_retries` — The maximum number of attempts during single read. Default value is `4`. Optional. 
@@ -166,6 +167,7 @@ The following settings can be specified in configuration file for given endpoint + diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index fc8060077b0..f1b7a40094d 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -377,8 +377,9 @@ CREATE TABLE table_name i32 Int32, s String, ... - INDEX a (u64 * i32, s) TYPE minmax GRANULARITY 3, - INDEX b (u64 * length(s)) TYPE set(1000) GRANULARITY 4 + INDEX idx1 u64 TYPE bloom_filter GRANULARITY 3, + INDEX idx2 u64 * i32 TYPE minmax GRANULARITY 3, + INDEX idx3 u64 * length(s) TYPE set(1000) GRANULARITY 4 ) ENGINE = MergeTree() ... ``` @@ -386,8 +387,25 @@ CREATE TABLE table_name Indices from the example can be used by ClickHouse to reduce the amount of data to read from disk in the following queries: ``` sql -SELECT count() FROM table WHERE s < 'z' -SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 +SELECT count() FROM table WHERE u64 == 10; +SELECT count() FROM table WHERE u64 * i32 >= 1234 +SELECT count() FROM table WHERE u64 * length(s) == 1234 +``` + +Data skipping indexes can also be created on composite columns: + +```sql +-- on columns of type Map: +INDEX map_key_index mapKeys(map_column) TYPE bloom_filter +INDEX map_value_index mapValues(map_column) TYPE bloom_filter + +-- on columns of type Tuple: +INDEX tuple_1_index tuple_column.1 TYPE bloom_filter +INDEX tuple_2_index tuple_column.2 TYPE bloom_filter + +-- on columns of type Nested: +INDEX nested_1_index col.nested_col1 TYPE bloom_filter +INDEX nested_2_index col.nested_col2 TYPE bloom_filter ``` ### Available Types of Indices {#available-types-of-indices} @@ -432,20 +450,6 @@ Syntax: `tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, ran - An experimental index to support approximate nearest neighbor (ANN) search. See [here](annindexes.md) for details. - An experimental inverted index to support full-text search. See [here](invertedindexes.md) for details. -## Example of index creation for Map data type - -``` -INDEX map_key_index mapKeys(map_column) TYPE bloom_filter GRANULARITY 1 -INDEX map_key_index mapValues(map_column) TYPE bloom_filter GRANULARITY 1 -``` - - -``` sql -INDEX sample_index (u64 * length(s)) TYPE minmax GRANULARITY 4 -INDEX sample_index2 (u64 * length(str), i32 + f64 * 100, date, str) TYPE set(100) GRANULARITY 4 -INDEX sample_index3 (lower(str), str) TYPE ngrambf_v1(3, 256, 2, 0) GRANULARITY 4 -``` - ### Functions Support {#functions-support} Conditions in the `WHERE` clause contains calls of the functions that operate with columns. If the column is a part of an index, ClickHouse tries to use this index when performing the functions. ClickHouse supports different subsets of functions for using indexes. @@ -901,7 +905,7 @@ User can assign new big parts to different disks of a [JBOD](https://en.wikipedi ## Using S3 for Data Storage {#table_engine-mergetree-s3} :::note -Google Cloud Storage (GCS) is also supported using the type `s3`. See [GCS backed MergeTree](/docs/en/integrations/data-ingestion/s3/gcs-merge-tree.md). +Google Cloud Storage (GCS) is also supported using the type `s3`. See [GCS backed MergeTree](/docs/en/integrations/gcs). ::: `MergeTree` family table engines can store data to [S3](https://aws.amazon.com/s3/) using a disk with type `s3`. 
@@ -960,6 +964,7 @@ Optional parameters: - `support_batch_delete` — This controls the check to see if batch deletes are supported. Set this to `false` when using Google Cloud Storage (GCS) as GCS does not support batch deletes and preventing the checks will prevent error messages in the logs. - `use_environment_credentials` — Reads AWS credentials from the Environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN if they exist. Default value is `false`. - `use_insecure_imds_request` — If set to `true`, S3 client will use insecure IMDS request while obtaining credentials from Amazon EC2 metadata. Default value is `false`. +- `expiration_window_seconds` — Grace period for checking if expiration-based credentials have expired. Optional, default value is `120`. - `proxy` — Proxy configuration for S3 endpoint. Each `uri` element inside `proxy` block should contain a proxy URL. - `connect_timeout_ms` — Socket connect timeout in milliseconds. Default value is `10 seconds`. - `request_timeout_ms` — Request timeout in milliseconds. Default value is `5 seconds`. diff --git a/docs/en/engines/table-engines/mergetree-family/replication.md b/docs/en/engines/table-engines/mergetree-family/replication.md index 37ab8ac9fd3..c50433f2aeb 100644 --- a/docs/en/engines/table-engines/mergetree-family/replication.md +++ b/docs/en/engines/table-engines/mergetree-family/replication.md @@ -39,7 +39,7 @@ Compressed data for `INSERT` and `ALTER` queries is replicated (for more informa - The `DROP TABLE` query deletes the replica located on the server where the query is run. - The `RENAME` query renames the table on one of the replicas. In other words, replicated tables can have different names on different replicas. -ClickHouse uses [ClickHouse Keeper](/docs/en/guides/sre/keeper/clickhouse-keeper.md) for storing replicas meta information. It is possible to use ZooKeeper version 3.4.5 or newer, but ClickHouse Keeper is recommended. +ClickHouse uses [ClickHouse Keeper](/docs/en/guides/sre/keeper/index.md) for storing replicas meta information. It is possible to use ZooKeeper version 3.4.5 or newer, but ClickHouse Keeper is recommended. To use replication, set parameters in the [zookeeper](/docs/en/operations/server-configuration-parameters/settings.md/#server-settings_zookeeper) server configuration section. @@ -144,7 +144,7 @@ ENGINE = ReplicatedReplacingMergeTree The `Replicated` prefix is added to the table engine name. For example:`ReplicatedMergeTree`. :::tip -Adding `Replicated` is optional in ClickHouse Cloud, as all of the tables are replicated. +Adding `Replicated` is optional in ClickHouse Cloud, as all of the tables are replicated. ::: ### Replicated\*MergeTree parameters diff --git a/docs/en/engines/table-engines/special/dictionary.md b/docs/en/engines/table-engines/special/dictionary.md index e487ca2002f..05d07d94e56 100644 --- a/docs/en/engines/table-engines/special/dictionary.md +++ b/docs/en/engines/table-engines/special/dictionary.md @@ -6,7 +6,7 @@ sidebar_label: Dictionary # Dictionary Table Engine -The `Dictionary` engine displays the [dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) data as a ClickHouse table. +The `Dictionary` engine displays the [dictionary](../../../sql-reference/dictionaries/index.md) data as a ClickHouse table. 
## Example {#example} diff --git a/docs/en/engines/table-engines/special/distributed.md b/docs/en/engines/table-engines/special/distributed.md index f4f541843d3..52d82483a46 100644 --- a/docs/en/engines/table-engines/special/distributed.md +++ b/docs/en/engines/table-engines/special/distributed.md @@ -184,7 +184,7 @@ The parameters `host`, `port`, and optionally `user`, `password`, `secure`, `com - `host` – The address of the remote server. You can use either the domain or the IPv4 or IPv6 address. If you specify the domain, the server makes a DNS request when it starts, and the result is stored as long as the server is running. If the DNS request fails, the server does not start. If you change the DNS record, restart the server. - `port` – The TCP port for messenger activity (`tcp_port` in the config, usually set to 9000). Not to be confused with `http_port`. -- `user` – Name of the user for connecting to a remote server. Default value is the `default` user. This user must have access to connect to the specified server. Access is configured in the `users.xml` file. For more information, see the section [Access rights](../../../operations/access-rights.md). +- `user` – Name of the user for connecting to a remote server. Default value is the `default` user. This user must have access to connect to the specified server. Access is configured in the `users.xml` file. For more information, see the section [Access rights](../../../guides/sre/user-management/index.md). - `password` – The password for connecting to a remote server (not masked). Default value: empty string. - `secure` - Whether to use a secure SSL/TLS connection. Usually also requires specifying the port (the default secure port is `9440`). The server should listen on `9440` and be configured with correct certificates. - `compression` - Use data compression. Default value: `true`. 
diff --git a/docs/en/engines/table-engines/special/generate.md b/docs/en/engines/table-engines/special/generate.md index 77d90082ddc..714afe3c3b5 100644 --- a/docs/en/engines/table-engines/special/generate.md +++ b/docs/en/engines/table-engines/special/generate.md @@ -15,7 +15,7 @@ Usage examples: ## Usage in ClickHouse Server {#usage-in-clickhouse-server} ``` sql -ENGINE = GenerateRandom([random_seed] [,max_string_length] [,max_array_length]) +ENGINE = GenerateRandom([random_seed [,max_string_length [,max_array_length]]]) ``` The `max_array_length` and `max_string_length` parameters specify maximum length of all diff --git a/docs/en/getting-started/example-datasets/_category_.yml b/docs/en/getting-started/example-datasets/_category_.yml deleted file mode 100644 index 2ee34c63e93..00000000000 --- a/docs/en/getting-started/example-datasets/_category_.yml +++ /dev/null @@ -1,7 +0,0 @@ -position: 1 -label: 'Example Datasets' -collapsible: true -collapsed: true -link: - type: doc - id: en/getting-started/example-datasets/ diff --git a/docs/en/getting-started/example-datasets/cell-towers.md b/docs/en/getting-started/example-datasets/cell-towers.md index b19d09c777a..d88ce5159d4 100644 --- a/docs/en/getting-started/example-datasets/cell-towers.md +++ b/docs/en/getting-started/example-datasets/cell-towers.md @@ -1,9 +1,10 @@ --- slug: /en/getting-started/example-datasets/cell-towers -sidebar_label: Cell Towers +sidebar_label: Geo Data sidebar_position: 3 -title: "Cell Towers" +title: "Geo Data using the Cell Tower Dataset" --- + import ConnectionDetails from '@site/docs/en/_snippets/_gather_your_details_http.mdx'; import Tabs from '@theme/Tabs'; @@ -163,7 +164,7 @@ SELECT mcc, count() FROM cell_towers GROUP BY mcc ORDER BY count() DESC LIMIT 10 Based on the above query and the [MCC list](https://en.wikipedia.org/wiki/Mobile_country_code), the countries with the most cell towers are: the USA, Germany, and Russia. -You may want to create a [Dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) in ClickHouse to decode these values. +You may want to create a [Dictionary](../../sql-reference/dictionaries/index.md) in ClickHouse to decode these values. ## Use case: Incorporate geo data {#use-case} diff --git a/docs/en/getting-started/example-datasets/covid19.md b/docs/en/getting-started/example-datasets/covid19.md new file mode 100644 index 00000000000..9482e8870d2 --- /dev/null +++ b/docs/en/getting-started/example-datasets/covid19.md @@ -0,0 +1,265 @@ +--- +slug: /en/getting-started/example-datasets/covid19 +sidebar_label: COVID-19 Open-Data +--- + +# COVID-19 Open-Data + +COVID-19 Open-Data attempts to assemble the largest Covid-19 epidemiological database, in addition to a powerful set of expansive covariates. It includes open, publicly sourced, licensed data relating to demographics, economy, epidemiology, geography, health, hospitalizations, mobility, government response, weather, and more. + +The details are in GitHub [here](https://github.com/GoogleCloudPlatform/covid-19-open-data). + +It's easy to insert this data into ClickHouse... + +:::note +The following commands were executed on a **Production** instance of [ClickHouse Cloud](https://clickhouse.cloud). You can easily run them on a local install as well. +::: + +1. 
Let's see what the data looks like: + +```sql +DESCRIBE url( + 'https://storage.googleapis.com/covid19-open-data/v3/epidemiology.csv', + 'CSVWithNames' +); +``` + +The CSV file has 10 columns: + +```response +┌─name─────────────────┬─type─────────────┠+│ date │ Nullable(String) │ +│ location_key │ Nullable(String) │ +│ new_confirmed │ Nullable(Int64) │ +│ new_deceased │ Nullable(Int64) │ +│ new_recovered │ Nullable(Int64) │ +│ new_tested │ Nullable(Int64) │ +│ cumulative_confirmed │ Nullable(Int64) │ +│ cumulative_deceased │ Nullable(Int64) │ +│ cumulative_recovered │ Nullable(Int64) │ +│ cumulative_tested │ Nullable(Int64) │ +└──────────────────────┴──────────────────┘ + +10 rows in set. Elapsed: 0.745 sec. +``` + +2. Now let's view some of the rows: + +```sql +SELECT * +FROM url('https://storage.googleapis.com/covid19-open-data/v3/epidemiology.csv') +LIMIT 100; +``` + +Notice the `url` function easily reads data from a CSV file: + +```response +┌─c1─────────┬─c2───────────┬─c3────────────┬─c4───────────┬─c5────────────┬─c6─────────┬─c7───────────────────┬─c8──────────────────┬─c9───────────────────┬─c10───────────────┠+│ date │ location_key │ new_confirmed │ new_deceased │ new_recovered │ new_tested │ cumulative_confirmed │ cumulative_deceased │ cumulative_recovered │ cumulative_tested │ +│ 2020-04-03 │ AD │ 24 │ 1 │ á´ºáµá´¸á´¸ │ á´ºáµá´¸á´¸ │ 466 │ 17 │ á´ºáµá´¸á´¸ │ á´ºáµá´¸á´¸ │ +│ 2020-04-04 │ AD │ 57 │ 0 │ á´ºáµá´¸á´¸ │ á´ºáµá´¸á´¸ │ 523 │ 17 │ á´ºáµá´¸á´¸ │ á´ºáµá´¸á´¸ │ +│ 2020-04-05 │ AD │ 17 │ 4 │ á´ºáµá´¸á´¸ │ á´ºáµá´¸á´¸ │ 540 │ 21 │ á´ºáµá´¸á´¸ │ á´ºáµá´¸á´¸ │ +│ 2020-04-06 │ AD │ 11 │ 1 │ á´ºáµá´¸á´¸ │ á´ºáµá´¸á´¸ │ 551 │ 22 │ á´ºáµá´¸á´¸ │ á´ºáµá´¸á´¸ │ +│ 2020-04-07 │ AD │ 15 │ 2 │ á´ºáµá´¸á´¸ │ á´ºáµá´¸á´¸ │ 566 │ 24 │ á´ºáµá´¸á´¸ │ á´ºáµá´¸á´¸ │ +│ 2020-04-08 │ AD │ 23 │ 2 │ á´ºáµá´¸á´¸ │ á´ºáµá´¸á´¸ │ 589 │ 26 │ á´ºáµá´¸á´¸ │ á´ºáµá´¸á´¸ │ +└────────────┴──────────────┴───────────────┴──────────────┴───────────────┴────────────┴──────────────────────┴─────────────────────┴──────────────────────┴───────────────────┘ +``` + +3. We will create a table now that we know what the data looks like: + +```sql +CREATE TABLE covid19 ( + date Date, + location_key LowCardinality(String), + new_confirmed Int32, + new_deceased Int32, + new_recovered Int32, + new_tested Int32, + cumulative_confirmed Int32, + cumulative_deceased Int32, + cumulative_recovered Int32, + cumulative_tested Int32 +) +ENGINE = MergeTree +ORDER BY (location_key, date); +``` + +4. The following command inserts the entire dataset into the `covid19` table: + +```sql +INSERT INTO covid19 + SELECT * + FROM + url( + 'https://storage.googleapis.com/covid19-open-data/v3/epidemiology.csv', + CSVWithNames, + 'date Date, + location_key LowCardinality(String), + new_confirmed Int32, + new_deceased Int32, + new_recovered Int32, + new_tested Int32, + cumulative_confirmed Int32, + cumulative_deceased Int32, + cumulative_recovered Int32, + cumulative_tested Int32' + ); +``` + +5. It goes pretty quick - let's see how many rows were inserted: + +```sql +SELECT formatReadableQuantity(count()) +FROM covid19; +``` + +```response +┌─formatReadableQuantity(count())─┠+│ 12.53 million │ +└─────────────────────────────────┘ +``` + +6. Let's see how many total cases of Covid-19 were recorded: + +```sql +SELECT formatReadableQuantity(sum(new_confirmed)) +FROM covid19; +``` + +```response +┌─formatReadableQuantity(sum(new_confirmed))─┠+│ 1.39 billion │ +└────────────────────────────────────────────┘ +``` + +7. 
You will notice the data has a lot of 0's for dates - either weekends or days where numbers were not reported each day. We can use a window function to smooth out the daily averages of new cases: + +```sql +SELECT + AVG(new_confirmed) OVER (PARTITION BY location_key ORDER BY date ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING) AS cases_smoothed, + new_confirmed, + location_key, + date +FROM covid19; +``` + +8. This query determines the latest values for each location. We can't use `max(date)` because not all countries reported every day, so we grab the last row using `ROW_NUMBER`: + +```sql +WITH latest_deaths_data AS + ( SELECT location_key, + date, + new_deceased, + new_confirmed, + ROW_NUMBER() OVER (PARTITION BY location_key ORDER BY date DESC) as rn + FROM covid19) +SELECT location_key, + date, + new_deceased, + new_confirmed, + rn +FROM latest_deaths_data +WHERE rn=1; +``` + +9. We can use `lagInFrame` to determine the `LAG` of new cases each day. In this query we filter by the `US_DC` location: + +```sql +SELECT + new_confirmed - lagInFrame(new_confirmed,1) OVER (PARTITION BY location_key ORDER BY date) AS confirmed_cases_delta, + new_confirmed, + location_key, + date +FROM covid19 +WHERE location_key = 'US_DC'; +``` + +The response look like: + +```response +┌─confirmed_cases_delta─┬─new_confirmed─┬─location_key─┬───────date─┠+│ 0 │ 0 │ US_DC │ 2020-03-08 │ +│ 2 │ 2 │ US_DC │ 2020-03-09 │ +│ -2 │ 0 │ US_DC │ 2020-03-10 │ +│ 6 │ 6 │ US_DC │ 2020-03-11 │ +│ -6 │ 0 │ US_DC │ 2020-03-12 │ +│ 0 │ 0 │ US_DC │ 2020-03-13 │ +│ 6 │ 6 │ US_DC │ 2020-03-14 │ +│ -5 │ 1 │ US_DC │ 2020-03-15 │ +│ 4 │ 5 │ US_DC │ 2020-03-16 │ +│ 4 │ 9 │ US_DC │ 2020-03-17 │ +│ -1 │ 8 │ US_DC │ 2020-03-18 │ +│ 24 │ 32 │ US_DC │ 2020-03-19 │ +│ -26 │ 6 │ US_DC │ 2020-03-20 │ +│ 15 │ 21 │ US_DC │ 2020-03-21 │ +│ -3 │ 18 │ US_DC │ 2020-03-22 │ +│ 3 │ 21 │ US_DC │ 2020-03-23 │ +``` + +10. 
This query calculates the percentage of change in new cases each day, and includes a simple `increase` or `decrease` column in the result set: + +```sql +WITH confirmed_lag AS ( + SELECT + *, + lagInFrame(new_confirmed) OVER( + PARTITION BY location_key + ORDER BY date + ) AS confirmed_previous_day + FROM covid19 +), +confirmed_percent_change AS ( + SELECT + *, + COALESCE(ROUND((new_confirmed - confirmed_previous_day) / confirmed_previous_day * 100), 0) AS percent_change + FROM confirmed_lag +) +SELECT + date, + new_confirmed, + percent_change, + CASE + WHEN percent_change > 0 THEN 'increase' + WHEN percent_change = 0 THEN 'no change' + ELSE 'decrease' + END AS trend +FROM confirmed_percent_change +WHERE location_key = 'US_DC'; +``` + +The results look like + +```response +┌───────date─┬─new_confirmed─┬─percent_change─┬─trend─────┠+│ 2020-03-08 │ 0 │ nan │ decrease │ +│ 2020-03-09 │ 2 │ inf │ increase │ +│ 2020-03-10 │ 0 │ -100 │ decrease │ +│ 2020-03-11 │ 6 │ inf │ increase │ +│ 2020-03-12 │ 0 │ -100 │ decrease │ +│ 2020-03-13 │ 0 │ nan │ decrease │ +│ 2020-03-14 │ 6 │ inf │ increase │ +│ 2020-03-15 │ 1 │ -83 │ decrease │ +│ 2020-03-16 │ 5 │ 400 │ increase │ +│ 2020-03-17 │ 9 │ 80 │ increase │ +│ 2020-03-18 │ 8 │ -11 │ decrease │ +│ 2020-03-19 │ 32 │ 300 │ increase │ +│ 2020-03-20 │ 6 │ -81 │ decrease │ +│ 2020-03-21 │ 21 │ 250 │ increase │ +│ 2020-03-22 │ 18 │ -14 │ decrease │ +│ 2020-03-23 │ 21 │ 17 │ increase │ +│ 2020-03-24 │ 46 │ 119 │ increase │ +│ 2020-03-25 │ 48 │ 4 │ increase │ +│ 2020-03-26 │ 36 │ -25 │ decrease │ +│ 2020-03-27 │ 37 │ 3 │ increase │ +│ 2020-03-28 │ 38 │ 3 │ increase │ +│ 2020-03-29 │ 59 │ 55 │ increase │ +│ 2020-03-30 │ 94 │ 59 │ increase │ +│ 2020-03-31 │ 91 │ -3 │ decrease │ +│ 2020-04-01 │ 67 │ -26 │ decrease │ +│ 2020-04-02 │ 104 │ 55 │ increase │ +│ 2020-04-03 │ 145 │ 39 │ increase │ +``` + +:::note +As mentioned in the [GitHub repo](https://github.com/GoogleCloudPlatform/covid-19-open-data), the datset is no longer updated as of September 15, 2022. 
+::: \ No newline at end of file diff --git a/docs/en/getting-started/example-datasets/criteo.md b/docs/en/getting-started/example-datasets/criteo.md index 3bd0230d4cc..a2e0fda0cb0 100644 --- a/docs/en/getting-started/example-datasets/criteo.md +++ b/docs/en/getting-started/example-datasets/criteo.md @@ -3,14 +3,56 @@ slug: /en/getting-started/example-datasets/criteo sidebar_label: Terabyte Click Logs from Criteo --- -# Terabyte of Click Logs from Criteo +# Terabyte of Click Logs from Criteo Download the data from http://labs.criteo.com/downloads/download-terabyte-click-logs/ Create a table to import the log to: ``` sql -CREATE TABLE criteo_log (date Date, clicked UInt8, int1 Int32, int2 Int32, int3 Int32, int4 Int32, int5 Int32, int6 Int32, int7 Int32, int8 Int32, int9 Int32, int10 Int32, int11 Int32, int12 Int32, int13 Int32, cat1 String, cat2 String, cat3 String, cat4 String, cat5 String, cat6 String, cat7 String, cat8 String, cat9 String, cat10 String, cat11 String, cat12 String, cat13 String, cat14 String, cat15 String, cat16 String, cat17 String, cat18 String, cat19 String, cat20 String, cat21 String, cat22 String, cat23 String, cat24 String, cat25 String, cat26 String) ENGINE = Log +CREATE TABLE criteo_log ( + date Date, + clicked UInt8, + int1 Int32, + int2 Int32, + int3 Int32, + int4 Int32, + int5 Int32, + int6 Int32, + int7 Int32, + int8 Int32, + int9 Int32, + int10 Int32, + int11 Int32, + int12 Int32, + int13 Int32, + cat1 String, + cat2 String, + cat3 String, + cat4 String, + cat5 String, + cat6 String, + cat7 String, + cat8 String, + cat9 String, + cat10 String, + cat11 String, + cat12 String, + cat13 String, + cat14 String, + cat15 String, + cat16 String, + cat17 String, + cat18 String, + cat19 String, + cat20 String, + cat21 String, + cat22 String, + cat23 String, + cat24 String, + cat25 String, + cat26 String +) ENGINE = Log; ``` Download the data: @@ -73,7 +115,52 @@ ORDER BY (date, icat1) Transform data from the raw log and put it in the second table: ``` sql -INSERT INTO criteo SELECT date, clicked, int1, int2, int3, int4, int5, int6, int7, int8, int9, int10, int11, int12, int13, reinterpretAsUInt32(unhex(cat1)) AS icat1, reinterpretAsUInt32(unhex(cat2)) AS icat2, reinterpretAsUInt32(unhex(cat3)) AS icat3, reinterpretAsUInt32(unhex(cat4)) AS icat4, reinterpretAsUInt32(unhex(cat5)) AS icat5, reinterpretAsUInt32(unhex(cat6)) AS icat6, reinterpretAsUInt32(unhex(cat7)) AS icat7, reinterpretAsUInt32(unhex(cat8)) AS icat8, reinterpretAsUInt32(unhex(cat9)) AS icat9, reinterpretAsUInt32(unhex(cat10)) AS icat10, reinterpretAsUInt32(unhex(cat11)) AS icat11, reinterpretAsUInt32(unhex(cat12)) AS icat12, reinterpretAsUInt32(unhex(cat13)) AS icat13, reinterpretAsUInt32(unhex(cat14)) AS icat14, reinterpretAsUInt32(unhex(cat15)) AS icat15, reinterpretAsUInt32(unhex(cat16)) AS icat16, reinterpretAsUInt32(unhex(cat17)) AS icat17, reinterpretAsUInt32(unhex(cat18)) AS icat18, reinterpretAsUInt32(unhex(cat19)) AS icat19, reinterpretAsUInt32(unhex(cat20)) AS icat20, reinterpretAsUInt32(unhex(cat21)) AS icat21, reinterpretAsUInt32(unhex(cat22)) AS icat22, reinterpretAsUInt32(unhex(cat23)) AS icat23, reinterpretAsUInt32(unhex(cat24)) AS icat24, reinterpretAsUInt32(unhex(cat25)) AS icat25, reinterpretAsUInt32(unhex(cat26)) AS icat26 FROM criteo_log; +INSERT INTO + criteo +SELECT + date, + clicked, + int1, + int2, + int3, + int4, + int5, + int6, + int7, + int8, + int9, + int10, + int11, + int12, + int13, + reinterpretAsUInt32(unhex(cat1)) AS icat1, + reinterpretAsUInt32(unhex(cat2)) AS icat2, + 
reinterpretAsUInt32(unhex(cat3)) AS icat3, + reinterpretAsUInt32(unhex(cat4)) AS icat4, + reinterpretAsUInt32(unhex(cat5)) AS icat5, + reinterpretAsUInt32(unhex(cat6)) AS icat6, + reinterpretAsUInt32(unhex(cat7)) AS icat7, + reinterpretAsUInt32(unhex(cat8)) AS icat8, + reinterpretAsUInt32(unhex(cat9)) AS icat9, + reinterpretAsUInt32(unhex(cat10)) AS icat10, + reinterpretAsUInt32(unhex(cat11)) AS icat11, + reinterpretAsUInt32(unhex(cat12)) AS icat12, + reinterpretAsUInt32(unhex(cat13)) AS icat13, + reinterpretAsUInt32(unhex(cat14)) AS icat14, + reinterpretAsUInt32(unhex(cat15)) AS icat15, + reinterpretAsUInt32(unhex(cat16)) AS icat16, + reinterpretAsUInt32(unhex(cat17)) AS icat17, + reinterpretAsUInt32(unhex(cat18)) AS icat18, + reinterpretAsUInt32(unhex(cat19)) AS icat19, + reinterpretAsUInt32(unhex(cat20)) AS icat20, + reinterpretAsUInt32(unhex(cat21)) AS icat21, + reinterpretAsUInt32(unhex(cat22)) AS icat22, + reinterpretAsUInt32(unhex(cat23)) AS icat23, + reinterpretAsUInt32(unhex(cat24)) AS icat24, + reinterpretAsUInt32(unhex(cat25)) AS icat25, + reinterpretAsUInt32(unhex(cat26)) AS icat26 +FROM + criteo_log; DROP TABLE criteo_log; ``` diff --git a/docs/en/getting-started/example-datasets/github.md b/docs/en/getting-started/example-datasets/github.md index 239637a34e9..e18c7dec1a6 100644 --- a/docs/en/getting-started/example-datasets/github.md +++ b/docs/en/getting-started/example-datasets/github.md @@ -1,12 +1,13 @@ --- slug: /en/getting-started/example-datasets/github -sidebar_label: GitHub Repo Analysis +sidebar_label: Github Repo +sidebar_position: 1 description: Analyze the ClickHouse GitHub repo or any repository of your choosing --- -# ClickHouse GitHub data +# Writing Queries in ClickHouse using GitHub Data -This dataset contains all of the commits and changes for the ClickHouse repository. It can be generated using the native `git-import` tool distributed with ClickHouse. +This dataset contains all of the commits and changes for the ClickHouse repository. It can be generated using the native `git-import` tool distributed with ClickHouse. The generated data provides a `tsv` file for each of the following tables: @@ -323,7 +324,7 @@ Note a more complex variant of this query exists where we find the [line-by-line ## Find the current active files -This is important for later analysis when we only want to consider the current files in the repository. We estimate this set as the files which haven't been renamed or deleted (and then re-added/re-named). +This is important for later analysis when we only want to consider the current files in the repository. We estimate this set as the files which haven't been renamed or deleted (and then re-added/re-named). **Note there appears to have been a broken commit history in relation to files under the `dbms`, `libs`, `tests/testflows/` directories during their renames. We also thus exclude these.** @@ -417,7 +418,7 @@ git ls-files | grep -v -E 'generated\.cpp|^(contrib|docs?|website|libs/(libcityh The difference here is caused by a few factors: -- A rename can occur alongside other modifications to the file. These are listed as separate events in file_changes but with the same time. The `argMax` function has no way of distinguishing these - it picks the first value. The natural ordering of the inserts (the only means of knowing the correct order) is not maintained across the union so modified events can be selected. 
For example, below the `src/Functions/geometryFromColumn.h` file has several modifications before being renamed to `src/Functions/geometryConverters.h`. Our current solution may pick a Modify event as the latest change causing `src/Functions/geometryFromColumn.h` to be retained. +- A rename can occur alongside other modifications to the file. These are listed as separate events in file_changes but with the same time. The `argMax` function has no way of distinguishing these - it picks the first value. The natural ordering of the inserts (the only means of knowing the correct order) is not maintained across the union so modified events can be selected. For example, below the `src/Functions/geometryFromColumn.h` file has several modifications before being renamed to `src/Functions/geometryConverters.h`. Our current solution may pick a Modify event as the latest change causing `src/Functions/geometryFromColumn.h` to be retained. [play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICAgIGNoYW5nZV90eXBlLAogICAgICBwYXRoLAogICAgICBvbGRfcGF0aCwKICAgICAgdGltZSwKICAgICAgY29tbWl0X2hhc2gKICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogIFdIRVJFIChwYXRoID0gJ3NyYy9GdW5jdGlvbnMvZ2VvbWV0cnlGcm9tQ29sdW1uLmgnKSBPUiAob2xkX3BhdGggPSAnc3JjL0Z1bmN0aW9ucy9nZW9tZXRyeUZyb21Db2x1bW4uaCcpCg==) @@ -1386,7 +1387,7 @@ LIMIT 1 BY day_of_week 7 rows in set. Elapsed: 0.004 sec. Processed 21.82 thousand rows, 140.02 KB (4.88 million rows/s., 31.29 MB/s.) ``` -This is still a little simple and doesn't reflect people's work. +This is still a little simple and doesn't reflect people's work. A better metric might be who is the top contributor each day as a fraction of the total work performed in the last year. Note that we treat the deletion and adding code equally. @@ -1952,7 +1953,7 @@ SELECT Most contributors write more code than tests, as you'd expect. -What about who adds the most comments when contributing code? +What about who adds the most comments when contributing code? [play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhdXRob3IsCiAgICBhdmcocmF0aW9fY29tbWVudHMpIEFTIGF2Z19yYXRpb19jb21tZW50cywKICAgIHN1bShjb2RlKSBBUyBjb2RlCkZST00KKAogICAgU0VMRUNUCiAgICAgICAgYXV0aG9yLAogICAgICAgIGNvbW1pdF9oYXNoLAogICAgICAgIGNvdW50SWYobGluZV90eXBlID0gJ0NvbW1lbnQnKSBBUyBjb21tZW50cywKICAgICAgICBjb3VudElmKGxpbmVfdHlwZSA9ICdDb2RlJykgQVMgY29kZSwKICAgICAgICBpZihjb21tZW50cyA+IDAsIGNvbW1lbnRzIC8gKGNvbW1lbnRzICsgY29kZSksIDApIEFTIHJhdGlvX2NvbW1lbnRzCiAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgR1JPVVAgQlkKICAgICAgICBhdXRob3IsCiAgICAgICAgY29tbWl0X2hhc2gKKQpHUk9VUCBCWSBhdXRob3IKT1JERVIgQlkgY29kZSBERVNDCkxJTUlUIDEwCg==) @@ -2393,7 +2394,7 @@ WHERE (path = 'src/Storages/StorageReplicatedMergeTree.cpp') AND (change_type = This makes viewing the full history of a file challenging since we don't have a single value connecting all line or file changes. -To address this, we can use User Defined Functions (UDFs). These cannot, currently, be recursive, so to identify the history of a file we must define a series of UDFs which call each other explicitly. +To address this, we can use User Defined Functions (UDFs). These cannot, currently, be recursive, so to identify the history of a file we must define a series of UDFs which call each other explicitly. This means we can only track renames to a maximum depth - the below example is 5 deep. It is unlikely a file will be renamed more times than this, so for now, this is sufficient. 
diff --git a/docs/en/getting-started/example-datasets/metrica.md b/docs/en/getting-started/example-datasets/metrica.md index e966f6c20d6..e21237f39bb 100644 --- a/docs/en/getting-started/example-datasets/metrica.md +++ b/docs/en/getting-started/example-datasets/metrica.md @@ -84,7 +84,7 @@ clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" 1680609 ``` -## An example JOIN +## An example JOIN The hits and visits dataset is used in the ClickHouse test routines, this is one of the queries from the test suite. The rest @@ -131,10 +131,10 @@ FORMAT PrettyCompact" ## Next Steps -[A Practical Introduction to Sparse Primary Indexes in ClickHouse](../../guides/improving-query-performance/sparse-primary-indexes/sparse-primary-indexes-intro.md) uses the hits dataset to discuss the differences in ClickHouse indexing compared to traditional relational databases, how ClickHouse builds and uses a sparse primary index, and indexing best practices. +[A Practical Introduction to Sparse Primary Indexes in ClickHouse](/docs/en/guides/best-practices/sparse-primary-indexes.md) uses the hits dataset to discuss the differences in ClickHouse indexing compared to traditional relational databases, how ClickHouse builds and uses a sparse primary index, and indexing best practices. Additional examples of queries to these tables can be found among the ClickHouse [stateful tests](https://github.com/ClickHouse/ClickHouse/blob/d7129855757f38ceec3e4ecc6dafacdabe9b178f/tests/queries/1_stateful/00172_parallel_join.sql). :::note -The test suite uses a database name `test`, and the tables are named `hits` and `visits`. You can rename your database and tables, or edit the SQL from the test file. +The test suite uses a database name `test`, and the tables are named `hits` and `visits`. You can rename your database and tables, or edit the SQL from the test file. ::: diff --git a/docs/en/getting-started/example-datasets/nypd_complaint_data.md b/docs/en/getting-started/example-datasets/nypd_complaint_data.md index 8b02ac23cf9..154cfa78e53 100644 --- a/docs/en/getting-started/example-datasets/nypd_complaint_data.md +++ b/docs/en/getting-started/example-datasets/nypd_complaint_data.md @@ -16,7 +16,7 @@ While working through this guide you will: The dataset used in this guide comes from the NYC Open Data team, and contains data about "all valid felony, misdemeanor, and violation crimes reported to the New York City Police Department (NYPD)". At the time of writing, the data file is 166MB, but it is updated regularly. -**Source**: [data.cityofnewyork.us](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243) +**Source**: [data.cityofnewyork.us](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243) **Terms of use**: https://www1.nyc.gov/home/terms-of-use.page ## Prerequisites @@ -35,7 +35,7 @@ The examples in this guide assume that you have saved the TSV file to `${HOME}/N ## Familiarize yourself with the TSV file -Before starting to work with the ClickHouse database familiarize yourself with the data. +Before starting to work with the ClickHouse database familiarize yourself with the data. 
### Look at the fields in the source TSV file

@@ -47,15 +47,15 @@ clickhouse-local --query \

Sample response

```response
-CMPLNT_NUM Nullable(Float64)
-ADDR_PCT_CD Nullable(Float64)
-BORO_NM Nullable(String)
-CMPLNT_FR_DT Nullable(String)
-CMPLNT_FR_TM Nullable(String)
+CMPLNT_NUM Nullable(Float64)
+ADDR_PCT_CD Nullable(Float64)
+BORO_NM Nullable(String)
+CMPLNT_FR_DT Nullable(String)
+CMPLNT_FR_TM Nullable(String)
```

:::tip
-Most of the time the above command will let you know which fields in the input data are numeric, and which are strings, and which are tuples. This is not always the case. Because ClickHouse is routineley used with datasets containing billions of records there is a default number (100) of rows examined to [infer the schema](../../guides/developer/working-with-json/json-semi-structured.md/#relying-on-schema-inference) in order to avoid parsing billions of rows to infer the schema. The response below may not match what you see, as the dataset is updated several times each year. Looking at the Data Dictionary you can see that CMPLNT_NUM is specified as text, and not numeric. By overriding the default of 100 rows for inference with the setting `SETTINGS input_format_max_rows_to_read_for_schema_inference=2000`
+Most of the time the above command will let you know which fields in the input data are numeric, and which are strings, and which are tuples. This is not always the case. Because ClickHouse is routinely used with datasets containing billions of records, there is a default number (100) of rows examined to [infer the schema](/docs/en/integrations/data-ingestion/data-formats/json.md#relying-on-schema-inference) in order to avoid parsing billions of rows to infer the schema. The response below may not match what you see, as the dataset is updated several times each year. Looking at the Data Dictionary you can see that CMPLNT_NUM is specified as text, and not numeric. By overriding the default of 100 rows for inference with the setting `SETTINGS input_format_max_rows_to_read_for_schema_inference=2000`
you can get a better idea of the content.

Note: as of version 22.5 the default is now 25,000 rows for inferring the schema, so only change the setting if you are on an older version or if you need more than 25,000 rows to be sampled.
@@ -65,46 +65,46 @@ Run this command at your command prompt.
You will be using `clickhouse-local` t ```sh clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \ --query \ -"describe file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')" +"describe file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')" ``` Result: ```response -CMPLNT_NUM Nullable(String) -ADDR_PCT_CD Nullable(Float64) -BORO_NM Nullable(String) -CMPLNT_FR_DT Nullable(String) -CMPLNT_FR_TM Nullable(String) -CMPLNT_TO_DT Nullable(String) -CMPLNT_TO_TM Nullable(String) -CRM_ATPT_CPTD_CD Nullable(String) -HADEVELOPT Nullable(String) -HOUSING_PSA Nullable(Float64) -JURISDICTION_CODE Nullable(Float64) -JURIS_DESC Nullable(String) -KY_CD Nullable(Float64) -LAW_CAT_CD Nullable(String) -LOC_OF_OCCUR_DESC Nullable(String) -OFNS_DESC Nullable(String) -PARKS_NM Nullable(String) -PATROL_BORO Nullable(String) -PD_CD Nullable(Float64) -PD_DESC Nullable(String) -PREM_TYP_DESC Nullable(String) -RPT_DT Nullable(String) -STATION_NAME Nullable(String) -SUSP_AGE_GROUP Nullable(String) -SUSP_RACE Nullable(String) -SUSP_SEX Nullable(String) -TRANSIT_DISTRICT Nullable(Float64) -VIC_AGE_GROUP Nullable(String) -VIC_RACE Nullable(String) -VIC_SEX Nullable(String) -X_COORD_CD Nullable(Float64) -Y_COORD_CD Nullable(Float64) -Latitude Nullable(Float64) -Longitude Nullable(Float64) -Lat_Lon Tuple(Nullable(Float64), Nullable(Float64)) +CMPLNT_NUM Nullable(String) +ADDR_PCT_CD Nullable(Float64) +BORO_NM Nullable(String) +CMPLNT_FR_DT Nullable(String) +CMPLNT_FR_TM Nullable(String) +CMPLNT_TO_DT Nullable(String) +CMPLNT_TO_TM Nullable(String) +CRM_ATPT_CPTD_CD Nullable(String) +HADEVELOPT Nullable(String) +HOUSING_PSA Nullable(Float64) +JURISDICTION_CODE Nullable(Float64) +JURIS_DESC Nullable(String) +KY_CD Nullable(Float64) +LAW_CAT_CD Nullable(String) +LOC_OF_OCCUR_DESC Nullable(String) +OFNS_DESC Nullable(String) +PARKS_NM Nullable(String) +PATROL_BORO Nullable(String) +PD_CD Nullable(Float64) +PD_DESC Nullable(String) +PREM_TYP_DESC Nullable(String) +RPT_DT Nullable(String) +STATION_NAME Nullable(String) +SUSP_AGE_GROUP Nullable(String) +SUSP_RACE Nullable(String) +SUSP_SEX Nullable(String) +TRANSIT_DISTRICT Nullable(Float64) +VIC_AGE_GROUP Nullable(String) +VIC_RACE Nullable(String) +VIC_SEX Nullable(String) +X_COORD_CD Nullable(Float64) +Y_COORD_CD Nullable(Float64) +Latitude Nullable(Float64) +Longitude Nullable(Float64) +Lat_Lon Tuple(Nullable(Float64), Nullable(Float64)) New Georeferenced Column Nullable(String) ``` @@ -362,7 +362,7 @@ The dates shown as `1925` above are from errors in the data. There are several The decisions made above on the data types used for the columns are reflected in the table schema below. We also need to decide on the `ORDER BY` and `PRIMARY KEY` used for the table. At least one -of `ORDER BY` or `PRIMARY KEY` must be specified. Here are some guidelines on deciding on the +of `ORDER BY` or `PRIMARY KEY` must be specified. Here are some guidelines on deciding on the columns to includes in `ORDER BY`, and more information is in the *Next Steps* section at the end of this document. 
@@ -420,7 +420,7 @@ ORDER BY ( borough, offense_description, date_reported ) Putting together the changes to data types and the `ORDER BY` tuple gives this table structure: ```sql -CREATE TABLE NYPD_Complaint ( +CREATE TABLE NYPD_Complaint ( complaint_number String, precinct UInt8, borough LowCardinality(String), @@ -429,7 +429,7 @@ CREATE TABLE NYPD_Complaint ( was_crime_completed String, housing_authority String, housing_level_code UInt32, - jurisdiction_code UInt8, + jurisdiction_code UInt8, jurisdiction LowCardinality(String), offense_code UInt8, offense_level LowCardinality(String), @@ -478,7 +478,7 @@ Query id: 6a5b10bf-9333-4090-b36e-c7f08b1d9e01 Row 1: ────── -partition_key: +partition_key: sorting_key: borough, offense_description, date_reported primary_key: borough, offense_description, date_reported table: NYPD_Complaint @@ -495,7 +495,7 @@ We will use `clickhouse-local` tool for data preprocessing and `clickhouse-clien :::tip `table='input'` appears in the arguments to clickhouse-local below. clickhouse-local takes the provided input (`cat ${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv`) and inserts the input into a table. By default the table is named `table`. In this guide the name of the table is set to `input` to make the data flow clearer. The final argument to clickhouse-local is a query that selects from the table (`FROM input`) which is then piped to `clickhouse-client` to populate the table `NYPD_Complaint`. ::: - + ```sql cat ${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv \ | clickhouse-local --table='input' --input-format='TSVWithNames' \ @@ -512,12 +512,12 @@ cat ${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv \ CRM_ATPT_CPTD_CD AS was_crime_completed, HADEVELOPT AS housing_authority_development, HOUSING_PSA AS housing_level_code, - JURISDICTION_CODE AS jurisdiction_code, + JURISDICTION_CODE AS jurisdiction_code, JURIS_DESC AS jurisdiction, KY_CD AS offense_code, LAW_CAT_CD AS offense_level, LOC_OF_OCCUR_DESC AS location_descriptor, - OFNS_DESC AS offense_description, + OFNS_DESC AS offense_description, PARKS_NM AS park_name, PATROL_BORO AS patrol_borough, PD_CD, @@ -529,7 +529,7 @@ cat ${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv \ SUSP_RACE AS suspect_race, SUSP_SEX AS suspect_sex, TRANSIT_DISTRICT AS transit_district, - VIC_AGE_GROUP AS victim_age_group, + VIC_AGE_GROUP AS victim_age_group, VIC_RACE AS victim_race, VIC_SEX AS victim_sex, X_COORD_CD AS NY_x_coordinate, @@ -538,7 +538,7 @@ cat ${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv \ Longitude FROM input" \ | clickhouse-client --query='INSERT INTO NYPD_Complaint FORMAT TSV' -``` +``` ## Validate the Data {#validate-data} @@ -560,7 +560,7 @@ Result: │ 208993 │ └─────────┘ -1 row in set. Elapsed: 0.001 sec. +1 row in set. Elapsed: 0.001 sec. ``` The size of the dataset in ClickHouse is just 12% of the original TSV file, compare the size of the original TSV file with the size of the table: @@ -651,4 +651,4 @@ Query id: 8cdcdfd4-908f-4be0-99e3-265722a2ab8d ## Next Steps -[A Practical Introduction to Sparse Primary Indexes in ClickHouse](../../guides/improving-query-performance/sparse-primary-indexes/sparse-primary-indexes-intro.md) discusses the differences in ClickHouse indexing compared to traditional relational databases, how ClickHouse builds and uses a sparse primary index, and indexing best practices. 
+[A Practical Introduction to Sparse Primary Indexes in ClickHouse](/docs/en/guides/best-practices/sparse-primary-indexes.md) discusses the differences in ClickHouse indexing compared to traditional relational databases, how ClickHouse builds and uses a sparse primary index, and indexing best practices. diff --git a/docs/en/getting-started/example-datasets/recipes.md b/docs/en/getting-started/example-datasets/recipes.md index 4cc94c3ce5b..729d3d17015 100644 --- a/docs/en/getting-started/example-datasets/recipes.md +++ b/docs/en/getting-started/example-datasets/recipes.md @@ -80,7 +80,7 @@ Result: ### Top Components by the Number of Recipes: -In this example we learn how to use [arrayJoin](../../sql-reference/functions/array-join/) function to expand an array into a set of rows. +In this example we learn how to use [arrayJoin](../../sql-reference/functions/array-join.md) function to expand an array into a set of rows. Query: @@ -185,7 +185,7 @@ Result: 10 rows in set. Elapsed: 0.215 sec. Processed 2.23 million rows, 1.48 GB (10.35 million rows/s., 6.86 GB/s.) ``` -In this example, we involve [has](../../sql-reference/functions/array-functions/#hasarr-elem) function to filter by array elements and sort by the number of directions. +In this example, we involve [has](../../sql-reference/functions/array-functions.md#hasarr-elem) function to filter by array elements and sort by the number of directions. There is a wedding cake that requires the whole 126 steps to produce! Show that directions: diff --git a/docs/en/getting-started/example-datasets/uk-price-paid.md b/docs/en/getting-started/example-datasets/uk-price-paid.md index 2a89bfda2e7..8ed79c3986f 100644 --- a/docs/en/getting-started/example-datasets/uk-price-paid.md +++ b/docs/en/getting-started/example-datasets/uk-price-paid.md @@ -1,17 +1,17 @@ --- slug: /en/getting-started/example-datasets/uk-price-paid -sidebar_label: UK Property Price Paid +sidebar_label: UK Property Prices sidebar_position: 1 -title: "UK Property Price Paid" --- -The dataset contains data about prices paid for real-estate property in England and Wales. The data is available since year 1995. -The size of the dataset in uncompressed form is about 4 GiB and it will take about 278 MiB in ClickHouse. +# The UK property prices dataset -Source: https://www.gov.uk/government/statistical-data-sets/price-paid-data-downloads -Description of the fields: https://www.gov.uk/guidance/about-the-price-paid-data +Projections are a great way to improve the performance of queries that you run frequently. We will demonstrate the power of projections +using the UK property dataset, which contains data about prices paid for real-estate property in England and Wales. The data is available since 1995, and the size of the dataset in uncompressed form is about 4 GiB (which will only take about 278 MiB in ClickHouse). -Contains HM Land Registry data © Crown copyright and database right 2021. This data is licensed under the Open Government Licence v3.0. +- Source: https://www.gov.uk/government/statistical-data-sets/price-paid-data-downloads +- Description of the fields: https://www.gov.uk/guidance/about-the-price-paid-data +- Contains HM Land Registry data © Crown copyright and database right 2021. This data is licensed under the Open Government Licence v3.0. 
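Since the rewritten introduction above frames this dataset as a projections demo, here is a hedged sketch of the kind of projection DDL it builds toward. The table name `uk_price_paid` and the `town`/`price` columns are assumptions about the dataset, not text taken from this patch.

```sql
-- Sketch only: a projection that pre-aggregates the average price per town, so frequent
-- GROUP BY town queries can be answered from the smaller pre-aggregated data parts.
ALTER TABLE uk_price_paid
    ADD PROJECTION prj_town_avg_price
    (
        SELECT
            town,
            avg(price)
        GROUP BY town
    );

-- Build the projection for rows that were inserted before it was added.
ALTER TABLE uk_price_paid MATERIALIZE PROJECTION prj_town_avg_price;
```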
## Create the Table {#create-table}

diff --git a/docs/en/getting-started/example-datasets/youtube-dislikes.md b/docs/en/getting-started/example-datasets/youtube-dislikes.md
new file mode 100644
index 00000000000..2eb2071d5f2
--- /dev/null
+++ b/docs/en/getting-started/example-datasets/youtube-dislikes.md
@@ -0,0 +1,219 @@
+---
+slug: /en/getting-started/example-datasets/youtube-dislikes
+sidebar_label: YouTube Dislikes
+description: A collection of dislikes of YouTube videos.
+---
+
+# YouTube dataset of dislikes
+
+In November of 2021, YouTube removed the public ***dislike*** count from all of its videos. While creators can still see the number of dislikes, viewers can only see how many ***likes*** a video has received.
+
+:::important
+The dataset has over 4.55 billion records, so be careful just copying-and-pasting the commands below unless your resources can handle that type of volume. The commands below were executed on a **Production** instance of [ClickHouse Cloud](https://clickhouse.cloud).
+:::
+
+The data is in a JSON format and can be downloaded from [archive.org](https://archive.org/download/dislikes_youtube_2021_12_video_json_files). We have made this same data available in S3 so that it can be downloaded more efficiently into a ClickHouse Cloud instance.
+
+Here are the steps to create a table in ClickHouse Cloud and insert the data.
+
+:::note
+The steps below will easily work on a local install of ClickHouse too. The only change would be to use the `s3` function instead of `s3cluster` (unless you have a cluster configured - in which case change `default` to the name of your cluster).
+:::
+
+## Step-by-step instructions
+
+1. Let's see what the data looks like. The `s3cluster` table function returns a table, so we can `DESCRIBE` the result:
+
+```sql
+DESCRIBE s3Cluster(
+    'default',
+    'https://clickhouse-public-datasets.s3.amazonaws.com/youtube/original/files/*.zst',
+    'JSONLines'
+);
+```
+
+ClickHouse infers the following schema from the JSON file:
+
+```response
+┌─name────────────────┬─type─────────────────────────────────┐
+│ id │ Nullable(String) │
+│ fetch_date │ Nullable(Int64) │
+│ upload_date │ Nullable(String) │
+│ title │ Nullable(String) │
+│ uploader_id │ Nullable(String) │
+│ uploader │ Nullable(String) │
+│ uploader_sub_count │ Nullable(Int64) │
+│ is_age_limit │ Nullable(Bool) │
+│ view_count │ Nullable(Int64) │
+│ like_count │ Nullable(Int64) │
+│ dislike_count │ Nullable(Int64) │
+│ is_crawlable │ Nullable(Bool) │
+│ is_live_content │ Nullable(Bool) │
+│ has_subtitles │ Nullable(Bool) │
+│ is_ads_enabled │ Nullable(Bool) │
+│ is_comments_enabled │ Nullable(Bool) │
+│ description │ Nullable(String) │
+│ rich_metadata │ Array(Map(String, Nullable(String))) │
+│ super_titles │ Array(Map(String, Nullable(String))) │
+│ uploader_badges │ Nullable(String) │
+│ video_badges │ Nullable(String) │
+└─────────────────────┴──────────────────────────────────────┘
+```
+
+2. Based on the inferred schema, we cleaned up the data types and added a primary key. Define the following table:
+
+```sql
+CREATE TABLE youtube
+(
+    `id` String,
+    `fetch_date` DateTime,
+    `upload_date` String,
+    `title` String,
+    `uploader_id` String,
+    `uploader` String,
+    `uploader_sub_count` Int64,
+    `is_age_limit` Bool,
+    `view_count` Int64,
+    `like_count` Int64,
+    `dislike_count` Int64,
+    `is_crawlable` Bool,
+    `has_subtitles` Bool,
+    `is_ads_enabled` Bool,
+    `is_comments_enabled` Bool,
+    `description` String,
+    `rich_metadata` Array(Map(String, String)),
+    `super_titles` Array(Map(String, String)),
+    `uploader_badges` String,
+    `video_badges` String
+)
+ENGINE = MergeTree
+ORDER BY (upload_date, uploader);
+```
+
+3. The following command streams the records from the S3 files into the `youtube` table.
+
+:::important
+This inserts a lot of data - 4.65 billion rows. If you do not want the entire dataset, simply add a `LIMIT` clause with the desired number of rows.
+:::
+
+```sql
+INSERT INTO youtube
+SETTINGS input_format_null_as_default = 1
+SELECT
+    id,
+    parseDateTimeBestEffortUS(toString(fetch_date)) AS fetch_date,
+    upload_date,
+    ifNull(title, '') AS title,
+    uploader_id,
+    ifNull(uploader, '') AS uploader,
+    uploader_sub_count,
+    is_age_limit,
+    view_count,
+    like_count,
+    dislike_count,
+    is_crawlable,
+    has_subtitles,
+    is_ads_enabled,
+    is_comments_enabled,
+    ifNull(description, '') AS description,
+    rich_metadata,
+    super_titles,
+    ifNull(uploader_badges, '') AS uploader_badges,
+    ifNull(video_badges, '') AS video_badges
+FROM s3Cluster(
+    'default',
+    'https://clickhouse-public-datasets.s3.amazonaws.com/youtube/original/files/*.zst',
+    'JSONLines'
+);
+```
+
+4. Open a new tab in the SQL Console of ClickHouse Cloud (or a new `clickhouse-client` window) and watch the count increase. It will take a while to insert 4.56B rows, depending on your server resources. (Without any tweaking of settings, it takes about 4.5 hours.)
+
+```sql
+SELECT formatReadableQuantity(count())
+FROM youtube
+```
+
+```response
+┌─formatReadableQuantity(count())─┐
+│ 4.56 billion │
+└─────────────────────────────────┘
+```
+
+5. Once the data is inserted, go ahead and count the number of dislikes of your favorite videos or channels. Let's see how many videos were uploaded by ClickHouse:
+
+```sql
+SELECT count()
+FROM youtube
+WHERE uploader = 'ClickHouse';
+```
+
+```response
+┌─count()─┐
+│ 84 │
+└─────────┘
+
+1 row in set. Elapsed: 0.570 sec. Processed 237.57 thousand rows, 5.77 MB (416.54 thousand rows/s., 10.12 MB/s.)
+```
+
+:::note
+The query above runs so quickly because we chose `uploader` as the first column of the primary key - so it only had to process 237k rows.
+:::
+
+6. Let's look at likes and dislikes of ClickHouse videos:
+
+```sql
+SELECT
+    title,
+    like_count,
+    dislike_count
+FROM youtube
+WHERE uploader = 'ClickHouse'
+ORDER BY dislike_count DESC;
+```
+
+The response looks like:
+
+```response
+┌─title────────────────────────────────────────────────────────────────────────────────────────────────┬─like_count─┬─dislike_count─┐
+│ ClickHouse v21.11 Release Webinar │ 52 │ 3 │
+│ ClickHouse Introduction │ 97 │ 3 │
+│ Casa Modelo Algarve │ 180 │ 3 │
+│ Профайлер запросов: трудный путь │ 33 │ 3 │
+│ ClickHouse в Курсометре │ 4 │ 2 │
+│ 10 Good Reasons to Use ClickHouse │ 27 │ 2 │
+...
+
+84 rows in set. Elapsed: 0.013 sec. Processed 155.65 thousand rows, 16.94 MB (11.96 million rows/s., 1.30 GB/s.)
+```
+
+7.
Here is a search for videos with **ClickHouse** in the `title` or `description` fields: + +```sql +SELECT + view_count, + like_count, + dislike_count, + concat('https://youtu.be/', id) AS url, + title +FROM youtube +WHERE (title ILIKE '%ClickHouse%') OR (description ILIKE '%ClickHouse%') +ORDER BY + like_count DESC, + view_count DESC +``` + +This query has to process every row, and also parse through two columns of strings. Even then, we get decent performance at 4.15M rows/second: + +```response +1174 rows in set. Elapsed: 1099.368 sec. Processed 4.56 billion rows, 1.98 TB (4.15 million rows/s., 1.80 GB/s.) +``` + +The results look like: + +```response +┌─view_count─┬─like_count─┬─dislike_count─┬─url──────────────────────────┬─title──────────────────────────────────────────────────────────────────────────────────────────────────┠+│ 1919 │ 63 │ 1 │ https://youtu.be/b9MeoOtAivQ │ ClickHouse v21.10 Release Webinar │ +│ 8710 │ 62 │ 4 │ https://youtu.be/PeV1mC2z--M │ What is JDBC DriverManager? | JDBC │ +│ 3534 │ 62 │ 1 │ https://youtu.be/8nWRhK9gw10 │ CLICKHOUSE - Arquitetura Modular │ +``` \ No newline at end of file diff --git a/docs/en/getting-started/install.md b/docs/en/getting-started/install.md index 0867f3a0795..2c0ac70a321 100644 --- a/docs/en/getting-started/install.md +++ b/docs/en/getting-started/install.md @@ -14,75 +14,35 @@ import CodeBlock from '@theme/CodeBlock'; You have three options for getting up and running with ClickHouse: - **[ClickHouse Cloud](https://clickhouse.com/cloud/):** The official ClickHouse as a service, - built by, maintained and supported by the creators of ClickHouse -- **[Self-managed ClickHouse](#self-managed-install):** ClickHouse can run on any Linux, FreeBSD, or macOS with x86-64, ARM, or PowerPC64LE CPU architecture -- **[Docker Image](https://hub.docker.com/r/clickhouse/clickhouse-server/):** Read the guide with the official image in Docker Hub +- **[Quick Install](#quick-install):** an easy-to-download binary for testing and developing with ClickHouse +- **[Production Deployments](#available-installation-options):** ClickHouse can run on any Linux, FreeBSD, or macOS with x86-64, ARM, or PowerPC64LE CPU architecture +- **[Docker Image](https://hub.docker.com/r/clickhouse/clickhouse-server/):** use the official Docker image in Docker Hub ## ClickHouse Cloud The quickest and easiest way to get up and running with ClickHouse is to create a new service in [ClickHouse Cloud](https://clickhouse.cloud/). -## Self-Managed Install +## Quick Install :::tip For production installs of a specific release version see the [installation options](#available-installation-options) down below. ::: - - +On Linux and macOS: -1. The simplest way to download ClickHouse locally is to run the following command. If your operating system is supported, an appropriate ClickHouse binary will be downloaded and made runnable: +1. If you are just getting started and want to see what ClickHouse can do, the simplest way to download ClickHouse locally is to run the following command. It downloads a single binary for your operating system that can be used to run the ClickHouse server, clickhouse-client, clickhouse-local, +ClickHouse Keeper, and other tools: ```bash curl https://clickhouse.com/ | sh ``` -1. Run the `install` command, which defines a collection of useful symlinks along with the files and folders used by ClickHouse - all of which you can see in the output of the install script: - - ```bash - sudo ./clickhouse install - ``` - -1. 
At the end of the install script, you are prompted for a password for the `default` user. Feel free to enter a password, or you can optionally leave it blank: - - ```response - Creating log directory /var/log/clickhouse-server. - Creating data directory /var/lib/clickhouse. - Creating pid directory /var/run/clickhouse-server. - chown -R clickhouse:clickhouse '/var/log/clickhouse-server' - chown -R clickhouse:clickhouse '/var/run/clickhouse-server' - chown clickhouse:clickhouse '/var/lib/clickhouse' - Enter password for default user: - ``` - You should see the following output: - - ```response - ClickHouse has been successfully installed. - - Start clickhouse-server with: - sudo clickhouse start - - Start clickhouse-client with: - clickhouse-client - ``` - 1. Run the following command to start the ClickHouse server: ```bash - sudo clickhouse start + ./clickhouse server ``` - - - -1. The simplest way to download ClickHouse locally is to run the following command. If your operating system is supported, an appropriate ClickHouse binary will be downloaded and made runnable: - ```bash - curl https://clickhouse.com/ | sh - ``` - -1. Run the ClickHouse server: - - ```bash - ./clickhouse server - ``` + The first time you run this script, the necessary files and folders are created in the current directory, then the server starts. 1. Open a new terminal and use the **clickhouse-client** to connect to your service: @@ -101,15 +61,14 @@ For production installs of a specific release version see the [installation opti You are ready to start sending DDL and SQL commands to ClickHouse! - - - :::tip -The [Quick Start](/docs/en/quick-start.mdx/#step-1-get-clickhouse) walks through the steps to download and run ClickHouse, connect to it, and insert data. +The [Quick Start](/docs/en/quick-start.mdx) walks through the steps for creating tables and inserting data. ::: -## Available Installation Options {#available-installation-options} +## Production Deployments {#available-installation-options} + +For production deployments of ClickHouse, choose from one of the following install options. ### From DEB Packages {#install-from-deb-packages} @@ -118,9 +77,12 @@ It is recommended to use official pre-compiled `deb` packages for Debian or Ubun #### Setup the Debian repository ``` bash sudo apt-get install -y apt-transport-https ca-certificates dirmngr -sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754 +GNUPGHOME=$(mktemp -d) +sudo GNUPGHOME="$GNUPGHOME" gpg --no-default-keyring --keyring /usr/share/keyrings/clickhouse-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 8919F6BD2B48D754 +sudo rm -r "$GNUPGHOME" +sudo chmod +r /usr/share/keyrings/clickhouse-keyring.gpg -echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee \ +echo "deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb stable main" | sudo tee \ /etc/apt/sources.list.d/clickhouse.list sudo apt-get update ``` @@ -174,7 +136,7 @@ clickhouse-client # or "clickhouse-client --password" if you set up a password. -You can replace `stable` with `lts` to use different [release kinds](/docs/en/faq/operations/production.md) based on your needs. +You can replace `stable` with `lts` to use different [release kinds](/knowledgebase/production) based on your needs. You can also download and install packages manually from [here](https://packages.clickhouse.com/deb/pool/main/c/). 
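As a small, hedged aside that is not part of the patched page: once a server from either install path is running and `clickhouse-client` is connected, a one-line query is enough to confirm which build you are talking to.

```sql
-- Quick post-install sanity check from clickhouse-client:
SELECT version();
```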
@@ -272,7 +234,7 @@ clickhouse-client # or "clickhouse-client --password" if you set up a password. -You can replace `stable` with `lts` to use different [release kinds](/docs/en/faq/operations/production.md) based on your needs. +You can replace `stable` with `lts` to use different [release kinds](/knowledgebase/production) based on your needs. Then run these commands to install packages: diff --git a/docs/en/getting-started/playground.md b/docs/en/getting-started/playground.md index e995ea6ef8b..dbb8d46a2fc 100644 --- a/docs/en/getting-started/playground.md +++ b/docs/en/getting-started/playground.md @@ -1,5 +1,5 @@ --- -sidebar_label: Playground +sidebar_label: ClickHouse Playground sidebar_position: 2 keywords: [clickhouse, playground, getting, started, docs] description: The ClickHouse Playground allows people to experiment with ClickHouse by running queries instantly, without setting up their server or cluster. @@ -11,7 +11,7 @@ slug: /en/getting-started/playground [ClickHouse Playground](https://play.clickhouse.com/play?user=play) allows people to experiment with ClickHouse by running queries instantly, without setting up their server or cluster. Several example datasets are available in Playground. -You can make queries to Playground using any HTTP client, for example [curl](https://curl.haxx.se) or [wget](https://www.gnu.org/software/wget/), or set up a connection using [JDBC](../interfaces/jdbc.md) or [ODBC](../interfaces/odbc.md) drivers. More information about software products that support ClickHouse is available [here](../interfaces). +You can make queries to Playground using any HTTP client, for example [curl](https://curl.haxx.se) or [wget](https://www.gnu.org/software/wget/), or set up a connection using [JDBC](../interfaces/jdbc.md) or [ODBC](../interfaces/odbc.md) drivers. More information about software products that support ClickHouse is available [here](../integrations/index.mdx). ## Credentials {#credentials} diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index db2e773a685..ae3756d5d41 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1,7 +1,7 @@ --- slug: /en/interfaces/formats sidebar_position: 21 -sidebar_label: Input and Output Formats +sidebar_label: View all formats... title: Formats for Input and Output Data --- @@ -154,7 +154,7 @@ Arrays are written as a list of comma-separated values in square brackets. Numbe In input data, ENUM values can be represented as names or as ids. First, we try to match the input value to the ENUM name. If we fail and the input value is a number, we try to match this number to ENUM id. If input data contains only ENUM ids, it's recommended to enable the setting [input_format_tsv_enum_as_number](/docs/en/operations/settings/settings-formats.md/#input_format_tsv_enum_as_number) to optimize ENUM parsing. -Each element of [Nested](/docs/en/sql-reference/data-types/nested-data-structures/nested.md) structures is represented as an array. +Each element of [Nested](/docs/en/sql-reference/data-types/nested-data-structures/index.md) structures is represented as an array. For example: @@ -684,7 +684,7 @@ Example: ## JSONColumns {#jsoncolumns} :::tip -The output of the JSONColumns* formats provides the ClickHouse field name and then the content of each row of the table for that field; +The output of the JSONColumns* formats provides the ClickHouse field name and then the content of each row of the table for that field; visually, the data is rotated 90 degrees to the left. 
::: @@ -1150,7 +1150,7 @@ Any set of bytes can be output in the strings. Use the `JSONEachRow` format if y ### Usage of Nested Structures {#jsoneachrow-nested} -If you have a table with [Nested](/docs/en/sql-reference/data-types/nested-data-structures/nested.md) data type columns, you can insert JSON data with the same structure. Enable this feature with the [input_format_import_nested_json](/docs/en/operations/settings/settings-formats.md/#input_format_import_nested_json) setting. +If you have a table with [Nested](/docs/en/sql-reference/data-types/nested-data-structures/index.md) data type columns, you can insert JSON data with the same structure. Enable this feature with the [input_format_import_nested_json](/docs/en/operations/settings/settings-formats.md/#input_format_import_nested_json) setting. For example, consider the following table: @@ -1776,7 +1776,7 @@ message MessageType { ``` ClickHouse tries to find a column named `x.y.z` (or `x_y_z` or `X.y_Z` and so on). -Nested messages are suitable to input or output a [nested data structures](/docs/en/sql-reference/data-types/nested-data-structures/nested.md). +Nested messages are suitable to input or output a [nested data structures](/docs/en/sql-reference/data-types/nested-data-structures/index.md). Default values defined in a protobuf schema like this @@ -1808,23 +1808,26 @@ ClickHouse Avro format supports reading and writing [Avro data files](https://av The table below shows supported data types and how they match ClickHouse [data types](/docs/en/sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries. -| Avro data type `INSERT` | ClickHouse data type | Avro data type `SELECT` | -|---------------------------------------------|-----------------------------------------------------------------------------------------------------------------|-------------------------------------------------| -| `boolean`, `int`, `long`, `float`, `double` | [Int(8\ | 16\ |32)](/docs/en/sql-reference/data-types/int-uint.md), [UInt(8\|16\|32)](/docs/en/sql-reference/data-types/int-uint.md) | `int` | -| `boolean`, `int`, `long`, `float`, `double` | [Int64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `long` | -| `boolean`, `int`, `long`, `float`, `double` | [Float32](/docs/en/sql-reference/data-types/float.md) | `float` | -| `boolean`, `int`, `long`, `float`, `double` | [Float64](/docs/en/sql-reference/data-types/float.md) | `double` | -| `bytes`, `string`, `fixed`, `enum` | [String](/docs/en/sql-reference/data-types/string.md) | `bytes` or `string` \* | -| `bytes`, `string`, `fixed` | [FixedString(N)](/docs/en/sql-reference/data-types/fixedstring.md) | `fixed(N)` | -| `enum` | [Enum(8\ | 16)](/docs/en/sql-reference/data-types/enum.md) | `enum` | -| `array(T)` | [Array(T)](/docs/en/sql-reference/data-types/array.md) | `array(T)` | -| `union(null, T)`, `union(T, null)` | [Nullable(T)](/docs/en/sql-reference/data-types/date.md) | `union(null, T)` | -| `null` | [Nullable(Nothing)](/docs/en/sql-reference/data-types/special-data-types/nothing.md) | `null` | -| `int (date)` \** | [Date](/docs/en/sql-reference/data-types/date.md) | `int (date)` \** | -| `long (timestamp-millis)` \** | [DateTime64(3)](/docs/en/sql-reference/data-types/datetime.md) | `long (timestamp-millis)` \* | -| `long (timestamp-micros)` \** | [DateTime64(6)](/docs/en/sql-reference/data-types/datetime.md) | `long (timestamp-micros)` \* | -| `int` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `int` | -| 
`fixed(16)` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `fixed(16)` | +| Avro data type `INSERT` | ClickHouse data type | Avro data type `SELECT` | +|---------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|-------------------------------| +| `boolean`, `int`, `long`, `float`, `double` | [Int(8\16\32)](/docs/en/sql-reference/data-types/int-uint.md), [UInt(8\16\32)](/docs/en/sql-reference/data-types/int-uint.md) | `int` | +| `boolean`, `int`, `long`, `float`, `double` | [Int64](/docs/en/sql-reference/data-types/int-uint.md), [UInt64](/docs/en/sql-reference/data-types/int-uint.md) | `long` | +| `boolean`, `int`, `long`, `float`, `double` | [Float32](/docs/en/sql-reference/data-types/float.md) | `float` | +| `boolean`, `int`, `long`, `float`, `double` | [Float64](/docs/en/sql-reference/data-types/float.md) | `double` | +| `bytes`, `string`, `fixed`, `enum` | [String](/docs/en/sql-reference/data-types/string.md) | `bytes` or `string` \* | +| `bytes`, `string`, `fixed` | [FixedString(N)](/docs/en/sql-reference/data-types/fixedstring.md) | `fixed(N)` | +| `enum` | [Enum(8\16)](/docs/en/sql-reference/data-types/enum.md) | `enum` | +| `array(T)` | [Array(T)](/docs/en/sql-reference/data-types/array.md) | `array(T)` | +| `union(null, T)`, `union(T, null)` | [Nullable(T)](/docs/en/sql-reference/data-types/date.md) | `union(null, T)` | +| `null` | [Nullable(Nothing)](/docs/en/sql-reference/data-types/special-data-types/nothing.md) | `null` | +| `int (date)` \** | [Date](/docs/en/sql-reference/data-types/date.md), [Date32](docs/en/sql-reference/data-types/date32.md) | `int (date)` \** | +| `long (timestamp-millis)` \** | [DateTime64(3)](/docs/en/sql-reference/data-types/datetime.md) | `long (timestamp-millis)` \** | +| `long (timestamp-micros)` \** | [DateTime64(6)](/docs/en/sql-reference/data-types/datetime.md) | `long (timestamp-micros)` \** | +| `int` | [IPv4](/docs/en/sql-reference/data-types/domains/ipv4.md) | `int` | +| `fixed(16)` | [IPv6](/docs/en/sql-reference/data-types/domains/ipv6.md) | `fixed(16)` | +| `bytes (decimal)` \** | [Decimal(P, S)](/docs/en/sql-reference/data-types/decimal.md) | `bytes (decimal)` \** | +| `string (uuid)` \** | [UUID](/docs/en/sql-reference/data-types/uuid.md) | `string (uuid)` \** | + \* `bytes` is default, controlled by [output_format_avro_string_column_pattern](/docs/en/operations/settings/settings-formats.md/#output_format_avro_string_column_pattern) \** [Avro logical types](https://avro.apache.org/docs/current/spec.html#Logical+Types) @@ -1975,7 +1978,7 @@ To exchange data with Hadoop, you can use [HDFS table engine](/docs/en/engines/t - [output_format_parquet_row_group_size](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_row_group_size) - row group size in rows while data output. Default value - `1000000`. - [output_format_parquet_string_as_string](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_string_as_string) - use Parquet String type instead of Binary for String columns. Default value - `false`. -- [input_format_parquet_import_nested](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_import_nested) - allow inserting array of structs into [Nested](/docs/en/sql-reference/data-types/nested-data-structures/nested.md) table in Parquet input format. Default value - `false`. 
+- [input_format_parquet_import_nested](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_import_nested) - allow inserting array of structs into [Nested](/docs/en/sql-reference/data-types/nested-data-structures/index.md) table in Parquet input format. Default value - `false`. - [input_format_parquet_case_insensitive_column_matching](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_case_insensitive_column_matching) - ignore case when matching Parquet columns with ClickHouse columns. Default value - `false`. - [input_format_parquet_allow_missing_columns](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_allow_missing_columns) - allow missing columns while reading Parquet data. Default value - `false`. - [input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference) - allow skipping columns with unsupported types while schema inference for Parquet format. Default value - `false`. diff --git a/docs/en/interfaces/overview.md b/docs/en/interfaces/overview.md index e5fa503e8fc..ee47e010f9e 100644 --- a/docs/en/interfaces/overview.md +++ b/docs/en/interfaces/overview.md @@ -6,7 +6,7 @@ keywords: [clickhouse, network, interfaces, http, tcp, grpc, command-line, clien description: ClickHouse provides three network interfaces --- -# Interfaces +# Drivers and Interfaces ClickHouse provides three network interfaces (they can be optionally wrapped in TLS for additional security): diff --git a/docs/en/interfaces/postgresql.md b/docs/en/interfaces/postgresql.md index 9ff83559787..f7a619ca620 100644 --- a/docs/en/interfaces/postgresql.md +++ b/docs/en/interfaces/postgresql.md @@ -8,7 +8,7 @@ sidebar_label: PostgreSQL Interface ClickHouse supports the PostgreSQL wire protocol, which allows you to use Postgres clients to connect to ClickHouse. In a sense, ClickHouse can pretend to be a PostgreSQL instance - allowing you to connect a PostgreSQL client application to ClickHouse that is not already directly supported by ClickHouse (for example, Amazon Redshift). -To enable the PostgreSQL wire protocol, add the [postgresql_port](../operations/server-configuration-parameters/settings#server_configuration_parameters-postgresql_port) setting to your server's configuration file. For example, you could define the port in a new XML file in your `config.d` folder: +To enable the PostgreSQL wire protocol, add the [postgresql_port](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-postgresql_port) setting to your server's configuration file. 
For example, you could define the port in a new XML file in your `config.d` folder: ```xml diff --git a/docs/en/interfaces/schema-inference.md b/docs/en/interfaces/schema-inference.md index e028b4a6d96..c448d0aee47 100644 --- a/docs/en/interfaces/schema-inference.md +++ b/docs/en/interfaces/schema-inference.md @@ -1473,6 +1473,7 @@ In Avro format ClickHouse reads its schema from the data and converts it to Clic |------------------------------------|--------------------------------------------------------------------------------| | `boolean` | [Bool](../sql-reference/data-types/boolean.md) | | `int` | [Int32](../sql-reference/data-types/int-uint.md) | +| `int (date)` \* | [Date32](../sql-reference/data-types/date32.md) | | `long` | [Int64](../sql-reference/data-types/int-uint.md) | | `float` | [Float32](../sql-reference/data-types/float.md) | | `double` | [Float64](../sql-reference/data-types/float.md) | @@ -1482,6 +1483,10 @@ In Avro format ClickHouse reads its schema from the data and converts it to Clic | `array(T)` | [Array(T)](../sql-reference/data-types/array.md) | | `union(null, T)`, `union(T, null)` | [Nullable(T)](../sql-reference/data-types/date.md) | | `null` | [Nullable(Nothing)](../sql-reference/data-types/special-data-types/nothing.md) | +| `string (uuid)` \* | [UUID](../sql-reference/data-types/uuid.md) | +| `binary (decimal)` \* | [Decimal(P, S)](../sql-reference/data-types/decimal.md) | + +\* [Avro logical types](https://avro.apache.org/docs/current/spec.html#Logical+Types) Other Avro types are not supported. diff --git a/docs/en/operations/_category_.yml b/docs/en/operations/_category_.yml index 08849e7489d..352809f663b 100644 --- a/docs/en/operations/_category_.yml +++ b/docs/en/operations/_category_.yml @@ -2,7 +2,3 @@ position: 70 label: 'Operations' collapsible: true collapsed: true -link: - type: generated-index - title: Operations - slug: /en/operations diff --git a/docs/en/operations/access-rights.md b/docs/en/operations/access-rights.md deleted file mode 100644 index 4c4a06dbe1e..00000000000 --- a/docs/en/operations/access-rights.md +++ /dev/null @@ -1,152 +0,0 @@ ---- -slug: /en/operations/access-rights -sidebar_position: 48 -sidebar_label: Access Control and Account Management -title: Access Control and Account Management ---- - -ClickHouse supports access control management based on [RBAC](https://en.wikipedia.org/wiki/Role-based_access_control) approach. - -ClickHouse access entities: -- [User account](#user-account-management) -- [Role](#role-management) -- [Row Policy](#row-policy-management) -- [Settings Profile](#settings-profiles-management) -- [Quota](#quotas-management) - -You can configure access entities using: - -- SQL-driven workflow. - - You need to [enable](#enabling-access-control) this functionality. - -- Server [configuration files](../operations/configuration-files.md) `users.xml` and `config.xml`. - -We recommend using SQL-driven workflow. Both of the configuration methods work simultaneously, so if you use the server configuration files for managing accounts and access rights, you can smoothly switch to SQL-driven workflow. - -:::warning -You can’t manage the same access entity by both configuration methods simultaneously. -::: - -To see all users, roles, profiles, etc. and all their grants use [SHOW ACCESS](../sql-reference/statements/show.md#show-access-statement) statement. 
- -## Usage {#access-control-usage} - -By default, the ClickHouse server provides the `default` user account which is not allowed using SQL-driven access control and account management but has all the rights and permissions. The `default` user account is used in any cases when the username is not defined, for example, at login from client or in distributed queries. In distributed query processing a default user account is used, if the configuration of the server or cluster does not specify the [user and password](../engines/table-engines/special/distributed.md) properties. - -If you just started using ClickHouse, consider the following scenario: - -1. [Enable](#enabling-access-control) SQL-driven access control and account management for the `default` user. -2. Log in to the `default` user account and create all the required users. Don’t forget to create an administrator account (`GRANT ALL ON *.* TO admin_user_account WITH GRANT OPTION`). -3. [Restrict permissions](../operations/settings/permissions-for-queries.md#permissions_for_queries) for the `default` user and disable SQL-driven access control and account management for it. - -### Properties of Current Solution {#access-control-properties} - -- You can grant permissions for databases and tables even if they do not exist. -- If a table was deleted, all the privileges that correspond to this table are not revoked. This means that even if you create a new table with the same name later, all the privileges remain valid. To revoke privileges corresponding to the deleted table, you need to execute, for example, the `REVOKE ALL PRIVILEGES ON db.table FROM ALL` query. -- There are no lifetime settings for privileges. - -## User Account {#user-account-management} - -A user account is an access entity that allows to authorize someone in ClickHouse. A user account contains: - -- Identification information. -- [Privileges](../sql-reference/statements/grant.md#grant-privileges) that define a scope of queries the user can execute. -- Hosts allowed to connect to the ClickHouse server. -- Assigned and default roles. -- Settings with their constraints applied by default at user login. -- Assigned settings profiles. - -Privileges can be granted to a user account by the [GRANT](../sql-reference/statements/grant.md) query or by assigning [roles](#role-management). To revoke privileges from a user, ClickHouse provides the [REVOKE](../sql-reference/statements/revoke.md) query. To list privileges for a user, use the [SHOW GRANTS](../sql-reference/statements/show.md#show-grants-statement) statement. - -Management queries: - -- [CREATE USER](../sql-reference/statements/create/user.md) -- [ALTER USER](../sql-reference/statements/alter/user.md#alter-user-statement) -- [DROP USER](../sql-reference/statements/drop.md) -- [SHOW CREATE USER](../sql-reference/statements/show.md#show-create-user-statement) -- [SHOW USERS](../sql-reference/statements/show.md#show-users-statement) - -### Settings Applying {#access-control-settings-applying} - -Settings can be configured differently: for a user account, in its granted roles and in settings profiles. At user login, if a setting is configured for different access entities, the value and constraints of this setting are applied as follows (from higher to lower priority): - -1. User account settings. -2. The settings of default roles of the user account. If a setting is configured in some roles, then order of the setting application is undefined. -3. 
The settings from settings profiles assigned to a user or to its default roles. If a setting is configured in some profiles, then order of setting application is undefined. -4. Settings applied to all the server by default or from the [default profile](../operations/server-configuration-parameters/settings.md#default-profile). - -## Role {#role-management} - -Role is a container for access entities that can be granted to a user account. - -Role contains: - -- [Privileges](../sql-reference/statements/grant.md#grant-privileges) -- Settings and constraints -- List of assigned roles - -Management queries: - -- [CREATE ROLE](../sql-reference/statements/create/role.md) -- [ALTER ROLE](../sql-reference/statements/alter/role.md#alter-role-statement) -- [DROP ROLE](../sql-reference/statements/drop.md) -- [SET ROLE](../sql-reference/statements/set-role.md) -- [SET DEFAULT ROLE](../sql-reference/statements/set-role.md#set-default-role-statement) -- [SHOW CREATE ROLE](../sql-reference/statements/show.md#show-create-role-statement) -- [SHOW ROLES](../sql-reference/statements/show.md#show-roles-statement) - -Privileges can be granted to a role by the [GRANT](../sql-reference/statements/grant.md) query. To revoke privileges from a role ClickHouse provides the [REVOKE](../sql-reference/statements/revoke.md) query. - -## Row Policy {#row-policy-management} - -Row policy is a filter that defines which of the rows are available to a user or a role. Row policy contains filters for one particular table, as well as a list of roles and/or users which should use this row policy. - -:::warning -Row policies makes sense only for users with readonly access. If user can modify table or copy partitions between tables, it defeats the restrictions of row policies. -::: - -Management queries: - -- [CREATE ROW POLICY](../sql-reference/statements/create/row-policy.md) -- [ALTER ROW POLICY](../sql-reference/statements/alter/row-policy.md#alter-row-policy-statement) -- [DROP ROW POLICY](../sql-reference/statements/drop.md#drop-row-policy-statement) -- [SHOW CREATE ROW POLICY](../sql-reference/statements/show.md#show-create-row-policy-statement) -- [SHOW POLICIES](../sql-reference/statements/show.md#show-policies-statement) - -## Settings Profile {#settings-profiles-management} - -Settings profile is a collection of [settings](../operations/settings/index.md). Settings profile contains settings and constraints, as well as a list of roles and/or users to which this profile is applied. - -Management queries: - -- [CREATE SETTINGS PROFILE](../sql-reference/statements/create/settings-profile.md#create-settings-profile-statement) -- [ALTER SETTINGS PROFILE](../sql-reference/statements/alter/settings-profile.md#alter-settings-profile-statement) -- [DROP SETTINGS PROFILE](../sql-reference/statements/drop.md#drop-settings-profile-statement) -- [SHOW CREATE SETTINGS PROFILE](../sql-reference/statements/show.md#show-create-settings-profile-statement) -- [SHOW PROFILES](../sql-reference/statements/show.md#show-profiles-statement) - -## Quota {#quotas-management} - -Quota limits resource usage. See [Quotas](../operations/quotas.md). - -Quota contains a set of limits for some durations, as well as a list of roles and/or users which should use this quota. 
- -Management queries: - -- [CREATE QUOTA](../sql-reference/statements/create/quota.md) -- [ALTER QUOTA](../sql-reference/statements/alter/quota.md#alter-quota-statement) -- [DROP QUOTA](../sql-reference/statements/drop.md#drop-quota-statement) -- [SHOW CREATE QUOTA](../sql-reference/statements/show.md#show-create-quota-statement) -- [SHOW QUOTA](../sql-reference/statements/show.md#show-quota-statement) -- [SHOW QUOTAS](../sql-reference/statements/show.md#show-quotas-statement) - -## Enabling SQL-driven Access Control and Account Management {#enabling-access-control} - -- Setup a directory for configurations storage. - - ClickHouse stores access entity configurations in the folder set in the [access_control_path](../operations/server-configuration-parameters/settings.md#access_control_path) server configuration parameter. - -- Enable SQL-driven access control and account management for at least one user account. - - By default, SQL-driven access control and account management is disabled for all users. You need to configure at least one user in the `users.xml` configuration file and set the value of the [access_management](../operations/settings/settings-users.md#access_management-user-setting) setting to 1. diff --git a/docs/en/operations/backup.md b/docs/en/operations/backup.md index f1a5649cd4c..d58dd1376eb 100644 --- a/docs/en/operations/backup.md +++ b/docs/en/operations/backup.md @@ -1,5 +1,6 @@ --- slug: /en/operations/backup +description: In order to effectively mitigate possible human errors, you should carefully prepare a strategy for backing up and restoring your data. --- # Backup and Restore @@ -213,7 +214,7 @@ To write backups to an S3 bucket you need three pieces of information: for example `Abc+123` :::note -Creating an S3 bucket is covered in [Use S3 Object Storage as a ClickHouse disk](/docs/en/integrations/data-ingestion/s3/configuring-s3-for-clickhouse-use.md), just come back to this doc after saving the policy, there is no need to configure ClickHouse to use the S3 bucket. +Creating an S3 bucket is covered in [Use S3 Object Storage as a ClickHouse disk](/docs/en/integrations/data-ingestion/s3/index.md#configuring-s3-for-clickhouse-use), just come back to this doc after saving the policy, there is no need to configure ClickHouse to use the S3 bucket. ::: The destination for a backup will be specified like this: @@ -330,7 +331,7 @@ It is also possible to `BACKUP`/`RESTORE` to S3 by configuring an S3 disk in the
- s3 + s3_plain
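To make the S3 backup destination described above concrete, here is a hedged sketch of a `BACKUP` statement; the table name, bucket URL, and access key are placeholders rather than values taken from this patch (only the `Abc+123` example secret appears in the surrounding text).

```sql
-- Sketch only: back up a single table to an S3 endpoint (placeholder bucket and credentials).
BACKUP TABLE data TO S3(
    'https://my-bucket.s3.amazonaws.com/backups/my_backup',
    'ABC123',
    'Abc+123'
);
```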
diff --git a/docs/en/operations/caches.md b/docs/en/operations/caches.md index 0f9156048c4..86bf8065d94 100644 --- a/docs/en/operations/caches.md +++ b/docs/en/operations/caches.md @@ -3,6 +3,7 @@ slug: /en/operations/caches sidebar_position: 65 sidebar_label: Caches title: "Cache Types" +description: When performing queries, ClickHouse uses different caches. --- When performing queries, ClickHouse uses different caches. diff --git a/docs/en/operations/clickhouse-keeper.md b/docs/en/operations/clickhouse-keeper.md deleted file mode 100644 index 10bad586a54..00000000000 --- a/docs/en/operations/clickhouse-keeper.md +++ /dev/null @@ -1,378 +0,0 @@ ---- -slug: /en/operations/clickhouse-keeper -sidebar_position: 66 -sidebar_label: ClickHouse Keeper ---- - -# ClickHouse Keeper -import SelfManaged from '@site/docs/en/_snippets/_self_managed_only_automated.md'; - - - -ClickHouse Keeper provides the coordination system for data [replication](../engines/table-engines/mergetree-family/replication.md) and [distributed DDL](../sql-reference/distributed-ddl.md) queries execution. ClickHouse Keeper is compatible with ZooKeeper. - -## Implementation details {#implementation-details} - -ZooKeeper is one of the first well-known open-source coordination systems. It's implemented in Java, and has quite a simple and powerful data model. ZooKeeper's coordination algorithm, ZooKeeper Atomic Broadcast (ZAB), doesn't provide linearizability guarantees for reads, because each ZooKeeper node serves reads locally. Unlike ZooKeeper ClickHouse Keeper is written in C++ and uses the [RAFT algorithm](https://raft.github.io/) [implementation](https://github.com/eBay/NuRaft). This algorithm allows linearizability for reads and writes, and has several open-source implementations in different languages. - -By default, ClickHouse Keeper provides the same guarantees as ZooKeeper (linearizable writes, non-linearizable reads). It has a compatible client-server protocol, so any standard ZooKeeper client can be used to interact with ClickHouse Keeper. Snapshots and logs have an incompatible format with ZooKeeper, but the `clickhouse-keeper-converter` tool enables the conversion of ZooKeeper data to ClickHouse Keeper snapshots. The interserver protocol in ClickHouse Keeper is also incompatible with ZooKeeper so a mixed ZooKeeper / ClickHouse Keeper cluster is impossible. - -ClickHouse Keeper supports Access Control Lists (ACLs) the same way as [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) does. ClickHouse Keeper supports the same set of permissions and has the identical built-in schemes: `world`, `auth` and `digest`. The digest authentication scheme uses the pair `username:password`, the password is encoded in Base64. - -:::note -External integrations are not supported. -::: - -## Configuration {#configuration} - -ClickHouse Keeper can be used as a standalone replacement for ZooKeeper or as an internal part of the ClickHouse server. In both cases the configuration is almost the same `.xml` file. The main ClickHouse Keeper configuration tag is ``. Keeper configuration has the following parameters: - -- `tcp_port` — Port for a client to connect (default for ZooKeeper is `2181`). -- `tcp_port_secure` — Secure port for an SSL connection between client and keeper-server. -- `server_id` — Unique server id, each participant of the ClickHouse Keeper cluster must have a unique number (1, 2, 3, and so on). 
-- `log_storage_path` — Path to coordination logs, just like ZooKeeper it is best to store logs on non-busy nodes. -- `snapshot_storage_path` — Path to coordination snapshots. - -Other common parameters are inherited from the ClickHouse server config (`listen_host`, `logger`, and so on). - -Internal coordination settings are located in the `.` section: - -- `operation_timeout_ms` — Timeout for a single client operation (ms) (default: 10000). -- `min_session_timeout_ms` — Min timeout for client session (ms) (default: 10000). -- `session_timeout_ms` — Max timeout for client session (ms) (default: 100000). -- `dead_session_check_period_ms` — How often ClickHouse Keeper checks for dead sessions and removes them (ms) (default: 500). -- `heart_beat_interval_ms` — How often a ClickHouse Keeper leader will send heartbeats to followers (ms) (default: 500). -- `election_timeout_lower_bound_ms` — If the follower does not receive a heartbeat from the leader in this interval, then it can initiate leader election (default: 1000). Must be less than or equal to `election_timeout_upper_bound_ms`. Ideally they shouldn't be equal. -- `election_timeout_upper_bound_ms` — If the follower does not receive a heartbeat from the leader in this interval, then it must initiate leader election (default: 2000). -- `rotate_log_storage_interval` — How many log records to store in a single file (default: 100000). -- `reserved_log_items` — How many coordination log records to store before compaction (default: 100000). -- `snapshot_distance` — How often ClickHouse Keeper will create new snapshots (in the number of records in logs) (default: 100000). -- `snapshots_to_keep` — How many snapshots to keep (default: 3). -- `stale_log_gap` — Threshold when leader considers follower as stale and sends the snapshot to it instead of logs (default: 10000). -- `fresh_log_gap` — When node became fresh (default: 200). -- `max_requests_batch_size` - Max size of batch in requests count before it will be sent to RAFT (default: 100). -- `force_sync` — Call `fsync` on each write to coordination log (default: true). -- `quorum_reads` — Execute read requests as writes through whole RAFT consensus with similar speed (default: false). -- `raft_logs_level` — Text logging level about coordination (trace, debug, and so on) (default: system default). -- `auto_forwarding` — Allow to forward write requests from followers to the leader (default: true). -- `shutdown_timeout` — Wait to finish internal connections and shutdown (ms) (default: 5000). -- `startup_timeout` — If the server doesn't connect to other quorum participants in the specified timeout it will terminate (ms) (default: 30000). -- `four_letter_word_white_list` — White list of 4lw commands (default: `conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr,apiv,csnp,lgif,rqld`). - -Quorum configuration is located in the `.` section and contain servers description. - -The only parameter for the whole quorum is `secure`, which enables encrypted connection for communication between quorum participants. The parameter can be set `true` if SSL connection is required for internal communication between nodes, or left unspecified otherwise. - -The main parameters for each `` are: - -- `id` — Server identifier in a quorum. -- `hostname` — Hostname where this server is placed. -- `port` — Port where this server listens for connections. 
- -:::note -In the case of a change in the topology of your ClickHouse Keeper cluster (e.g., replacing a server), please make sure to keep the mapping of `server_id` to `hostname` consistent and avoid shuffling or reusing an existing `server_id` for different servers (e.g., it can happen if your rely on automation scripts to deploy ClickHouse Keeper) -::: - -Examples of configuration for quorum with three nodes can be found in [integration tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/integration) with `test_keeper_` prefix. Example configuration for server #1: - -```xml - - 2181 - 1 - /var/lib/clickhouse/coordination/log - /var/lib/clickhouse/coordination/snapshots - - - 10000 - 30000 - trace - - - - - 1 - zoo1 - 9444 - - - 2 - zoo2 - 9444 - - - 3 - zoo3 - 9444 - - - -``` - -## How to run {#how-to-run} - -ClickHouse Keeper is bundled into the ClickHouse server package, just add configuration of `` and start ClickHouse server as always. If you want to run standalone ClickHouse Keeper you can start it in a similar way with: - -```bash -clickhouse-keeper --config /etc/your_path_to_config/config.xml -``` - -If you don't have the symlink (`clickhouse-keeper`) you can create it or specify `keeper` as an argument to `clickhouse`: - -```bash -clickhouse keeper --config /etc/your_path_to_config/config.xml -``` - -## Four Letter Word Commands {#four-letter-word-commands} - -ClickHouse Keeper also provides 4lw commands which are almost the same with Zookeeper. Each command is composed of four letters such as `mntr`, `stat` etc. There are some more interesting commands: `stat` gives some general information about the server and connected clients, while `srvr` and `cons` give extended details on server and connections respectively. - -The 4lw commands has a white list configuration `four_letter_word_white_list` which has default value `conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr,apiv,csnp,lgif,rqld`. - -You can issue the commands to ClickHouse Keeper via telnet or nc, at the client port. - -``` -echo mntr | nc localhost 9181 -``` - -Bellow is the detailed 4lw commands: - -- `ruok`: Tests if server is running in a non-error state. The server will respond with `imok` if it is running. Otherwise it will not respond at all. A response of `imok` does not necessarily indicate that the server has joined the quorum, just that the server process is active and bound to the specified client port. Use "stat" for details on state wrt quorum and client connection information. - -``` -imok -``` - -- `mntr`: Outputs a list of variables that could be used for monitoring the health of the cluster. - -``` -zk_version v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 -zk_avg_latency 0 -zk_max_latency 0 -zk_min_latency 0 -zk_packets_received 68 -zk_packets_sent 68 -zk_num_alive_connections 1 -zk_outstanding_requests 0 -zk_server_state leader -zk_znode_count 4 -zk_watch_count 1 -zk_ephemerals_count 0 -zk_approximate_data_size 723 -zk_open_file_descriptor_count 310 -zk_max_file_descriptor_count 10240 -zk_followers 0 -zk_synced_followers 0 -``` - -- `srvr`: Lists full details for the server. - -``` -ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 -Latency min/avg/max: 0/0/0 -Received: 2 -Sent : 2 -Connections: 1 -Outstanding: 0 -Zxid: 34 -Mode: leader -Node count: 4 -``` - -- `stat`: Lists brief details for the server and connected clients. 
- -``` -ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 -Clients: - 192.168.1.1:52852(recved=0,sent=0) - 192.168.1.1:52042(recved=24,sent=48) -Latency min/avg/max: 0/0/0 -Received: 4 -Sent : 4 -Connections: 1 -Outstanding: 0 -Zxid: 36 -Mode: leader -Node count: 4 -``` - -- `srst`: Reset server statistics. The command will affect the result of `srvr`, `mntr` and `stat`. - -``` -Server stats reset. -``` - -- `conf`: Print details about serving configuration. - -``` -server_id=1 -tcp_port=2181 -four_letter_word_white_list=* -log_storage_path=./coordination/logs -snapshot_storage_path=./coordination/snapshots -max_requests_batch_size=100 -session_timeout_ms=30000 -operation_timeout_ms=10000 -dead_session_check_period_ms=500 -heart_beat_interval_ms=500 -election_timeout_lower_bound_ms=1000 -election_timeout_upper_bound_ms=2000 -reserved_log_items=1000000000000000 -snapshot_distance=10000 -auto_forwarding=true -shutdown_timeout=5000 -startup_timeout=240000 -raft_logs_level=information -snapshots_to_keep=3 -rotate_log_storage_interval=100000 -stale_log_gap=10000 -fresh_log_gap=200 -max_requests_batch_size=100 -quorum_reads=false -force_sync=false -compress_logs=true -compress_snapshots_with_zstd_format=true -configuration_change_tries_count=20 -``` - -- `cons`: List full connection/session details for all clients connected to this server. Includes information on numbers of packets received/sent, session id, operation latencies, last operation performed, etc... - -``` - 192.168.1.1:52163(recved=0,sent=0,sid=0xffffffffffffffff,lop=NA,est=1636454787393,to=30000,lzxid=0xffffffffffffffff,lresp=0,llat=0,minlat=0,avglat=0,maxlat=0) - 192.168.1.1:52042(recved=9,sent=18,sid=0x0000000000000001,lop=List,est=1636454739887,to=30000,lcxid=0x0000000000000005,lzxid=0x0000000000000005,lresp=1636454739892,llat=0,minlat=0,avglat=0,maxlat=0) -``` - -- `crst`: Reset connection/session statistics for all connections. - -``` -Connection stats reset. -``` - -- `envi`: Print details about serving environment - -``` -Environment: -clickhouse.keeper.version=v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 -host.name=ZBMAC-C02D4054M.local -os.name=Darwin -os.arch=x86_64 -os.version=19.6.0 -cpu.count=12 -user.name=root -user.home=/Users/JackyWoo/ -user.dir=/Users/JackyWoo/project/jd/clickhouse/cmake-build-debug/programs/ -user.tmp=/var/folders/b4/smbq5mfj7578f2jzwn602tt40000gn/T/ -``` - - -- `dirs`: Shows the total size of snapshot and log files in bytes - -``` -snapshot_dir_size: 0 -log_dir_size: 3875 -``` - -- `isro`: Tests if server is running in read-only mode. The server will respond with "ro" if in read-only mode or "rw" if not in read-only mode. - -``` -rw -``` - -- `wchs`: Lists brief information on watches for the server. - -``` -1 connections watching 1 paths -Total watches:1 -``` - -- `wchc`: Lists detailed information on watches for the server, by session. This outputs a list of sessions (connections) with associated watches (paths). Note, depending on the number of watches this operation may be expensive (ie impact server performance), use it carefully. - -``` -0x0000000000000001 - /clickhouse/task_queue/ddl -``` - -- `wchp`: Lists detailed information on watches for the server, by path. This outputs a list of paths (znodes) with associated sessions. Note, depending on the number of watches this operation may be expensive (i. e. impact server performance), use it carefully. 
- -``` -/clickhouse/task_queue/ddl - 0x0000000000000001 -``` - -- `dump`: Lists the outstanding sessions and ephemeral nodes. This only works on the leader. - -``` -Sessions dump (2): -0x0000000000000001 -0x0000000000000002 -Sessions with Ephemerals (1): -0x0000000000000001 - /clickhouse/task_queue/ddl -``` - -- `csnp`: Schedule a snapshot creation task. Return the last committed log index of the scheduled snapshot if success or `Failed to schedule snapshot creation task.` if failed. Note that `lgif` command can help you determine whether the snapshot is done. - -``` -100 -``` - -- `lgif`: Keeper log information. `first_log_idx` : my first log index in log store; `first_log_term` : my first log term; `last_log_idx` : my last log index in log store; `last_log_term` : my last log term; `last_committed_log_idx` : my last committed log index in state machine; `leader_committed_log_idx` : leader's committed log index from my perspective; `target_committed_log_idx` : target log index should be committed to; `last_snapshot_idx` : the largest committed log index in last snapshot. - -``` -first_log_idx 1 -first_log_term 1 -last_log_idx 101 -last_log_term 1 -last_committed_log_idx 100 -leader_committed_log_idx 101 -target_committed_log_idx 101 -last_snapshot_idx 50 -``` - -- `rqld`: Request to become new leader. Return `Sent leadership request to leader.` if request sent or `Failed to send leadership request to leader.` if request not sent. Note that if node is already leader the outcome is same as the request is sent. - -``` -Sent leadership request to leader. -``` - -## Migration from ZooKeeper {#migration-from-zookeeper} - -Seamlessly migration from ZooKeeper to ClickHouse Keeper is impossible you have to stop your ZooKeeper cluster, convert data and start ClickHouse Keeper. `clickhouse-keeper-converter` tool allows converting ZooKeeper logs and snapshots to ClickHouse Keeper snapshot. It works only with ZooKeeper > 3.4. Steps for migration: - -1. Stop all ZooKeeper nodes. - -2. Optional, but recommended: find ZooKeeper leader node, start and stop it again. It will force ZooKeeper to create a consistent snapshot. - -3. Run `clickhouse-keeper-converter` on a leader, for example: - -```bash -clickhouse-keeper-converter --zookeeper-logs-dir /var/lib/zookeeper/version-2 --zookeeper-snapshots-dir /var/lib/zookeeper/version-2 --output-dir /path/to/clickhouse/keeper/snapshots -``` - -4. Copy snapshot to ClickHouse server nodes with a configured `keeper` or start ClickHouse Keeper instead of ZooKeeper. The snapshot must persist on all nodes, otherwise, empty nodes can be faster and one of them can become a leader. - - - -## Recovering after losing quorum - -Because ClickHouse Keeper uses Raft it can tolerate certain amount of node crashes depending on the cluster size. \ -E.g. for a 3-node cluster, it will continue working correctly if only 1 node crashes. - -Cluster configuration can be dynamically configured but there are some limitations. Reconfiguration relies on Raft also -so to add/remove a node from the cluster you need to have a quorum. If you lose too many nodes in your cluster at the same time without any chance -of starting them again, Raft will stop working and not allow you to reconfigure your cluster using the conventional way. - -Nevertheless, ClickHouse Keeper has a recovery mode which allows you to forcefully reconfigure your cluster with only 1 node. -This should be done only as your last resort if you cannot start your nodes again, or start a new instance on the same endpoint. 
- -Important things to note before continuing: -- Make sure that the failed nodes cannot connect to the cluster again. -- Do not start any of the new nodes until it's specified in the steps. - -After making sure that the above things are true, you need to do following: -1. Pick a single Keeper node to be your new leader. Be aware that the data of that node will be used for the entire cluster so we recommend to use a node with the most up to date state. -2. Before doing anything else, make a backup of the `log_storage_path` and `snapshot_storage_path` folders of the picked node. -3. Reconfigure the cluster on all of the nodes you want to use. -4. Send the four letter command `rcvr` to the node you picked which will move the node to the recovery mode OR stop Keeper instance on the picked node and start it again with the `--force-recovery` argument. -5. One by one, start Keeper instances on the new nodes making sure that `mntr` returns `follower` for the `zk_server_state` before starting the next one. -6. While in the recovery mode, the leader node will return error message for `mntr` command until it achieves quorum with the new nodes and refuse any requests from the client and the followers. -7. After quorum is achieved, the leader node will return to the normal mode of operation, accepting all the requests using Raft - verify with `mntr` which should return `leader` for the `zk_server_state`. diff --git a/docs/en/operations/external-authenticators/kerberos.md b/docs/en/operations/external-authenticators/kerberos.md index 95944e96194..b7a11d7445b 100644 --- a/docs/en/operations/external-authenticators/kerberos.md +++ b/docs/en/operations/external-authenticators/kerberos.md @@ -113,7 +113,7 @@ Note, that now, once user `my_user` uses `kerberos`, Kerberos must be enabled in ### Enabling Kerberos using SQL {#enabling-kerberos-using-sql} -When [SQL-driven Access Control and Account Management](../access-rights.md#access-control) is enabled in ClickHouse, users identified by Kerberos can also be created using SQL statements. +When [SQL-driven Access Control and Account Management](/docs/en/guides/sre/user-management/index.md#access-control) is enabled in ClickHouse, users identified by Kerberos can also be created using SQL statements. ```sql CREATE USER my_user IDENTIFIED WITH kerberos REALM 'EXAMPLE.COM' diff --git a/docs/en/operations/external-authenticators/ldap.md b/docs/en/operations/external-authenticators/ldap.md index eba560f6ea5..ee2336e9378 100644 --- a/docs/en/operations/external-authenticators/ldap.md +++ b/docs/en/operations/external-authenticators/ldap.md @@ -112,7 +112,7 @@ At each login attempt, ClickHouse tries to "bind" to the specified DN defined by Note, that user `my_user` refers to `my_ldap_server`. This LDAP server must be configured in the main `config.xml` file as described previously. -When SQL-driven [Access Control and Account Management](../access-rights.md#access-control) is enabled, users that are authenticated by LDAP servers can also be created using the [CREATE USER](../../sql-reference/statements/create/user.md#create-user-statement) statement. +When SQL-driven [Access Control and Account Management](/docs/en/guides/sre/user-management/index.md#access-control) is enabled, users that are authenticated by LDAP servers can also be created using the [CREATE USER](/docs/en/sql-reference/statements/create/user.md#create-user-statement) statement. 
Query: @@ -120,11 +120,11 @@ Query: CREATE USER my_user IDENTIFIED WITH ldap SERVER 'my_ldap_server'; ``` -## LDAP Exernal User Directory {#ldap-external-user-directory} +## LDAP External User Directory {#ldap-external-user-directory} In addition to the locally defined users, a remote LDAP server can be used as a source of user definitions. To achieve this, specify previously defined LDAP server name (see [LDAP Server Definition](#ldap-server-definition)) in the `ldap` section inside the `users_directories` section of the `config.xml` file. -At each login attempt, ClickHouse tries to find the user definition locally and authenticate it as usual. If the user is not defined, ClickHouse will assume the definition exists in the external LDAP directory and will try to "bind" to the specified DN at the LDAP server using the provided credentials. If successful, the user will be considered existing and authenticated. The user will be assigned roles from the list specified in the `roles` section. Additionally, LDAP "search" can be performed and results can be transformed and treated as role names and then be assigned to the user if the `role_mapping` section is also configured. All this implies that the SQL-driven [Access Control and Account Management](../access-rights.md#access-control) is enabled and roles are created using the [CREATE ROLE](../../sql-reference/statements/create/role.md#create-role-statement) statement. +At each login attempt, ClickHouse tries to find the user definition locally and authenticate it as usual. If the user is not defined, ClickHouse will assume the definition exists in the external LDAP directory and will try to "bind" to the specified DN at the LDAP server using the provided credentials. If successful, the user will be considered existing and authenticated. The user will be assigned roles from the list specified in the `roles` section. Additionally, LDAP "search" can be performed and results can be transformed and treated as role names and then be assigned to the user if the `role_mapping` section is also configured. All this implies that the SQL-driven [Access Control and Account Management](/docs/en/guides/sre/user-management/index.md#access-control) is enabled and roles are created using the [CREATE ROLE](/docs/en/sql-reference/statements/create/role.md#create-role-statement) statement. **Example** @@ -173,7 +173,7 @@ Note that `my_ldap_server` referred in the `ldap` section inside the `user_direc - `roles` — Section with a list of locally defined roles that will be assigned to each user retrieved from the LDAP server. - If no roles are specified here or assigned during role mapping (below), user will not be able to perform any actions after authentication. - `role_mapping` — Section with LDAP search parameters and mapping rules. - - When a user authenticates, while still bound to LDAP, an LDAP search is performed using `search_filter` and the name of the logged-in user. For each entry found during that search, the value of the specified attribute is extracted. For each attribute value that has the specified prefix, the prefix is removed, and the rest of the value becomes the name of a local role defined in ClickHouse, which is expected to be created beforehand by the [CREATE ROLE](../../sql-reference/statements/create/role.md#create-role-statement) statement. + - When a user authenticates, while still bound to LDAP, an LDAP search is performed using `search_filter` and the name of the logged-in user. 
For each entry found during that search, the value of the specified attribute is extracted. For each attribute value that has the specified prefix, the prefix is removed, and the rest of the value becomes the name of a local role defined in ClickHouse, which is expected to be created beforehand by the [CREATE ROLE](/docs/en/sql-reference/statements/create/role.md#create-role-statement) statement. - There can be multiple `role_mapping` sections defined inside the same `ldap` section. All of them will be applied. - `base_dn` — Template used to construct the base DN for the LDAP search. - The resulting DN will be constructed by replacing all `{user_name}`, `{bind_dn}`, and `{user_dn}` substrings of the template with the actual user name, bind DN, and user DN during each LDAP search. diff --git a/docs/en/operations/monitoring.md b/docs/en/operations/monitoring.md index 2b3c4bdbbdf..04c5840d514 100644 --- a/docs/en/operations/monitoring.md +++ b/docs/en/operations/monitoring.md @@ -2,6 +2,7 @@ slug: /en/operations/monitoring sidebar_position: 45 sidebar_label: Monitoring +description: You can monitor the utilization of hardware resources and also ClickHouse server metrics. --- # Monitoring diff --git a/docs/en/operations/optimizing-performance/sampling-query-profiler.md b/docs/en/operations/optimizing-performance/sampling-query-profiler.md index 72eb655101f..ab42eec4190 100644 --- a/docs/en/operations/optimizing-performance/sampling-query-profiler.md +++ b/docs/en/operations/optimizing-performance/sampling-query-profiler.md @@ -7,11 +7,23 @@ import SelfManaged from '@site/docs/en/_snippets/_self_managed_only_no_roadmap.m # Sampling Query Profiler - - ClickHouse runs sampling profiler that allows analyzing query execution. Using profiler you can find source code routines that used the most frequently during query execution. You can trace CPU time and wall-clock time spent including idle time. -To use profiler: +Query profiler is automatically enabled in ClickHouse Cloud and you can run a sample query as follows + +``` sql +SELECT + count(), + arrayStringConcat(arrayMap(x -> concat(demangle(addressToSymbol(x)), '\n ', addressToLine(x)), trace), '\n') AS sym +FROM system.trace_log +WHERE (query_id = 'ebca3574-ad0a-400a-9cbc-dca382f5998c') AND (event_date = today()) +GROUP BY trace +ORDER BY count() DESC +LIMIT 10 +SETTINGS allow_introspection_functions = 1 +``` + +In self-managed deployments, to use query profiler: - Setup the [trace_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-trace_log) section of the server configuration. diff --git a/docs/en/operations/query-cache.md b/docs/en/operations/query-cache.md index 1a486de7904..a5afee63e6e 100644 --- a/docs/en/operations/query-cache.md +++ b/docs/en/operations/query-cache.md @@ -85,8 +85,8 @@ make the matching more natural, all query-level settings related to the query ca If the query was aborted due to an exception or user cancellation, no entry is written into the query cache. -The size of the query cache, the maximum number of cache entries and the maximum size of cache entries (in bytes and in records) can -be configured using different [server configuration options](server-configuration-parameters/settings.md#server_configuration_parameters_query-cache). 
+The size of the query cache in bytes, the maximum number of cache entries and the maximum size of individual cache entries (in bytes and in +records) can be configured using different [server configuration options](server-configuration-parameters/settings.md#server_configuration_parameters_query-cache). To define how long a query must run at least such that its result can be cached, you can use setting [query_cache_min_query_duration](settings/settings.md#query-cache-min-query-duration). For example, the result of query diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 0424c3520e0..bd0fb03bad0 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -2,6 +2,7 @@ slug: /en/operations/server-configuration-parameters/settings sidebar_position: 57 sidebar_label: Server Settings +description: This section contains descriptions of server settings that cannot be changed at the session or query level. --- # Server Settings @@ -275,7 +276,7 @@ Path: - Specify the absolute path or the path relative to the server config file. - The path can contain wildcards \* and ?. -See also “[Dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md)â€. +See also “[Dictionaries](../../sql-reference/dictionaries/index.md)â€. **Example** @@ -1025,7 +1026,7 @@ If the number of **idle** threads in the Backups IO Thread pool exceeds `max_bac Possible values: - Positive integer. -- Zero. +- Zero. Default value: `0`. @@ -1360,7 +1361,7 @@ If the table does not exist, ClickHouse will create it. If the structure of the The following settings are available: -- `size`: The maximum cache size in bytes. 0 means the query cache is disabled. Default value: `1073741824` (1 GiB). +- `max_size`: The maximum cache size in bytes. 0 means the query cache is disabled. Default value: `1073741824` (1 GiB). - `max_entries`: The maximum number of `SELECT` query results stored in the cache. Default value: `1024`. - `max_entry_size`: The maximum size in bytes `SELECT` query results may have to be saved in the cache. Default value: `1048576` (1 MiB). - `max_entry_rows`: The maximum number of rows `SELECT` query results may have to be saved in the cache. Default value: `30000000` (30 mil). @@ -1368,7 +1369,7 @@ The following settings are available: Changed settings take effect immediately. :::warning -Data for the query cache is allocated in DRAM. If memory is scarce, make sure to set a small value for `size` or disable the query cache altogether. +Data for the query cache is allocated in DRAM. If memory is scarce, make sure to set a small value for `max_size` or disable the query cache altogether. ::: **Example** @@ -1881,6 +1882,16 @@ The update is performed asynchronously, in a separate system thread. Manage executing [distributed ddl queries](../../sql-reference/distributed-ddl.md) (CREATE, DROP, ALTER, RENAME) on cluster. Works only if [ZooKeeper](#server-settings_zookeeper) is enabled. +The configurable settings within `` include: + +- **path**: the path in Keeper for the `task_queue` for DDL queries +- **profile**: the profile used to execute the DDL queries +- **pool_size**: how many `ON CLUSTER` queries can be run simultaneously +- **max_tasks_in_queue**: the maximum number of tasks that can be in the queue. Default is 1,000 +- **task_max_lifetime**: delete node if its age is greater than this value. 
Default is `7 * 24 * 60 * 60` (a week in seconds) +- **cleanup_delay_period**: cleaning starts after new node event is received if the last cleaning wasn't made sooner than `cleanup_delay_period` seconds ago. Default is 60 seconds + + **Example** ```xml @@ -1917,7 +1928,7 @@ Default value: `/var/lib/clickhouse/access/`. **See also** -- [Access Control and Account Management](../../operations/access-rights.md#access-control) +- [Access Control and Account Management](../../guides/sre/user-management/index.md#access-control) ## user_directories {#user_directories} diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index 172627c7c3e..aa991cd9f15 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -964,7 +964,7 @@ Default value: 1. ### input_format_arrow_import_nested {#input_format_arrow_import_nested} -Enables or disables the ability to insert the data into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs in [Arrow](../../interfaces/formats.md/#data_types-matching-arrow) input format. +Enables or disables the ability to insert the data into [Nested](../../sql-reference/data-types/nested-data-structures/index.md) columns as an array of structs in [Arrow](../../interfaces/formats.md/#data_types-matching-arrow) input format. Possible values: @@ -1024,7 +1024,7 @@ Default value: `none`. ### input_format_orc_import_nested {#input_format_orc_import_nested} -Enables or disables the ability to insert the data into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs in [ORC](../../interfaces/formats.md/#data-format-orc) input format. +Enables or disables the ability to insert the data into [Nested](../../sql-reference/data-types/nested-data-structures/index.md) columns as an array of structs in [ORC](../../interfaces/formats.md/#data-format-orc) input format. Possible values: @@ -1073,7 +1073,7 @@ Default value: `none`. ### input_format_parquet_import_nested {#input_format_parquet_import_nested} -Enables or disables the ability to insert the data into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs in [Parquet](../../interfaces/formats.md/#data-format-parquet) input format. +Enables or disables the ability to insert the data into [Nested](../../sql-reference/data-types/nested-data-structures/index.md) columns as an array of structs in [Parquet](../../interfaces/formats.md/#data-format-parquet) input format. Possible values: @@ -1538,6 +1538,6 @@ Default value: `1GiB`. ### input_format_native_allow_types_conversion {#input_format_native_allow_types_conversion} -Allow types conversion in Native input format between columns from input data and requested columns. +Allow types conversion in Native input format between columns from input data and requested columns. Enabled by default. diff --git a/docs/en/operations/settings/settings-profiles.md b/docs/en/operations/settings/settings-profiles.md index 4527152583f..2f39a75453c 100644 --- a/docs/en/operations/settings/settings-profiles.md +++ b/docs/en/operations/settings/settings-profiles.md @@ -9,7 +9,7 @@ sidebar_label: Settings Profiles A settings profile is a collection of settings grouped under the same name. :::note -ClickHouse also supports [SQL-driven workflow](../../operations/access-rights.md#access-control) for managing settings profiles. 
We recommend using it. +ClickHouse also supports [SQL-driven workflow](../../guides/sre/user-management/index.md#access-control) for managing settings profiles. We recommend using it. ::: The profile can have any name. You can specify the same profile for different users. The most important thing you can write in the settings profile is `readonly=1`, which ensures read-only access. diff --git a/docs/en/operations/settings/settings-users.md b/docs/en/operations/settings/settings-users.md index b55d64fc4f7..9b27af61851 100644 --- a/docs/en/operations/settings/settings-users.md +++ b/docs/en/operations/settings/settings-users.md @@ -9,7 +9,7 @@ sidebar_label: User Settings The `users` section of the `user.xml` configuration file contains user settings. :::note -ClickHouse also supports [SQL-driven workflow](../../operations/access-rights.md#access-control) for managing users. We recommend using it. +ClickHouse also supports [SQL-driven workflow](../../guides/sre/user-management/index.md#access-control) for managing users. We recommend using it. ::: Structure of the `users` section: @@ -77,7 +77,7 @@ Password can be specified in plaintext or in SHA256 (hex format). ### access_management {#access_management-user-setting} -This setting enables or disables using of SQL-driven [access control and account management](../../operations/access-rights.md#access-control) for the user. +This setting enables or disables using of SQL-driven [access control and account management](../../guides/sre/user-management/index.md#access-control) for the user. Possible values: diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 3c53f4fd0cf..f3c0f20f3a6 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -2999,7 +2999,7 @@ It can be useful when merges are CPU bounded not IO bounded (performing heavy da ## max_final_threads {#max-final-threads} -Sets the maximum number of parallel threads for the `SELECT` query data read phase with the [FINAL](../../sql-reference/statements/select/from.md/#select-from-final) modifier. +Sets the maximum number of parallel threads for the `SELECT` query data read phase with the [FINAL](../../sql-reference/statements/select/from.md#select-from-final) modifier. Possible values: @@ -3094,9 +3094,9 @@ Possible values: Default value: `0`. -## s3_truncate_on_insert +## s3_truncate_on_insert -Enables or disables truncate before inserts in s3 engine tables. If disabled, an exception will be thrown on insert attempts if an S3 object already exists. +Enables or disables truncate before inserts in s3 engine tables. If disabled, an exception will be thrown on insert attempts if an S3 object already exists. Possible values: - 0 — `INSERT` query appends new data to the end of the file. @@ -3104,9 +3104,9 @@ Possible values: Default value: `0`. -## hdfs_truncate_on_insert +## hdfs_truncate_on_insert -Enables or disables truncation before an insert in hdfs engine tables. If disabled, an exception will be thrown on an attempt to insert if a file in HDFS already exists. +Enables or disables truncation before an insert in hdfs engine tables. If disabled, an exception will be thrown on an attempt to insert if a file in HDFS already exists. Possible values: - 0 — `INSERT` query appends new data to the end of the file. @@ -3114,11 +3114,11 @@ Possible values: Default value: `0`. 
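To make the truncate-on-insert behaviour described above concrete, here is a minimal sketch; the table name, bucket URL and sample rows are placeholders for illustration and are not taken from this changeset:

```sql
-- Hypothetical S3-backed table; the URL is a placeholder for a bucket you control.
CREATE TABLE s3_example (id UInt32, value String)
    ENGINE = S3('https://my-bucket.s3.amazonaws.com/data/example.csv', 'CSV');

-- The first insert creates the object.
INSERT INTO s3_example VALUES (1, 'first');

-- With the defaults (s3_truncate_on_insert = 0, s3_create_new_file_on_insert = 0)
-- a second insert into the same object throws an exception.
-- Enabling truncation overwrites the existing object instead:
SET s3_truncate_on_insert = 1;
INSERT INTO s3_example VALUES (2, 'second');
```

The same pattern applies to `hdfs_truncate_on_insert` for tables backed by the HDFS engine.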
-## engine_file_allow_create_multiple_files +## engine_file_allow_create_multiple_files Enables or disables creating a new file on each insert in file engine tables if the format has the suffix (`JSON`, `ORC`, `Parquet`, etc.). If enabled, on each insert a new file will be created with a name following this pattern: -`data.Parquet` -> `data.1.Parquet` -> `data.2.Parquet`, etc. +`data.Parquet` -> `data.1.Parquet` -> `data.2.Parquet`, etc. Possible values: - 0 — `INSERT` query appends new data to the end of the file. @@ -3126,11 +3126,11 @@ Possible values: Default value: `0`. -## s3_create_new_file_on_insert +## s3_create_new_file_on_insert Enables or disables creating a new file on each insert in s3 engine tables. If enabled, on each insert a new S3 object will be created with the key, similar to this pattern: -initial: `data.Parquet.gz` -> `data.1.Parquet.gz` -> `data.2.Parquet.gz`, etc. +initial: `data.Parquet.gz` -> `data.1.Parquet.gz` -> `data.2.Parquet.gz`, etc. Possible values: - 0 — `INSERT` query appends new data to the end of the file. @@ -3142,7 +3142,7 @@ Default value: `0`. Enables or disables creating a new file on each insert in HDFS engine tables. If enabled, on each insert a new HDFS file will be created with the name, similar to this pattern: -initial: `data.Parquet.gz` -> `data.1.Parquet.gz` -> `data.2.Parquet.gz`, etc. +initial: `data.Parquet.gz` -> `data.1.Parquet.gz` -> `data.2.Parquet.gz`, etc. Possible values: - 0 — `INSERT` query appends new data to the end of the file. @@ -3438,7 +3438,7 @@ Default value: `throw`. ## flatten_nested {#flatten-nested} -Sets the data format of a [nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns. +Sets the data format of a [nested](../../sql-reference/data-types/nested-data-structures/index.md) columns. Possible values: @@ -3753,7 +3753,7 @@ Default value: `1`. ## optimize_move_to_prewhere_if_final {#optimize_move_to_prewhere_if_final} -Enables or disables automatic [PREWHERE](../../sql-reference/statements/select/prewhere.md) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries with [FINAL](../../sql-reference/statements/select/from.md/#select-from-final) modifier. +Enables or disables automatic [PREWHERE](../../sql-reference/statements/select/prewhere.md) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries with [FINAL](../../sql-reference/statements/select/from.md#select-from-final) modifier. Works only for [*MergeTree](../../engines/table-engines/mergetree-family/index.md) tables. @@ -3770,7 +3770,7 @@ Default value: `0`. ## optimize_using_constraints -Use [constraints](../../sql-reference/statements/create/table#constraints) for query optimization. The default is `false`. +Use [constraints](../../sql-reference/statements/create/table.md#constraints) for query optimization. The default is `false`. Possible values: @@ -3778,7 +3778,7 @@ Possible values: ## optimize_append_index -Use [constraints](../../sql-reference/statements/create/table#constraints) in order to append index condition. The default is `false`. +Use [constraints](../../sql-reference/statements/create/table.md#constraints) in order to append index condition. The default is `false`. Possible values: @@ -3786,7 +3786,7 @@ Possible values: ## optimize_substitute_columns -Use [constraints](../../sql-reference/statements/create/table#constraints) for column substitution. The default is `false`. 
+Use [constraints](../../sql-reference/statements/create/table.md#constraints) for column substitution. The default is `false`. Possible values: @@ -3984,7 +3984,7 @@ Use this setting only for backward compatibility if your use cases depend on old ## final {#final} -Automatically applies [FINAL](../../sql-reference/statements/select/from/#final-modifier) modifier to all tables in a query, to tables where [FINAL](../../sql-reference/statements/select/from/#final-modifier) is applicable, including joined tables and tables in sub-queries, and +Automatically applies [FINAL](../../sql-reference/statements/select/from.md#final-modifier) modifier to all tables in a query, to tables where [FINAL](../../sql-reference/statements/select/from.md#final-modifier) is applicable, including joined tables and tables in sub-queries, and distributed tables. Possible values: @@ -4030,7 +4030,7 @@ SELECT * FROM test; ## asterisk_include_materialized_columns {#asterisk_include_materialized_columns} -Include [MATERIALIZED](../../sql-reference/statements/create/table/#materialized) columns for wildcard query (`SELECT *`). +Include [MATERIALIZED](../../sql-reference/statements/create/table.md#materialized) columns for wildcard query (`SELECT *`). Possible values: @@ -4041,7 +4041,7 @@ Default value: `0`. ## asterisk_include_alias_columns {#asterisk_include_alias_columns} -Include [ALIAS](../../sql-reference/statements/create/table/#alias) columns for wildcard query (`SELECT *`). +Include [ALIAS](../../sql-reference/statements/create/table.md#alias) columns for wildcard query (`SELECT *`). Possible values: @@ -4049,3 +4049,32 @@ Possible values: - 1 - enabled Default value: `0`. + +## stop_reading_on_first_cancel {#stop_reading_on_first_cancel} +When set to `true`, if the user interrupts a query (for example with `Ctrl+C` on the client), the query continues execution only on the data that has already been read from the table and then returns a partial result for that part of the table. To fully stop the execution of a query without a partial result, the user should send two cancel requests. + +**Example without setting on Ctrl+C** +```sql +SELECT sum(number) FROM numbers(10000000000) + +Cancelling query. +Ok. +Query was cancelled. + +0 rows in set. Elapsed: 1.334 sec. Processed 52.65 million rows, 421.23 MB (39.48 million rows/s., 315.85 MB/s.) +``` + +**Example with setting on Ctrl+C** +```sql +SELECT sum(number) FROM numbers(10000000000) SETTINGS stop_reading_on_first_cancel=true + +┌──────sum(number)─┐ +│ 1355411451286266 │ +└──────────────────┘ + +1 row in set. Elapsed: 1.331 sec. Processed 52.13 million rows, 417.05 MB (39.17 million rows/s., 313.33 MB/s.) +``` + +Possible values: `true`, `false` + +Default value: `false` diff --git a/docs/en/operations/system-tables/dictionaries.md b/docs/en/operations/system-tables/dictionaries.md index 4b256f0de97..ca6b7faaa78 100644 --- a/docs/en/operations/system-tables/dictionaries.md +++ b/docs/en/operations/system-tables/dictionaries.md @@ -3,12 +3,12 @@ slug: /en/operations/system-tables/dictionaries --- # dictionaries -Contains information about [dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). +Contains information about [dictionaries](../../sql-reference/dictionaries/index.md). Columns: - `database` ([String](../../sql-reference/data-types/string.md)) — Name of the database containing the dictionary created by DDL query. Empty string for other dictionaries.
-- `name` ([String](../../sql-reference/data-types/string.md)) — [Dictionary name](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md). +- `name` ([String](../../sql-reference/data-types/string.md)) — [Dictionary name](../../sql-reference/dictionaries/index.md). - `uuid` ([UUID](../../sql-reference/data-types/uuid.md)) — Dictionary UUID. - `status` ([Enum8](../../sql-reference/data-types/enum.md)) — Dictionary status. Possible values: - `NOT_LOADED` — Dictionary was not loaded because it was not used. @@ -18,20 +18,20 @@ Columns: - `LOADED_AND_RELOADING` — Dictionary is loaded successfully, and is being reloaded right now (frequent reasons: [SYSTEM RELOAD DICTIONARY](../../sql-reference/statements/system.md#query_language-system-reload-dictionary) query, timeout, dictionary config has changed). - `FAILED_AND_RELOADING` — Could not load the dictionary as a result of an error and is loading now. - `origin` ([String](../../sql-reference/data-types/string.md)) — Path to the configuration file that describes the dictionary. -- `type` ([String](../../sql-reference/data-types/string.md)) — Type of a dictionary allocation. [Storing Dictionaries in Memory](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md). -- `key.names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Array of [key names](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-key) provided by the dictionary. -- `key.types` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Corresponding array of [key types](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-key) provided by the dictionary. -- `attribute.names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Array of [attribute names](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes) provided by the dictionary. -- `attribute.types` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Corresponding array of [attribute types](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes) provided by the dictionary. +- `type` ([String](../../sql-reference/data-types/string.md)) — Type of a dictionary allocation. [Storing Dictionaries in Memory](../../sql-reference/dictionaries/index.md#storig-dictionaries-in-memory). +- `key.names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Array of [key names](../../sql-reference/dictionaries/index.md#dictionary-key-and-fields#ext_dict_structure-key) provided by the dictionary. +- `key.types` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Corresponding array of [key types](../../sql-reference/dictionaries/index.md#dictionary-key-and-fields#ext_dict_structure-key) provided by the dictionary. +- `attribute.names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Array of [attribute names](../../sql-reference/dictionaries/index.md#dictionary-key-and-fields#ext_dict_structure-attributes) provided by the dictionary. 
+- `attribute.types` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Corresponding array of [attribute types](../../sql-reference/dictionaries/index.md#dictionary-key-and-fields#ext_dict_structure-attributes) provided by the dictionary. - `bytes_allocated` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Amount of RAM allocated for the dictionary. - `query_count` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Number of queries since the dictionary was loaded or since the last successful reboot. - `hit_rate` ([Float64](../../sql-reference/data-types/float.md)) — For cache dictionaries, the percentage of uses for which the value was in the cache. - `found_rate` ([Float64](../../sql-reference/data-types/float.md)) — The percentage of uses for which the value was found. - `element_count` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Number of items stored in the dictionary. - `load_factor` ([Float64](../../sql-reference/data-types/float.md)) — Percentage filled in the dictionary (for a hashed dictionary, the percentage filled in the hash table). -- `source` ([String](../../sql-reference/data-types/string.md)) — Text describing the [data source](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md) for the dictionary. -- `lifetime_min` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Minimum [lifetime](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) of the dictionary in memory, after which ClickHouse tries to reload the dictionary (if `invalidate_query` is set, then only if it has changed). Set in seconds. -- `lifetime_max` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Maximum [lifetime](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) of the dictionary in memory, after which ClickHouse tries to reload the dictionary (if `invalidate_query` is set, then only if it has changed). Set in seconds. +- `source` ([String](../../sql-reference/data-types/string.md)) — Text describing the [data source](../../sql-reference/dictionaries/index.md#dictionary-sources) for the dictionary. +- `lifetime_min` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Minimum [lifetime](../../sql-reference/dictionaries/index.md#dictionary-updates) of the dictionary in memory, after which ClickHouse tries to reload the dictionary (if `invalidate_query` is set, then only if it has changed). Set in seconds. +- `lifetime_max` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Maximum [lifetime](../../sql-reference/dictionaries/index.md#dictionary-updates) of the dictionary in memory, after which ClickHouse tries to reload the dictionary (if `invalidate_query` is set, then only if it has changed). Set in seconds. - `loading_start_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Start time for loading the dictionary. - `last_successful_update_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — End time for loading or updating the dictionary. Helps to monitor some troubles with dictionary sources and investigate the causes. - `loading_duration` ([Float32](../../sql-reference/data-types/float.md)) — Duration of a dictionary loading. 
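As an illustration of the columns listed above, a query along the following lines can be used to inspect dictionary state on a running server; the selected columns are an arbitrary subset of those described here:

```sql
-- Inspect the dictionaries known to the current database.
SELECT
    name,
    status,
    type,
    bytes_allocated,
    element_count,
    source
FROM system.dictionaries
WHERE database = currentDatabase()
FORMAT Vertical
```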
diff --git a/docs/en/operations/system-tables/marked_dropped_tables.md b/docs/en/operations/system-tables/dropped_tables.md similarity index 87% rename from docs/en/operations/system-tables/marked_dropped_tables.md rename to docs/en/operations/system-tables/dropped_tables.md index 23e969f7624..cb6cec0035a 100644 --- a/docs/en/operations/system-tables/marked_dropped_tables.md +++ b/docs/en/operations/system-tables/dropped_tables.md @@ -1,7 +1,7 @@ --- -slug: /en/operations/system-tables/marked_dropped_tables +slug: /en/operations/system-tables/dropped_tables --- -# marked_dropped_tables +# dropped_tables Contains information about tables that drop table has been executed but data cleanup has not been actually performed. @@ -17,11 +17,11 @@ Columns: **Example** -The following example shows how to get information about marked_dropped_tables. +The following example shows how to get information about dropped_tables. ``` sql SELECT * -FROM system.marked_dropped_tables\G +FROM system.dropped_tables\G ``` ``` text diff --git a/docs/en/operations/system-tables/quotas.md b/docs/en/operations/system-tables/quotas.md index ca8fc4d166f..ffe7a95df5b 100644 --- a/docs/en/operations/system-tables/quotas.md +++ b/docs/en/operations/system-tables/quotas.md @@ -20,7 +20,7 @@ Columns: - `apply_to_all` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Logical value. It shows which users the quota is applied to. Values: - `0` — The quota applies to users specify in the `apply_to_list`. - `1` — The quota applies to all users except those listed in `apply_to_except`. -- `apply_to_list` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — List of user names/[roles](../../operations/access-rights.md#role-management) that the quota should be applied to. +- `apply_to_list` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — List of user names/[roles](../../guides/sre/user-management/index.md#role-management) that the quota should be applied to. - `apply_to_except` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — List of user names/roles that the quota should not apply to. ## See Also {#see-also} diff --git a/docs/en/operations/system-tables/roles.md b/docs/en/operations/system-tables/roles.md index 729c98c89f3..5ef5e765c0f 100644 --- a/docs/en/operations/system-tables/roles.md +++ b/docs/en/operations/system-tables/roles.md @@ -3,7 +3,7 @@ slug: /en/operations/system-tables/roles --- # roles -Contains information about configured [roles](../../operations/access-rights.md#role-management). +Contains information about configured [roles](../../guides/sre/user-management/index.md#role-management). Columns: diff --git a/docs/en/operations/system-tables/users.md b/docs/en/operations/system-tables/users.md index b8c0403b8d6..385e3151eb7 100644 --- a/docs/en/operations/system-tables/users.md +++ b/docs/en/operations/system-tables/users.md @@ -3,7 +3,7 @@ slug: /en/operations/system-tables/users --- # users -Contains a list of [user accounts](../../operations/access-rights.md#user-account-management) configured at the server. +Contains a list of [user accounts](../../guides/sre/user-management/index.md#user-account-management) configured at the server. Columns: - `name` ([String](../../sql-reference/data-types/string.md)) — User name. 
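For reference, the system tables mentioned above can be queried directly; the following is a minimal sketch, and the role and user names are placeholders:

```sql
-- List configured users and roles.
SELECT name FROM system.users;
SELECT name FROM system.roles;

-- With SQL-driven access control enabled, roles are created and granted like this:
CREATE ROLE IF NOT EXISTS readonly_analyst;
GRANT readonly_analyst TO my_user;
```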
diff --git a/docs/en/operations/tips.md b/docs/en/operations/tips.md index da34a6b7e9c..13353cd8e6a 100644 --- a/docs/en/operations/tips.md +++ b/docs/en/operations/tips.md @@ -126,7 +126,7 @@ Otherwise you may get `Illegal instruction` crashes when hypervisor is run on ol ## ClickHouse Keeper and ZooKeeper {#zookeeper} -ClickHouse Keeper is recommended to replace ZooKeeper for ClickHouse clusters. See the documentation for [ClickHouse Keeper](clickhouse-keeper.md) +ClickHouse Keeper is recommended to replace ZooKeeper for ClickHouse clusters. See the documentation for [ClickHouse Keeper](../guides/sre/keeper/index.md) If you would like to continue using ZooKeeper then it is best to use a fresh version of ZooKeeper – 3.4.9 or later. The version in stable Linux distributions may be outdated. @@ -134,7 +134,7 @@ You should never use manually written scripts to transfer data between different If you want to divide an existing ZooKeeper cluster into two, the correct way is to increase the number of its replicas and then reconfigure it as two independent clusters. -You can run ClickHouse Keeper on the same server as ClickHouse in test environments, or in environments with low ingestion rate. +You can run ClickHouse Keeper on the same server as ClickHouse in test environments, or in environments with low ingestion rate. For production environments we suggest to use separate servers for ClickHouse and ZooKeeper/Keeper, or place ClickHouse files and Keeper files on to separate disks. Because ZooKeeper/Keeper are very sensitive for disk latency and ClickHouse may utilize all available system resources. You can have ZooKeeper observers in an ensemble but ClickHouse servers should not interact with observers. diff --git a/docs/en/operations/utilities/clickhouse-local.md b/docs/en/operations/utilities/clickhouse-local.md index 08640b5c16b..6bf1269c1d9 100644 --- a/docs/en/operations/utilities/clickhouse-local.md +++ b/docs/en/operations/utilities/clickhouse-local.md @@ -4,9 +4,9 @@ sidebar_position: 60 sidebar_label: clickhouse-local --- -# clickhouse-local +# clickhouse-local -The `clickhouse-local` program enables you to perform fast processing on local files, without having to deploy and configure the ClickHouse server. It accepts data that represent tables and queries them using [ClickHouse SQL dialect](../../sql-reference/). `clickhouse-local` uses the same core as ClickHouse server, so it supports most of the features and the same set of formats and table engines. +The `clickhouse-local` program enables you to perform fast processing on local files, without having to deploy and configure the ClickHouse server. It accepts data that represent tables and queries them using [ClickHouse SQL dialect](../../sql-reference/index.md). `clickhouse-local` uses the same core as ClickHouse server, so it supports most of the features and the same set of formats and table engines. By default `clickhouse-local` has access to data on the same host, and it does not depend on the server's configuration. It also supports loading server configuration using `--config-file` argument. For temporary data, a unique temporary data directory is created by default. 
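Because `clickhouse-local` accepts the same SQL dialect, the kind of query you would pass to it (for example via `clickhouse-local --query "..."`) looks like the following sketch; the file name and schema are hypothetical:

```sql
-- Aggregate a local CSV file without starting a server.
SELECT
    town,
    sum(price) AS total
FROM file('data.csv', 'CSV', 'town String, price UInt32')
GROUP BY town
ORDER BY total DESC
LIMIT 10
```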
diff --git a/docs/en/operations/utilities/index.md b/docs/en/operations/utilities/index.md index a8c0239c102..bf6b3a63d23 100644 --- a/docs/en/operations/utilities/index.md +++ b/docs/en/operations/utilities/index.md @@ -1,11 +1,11 @@ --- slug: /en/operations/utilities/ sidebar_position: 56 -sidebar_label: Overview +sidebar_label: List of tools and utilities pagination_next: 'en/operations/utilities/clickhouse-copier' --- -# ClickHouse Utilities +# List of tools and utilities - [clickhouse-local](../../operations/utilities/clickhouse-local.md) — Allows running SQL queries on data without starting the ClickHouse server, similar to how `awk` does this. - [clickhouse-copier](../../operations/utilities/clickhouse-copier.md) — Copies (and reshards) data from one cluster to another cluster. diff --git a/docs/en/sql-reference/_category_.yml b/docs/en/sql-reference/_category_.yml index d799ecef539..45eaa6e7c16 100644 --- a/docs/en/sql-reference/_category_.yml +++ b/docs/en/sql-reference/_category_.yml @@ -1,7 +1,7 @@ -position: 15 +position: 1 label: 'SQL Reference' collapsible: true collapsed: true link: - type: doc - id: en/sql-reference/index + type: generated-index + slug: /en/sql-reference diff --git a/docs/en/sql-reference/aggregate-functions/reference/contingency.md b/docs/en/sql-reference/aggregate-functions/reference/contingency.md index e75537778fe..9e89e99e66d 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/contingency.md +++ b/docs/en/sql-reference/aggregate-functions/reference/contingency.md @@ -5,7 +5,7 @@ sidebar_position: 350 # contingency -The `contingency` function calculates the [contingency coefficient](https://en.wikipedia.org/wiki/Contingency_table#Cram%C3%A9r's_V_and_the_contingency_coefficient_C), a value that measures the association between two columns in a table. The computation is similar to [the `cramersV` function](./cramersv) but with a different denominator in the square root. +The `contingency` function calculates the [contingency coefficient](https://en.wikipedia.org/wiki/Contingency_table#Cram%C3%A9r's_V_and_the_contingency_coefficient_C), a value that measures the association between two columns in a table. The computation is similar to [the `cramersV` function](./cramersv.md) but with a different denominator in the square root. **Syntax** diff --git a/docs/en/sql-reference/aggregate-functions/reference/cramersvbiascorrected.md b/docs/en/sql-reference/aggregate-functions/reference/cramersvbiascorrected.md index 51524033147..651b5e7b5a2 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/cramersvbiascorrected.md +++ b/docs/en/sql-reference/aggregate-functions/reference/cramersvbiascorrected.md @@ -6,7 +6,7 @@ sidebar_position: 352 # cramersVBiasCorrected -Cramér's V is a measure of association between two columns in a table. The result of the [`cramersV` function](./cramersv) ranges from 0 (corresponding to no association between the variables) to 1 and can reach 1 only when each value is completely determined by the other. The function can be heavily biased, so this version of Cramér's V uses the [bias correction](https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V#Bias_correction). +Cramér's V is a measure of association between two columns in a table. The result of the [`cramersV` function](./cramersv.md) ranges from 0 (corresponding to no association between the variables) to 1 and can reach 1 only when each value is completely determined by the other. 
The function can be heavily biased, so this version of Cramér's V uses the [bias correction](https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V#Bias_correction). diff --git a/docs/en/sql-reference/aggregate-functions/reference/exponentialmovingaverage.md b/docs/en/sql-reference/aggregate-functions/reference/exponentialmovingaverage.md index 5546ade1758..5d82d3575fc 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/exponentialmovingaverage.md +++ b/docs/en/sql-reference/aggregate-functions/reference/exponentialmovingaverage.md @@ -19,7 +19,7 @@ Each `value` corresponds to the determinate `timeunit`. The half-life `x` is the **Arguments** - `value` — Value. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md). -- `timeunit` — Timeunit. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md). Timeunit is not timestamp (seconds), it's -- an index of the time interval. Can be calculated using [intDiv](../../functions/arithmetic-functions/#intdiva-b). +- `timeunit` — Timeunit. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md). Timeunit is not timestamp (seconds), it's -- an index of the time interval. Can be calculated using [intDiv](../../functions/arithmetic-functions.md#intdiva-b). **Parameters** diff --git a/docs/en/sql-reference/data-types/index.md b/docs/en/sql-reference/data-types/index.md index ef6a0fb3ea5..c61a3069db6 100644 --- a/docs/en/sql-reference/data-types/index.md +++ b/docs/en/sql-reference/data-types/index.md @@ -1,13 +1,33 @@ --- slug: /en/sql-reference/data-types/ -sidebar_label: Data Types +sidebar_label: List of data types sidebar_position: 37 --- -# Data Types +# ClickHouse Data Types -ClickHouse can store various kinds of data in table cells. +ClickHouse can store various kinds of data in table cells. This section describes the supported data types and special considerations for using and/or implementing them if any. -This section describes the supported data types and special considerations for using and/or implementing them if any. +:::note +You can check whether a data type name is case-sensitive in the [system.data_type_families](../../operations/system-tables/data_type_families.md#system_tables-data_type_families) table. +::: -You can check whether data type name is case-sensitive in the [system.data_type_families](../../operations/system-tables/data_type_families.md#system_tables-data_type_families) table. 
+ClickHouse data types include: + +- **Integer types**: [signed and unsigned integers](./int-uint.md) (`UInt8`, `UInt16`, `UInt32`, `UInt64`, `UInt128`, `UInt256`, `Int8`, `Int16`, `Int32`, `Int64`, `Int128`, `Int256`) +- **Floating-point numbers**: [floats](./float.md) (`Float32` and `Float64`) and [`Decimal` values](./decimal.md) +- **Boolean**: ClickHouse has a [`Boolean` type](./boolean.md) +- **Strings**: [`String`](./string.md) and [`FixedString`](./fixedstring.md) +- **Dates**: use [`Date`](./date.md) and [`Date32`](./date32.md) for days, and [`DateTime`](./datetime.md) and [`DateTime64`](./datetime64.md) for instants in time +- **JSON**: the [`JSON` object](./json.md) stores a JSON document in a single column +- **UUID**: a performant option for storing [`UUID` values](./uuid.md) +- **Low cardinality types**: use an [`Enum`](./enum.md) when you have a handful of unique values, or use [`LowCardinality`](./lowcardinality.md) when you have up to 10,000 unique values of a column +- **Arrays**: any column can be defined as an [`Array` of values](./array.md) +- **Maps**: use [`Map`](./map.md) for storing key/value pairs +- **Aggregation function types**: use [`SimpleAggregateFunction`](./simpleaggregatefunction.md) and [`AggregateFunction`](./aggregatefunction.md) for storing the intermediate status of aggregate function results +- **Nested data structures**: A [`Nested` data structure](./nested-data-structures/index.md) is like a table inside a cell +- **Tuples**: A [`Tuple` of elements](./tuple.md), each having an individual type +- **Nullable**: [`Nullable`](./nullable.md) allows you to store a value as `NULL` when a value is "missing" (instead of the column getting its default value for the data type) +- **IP addresses**: use [`IPv4`](./domains/ipv4.md) and [`IPv6`](./domains/ipv6.md) to efficiently store IP addresses +- **Geo types**: for [geographical data](./geo.md), including `Point`, `Ring`, `Polygon` and `MultiPolygon` +- **Special data types**: including [`Expression`](./special-data-types/expression.md), [`Set`](./special-data-types/set.md), [`Nothing`](./special-data-types/nothing.md) and [`Interval`](./special-data-types/interval.md) \ No newline at end of file diff --git a/docs/en/sql-reference/data-types/json.md b/docs/en/sql-reference/data-types/json.md index d9099ba5ad3..a21898de9a2 100644 --- a/docs/en/sql-reference/data-types/json.md +++ b/docs/en/sql-reference/data-types/json.md @@ -7,7 +7,7 @@ sidebar_label: JSON # JSON :::warning -This feature is experimental and is not production ready. If you need to work with JSON documents, consider using [this guide](/docs/en/guides/developer/working-with-json/json-load-data.md) instead. +This feature is experimental and is not production ready. If you need to work with JSON documents, consider using [this guide](/docs/en/integrations/data-ingestion/data-formats/json.md) instead. ::: Stores JavaScript Object Notation (JSON) documents in a single column.
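To accompany the new data types overview above, here is a minimal sketch showing several of the listed types together in one table definition. It is illustrative only: the table and column names are invented, and the `system.data_type_families` query merely demonstrates the case-sensitivity check mentioned in the note.

``` sql
-- Hypothetical table combining several types from the list above.
CREATE TABLE example_types
(
    id UInt64,                          -- integer type
    price Decimal(10, 2),               -- decimal value
    ratio Float64,                      -- floating-point number
    is_active Bool,                     -- boolean
    name String,                        -- string
    created_at DateTime64(3),           -- instant in time with millisecond precision
    tags Array(String),                 -- array
    attributes Map(String, String),     -- key/value pairs
    country LowCardinality(String),     -- low-cardinality column
    comment Nullable(String),           -- may be NULL instead of a default value
    ip IPv4                             -- IP address
)
ENGINE = MergeTree
ORDER BY id;

-- Check whether a data type name is case-sensitive.
SELECT name, case_insensitive
FROM system.data_type_families
WHERE name LIKE '%Int%';
```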
diff --git a/docs/en/sql-reference/data-types/nested-data-structures/index.md b/docs/en/sql-reference/data-types/nested-data-structures/index.md index 1d958c018d8..d118170cd39 100644 --- a/docs/en/sql-reference/data-types/nested-data-structures/index.md +++ b/docs/en/sql-reference/data-types/nested-data-structures/index.md @@ -1,7 +1,105 @@ --- -slug: /en/sql-reference/data-types/nested-data-structures/ -sidebar_label: Nested Data Structures -sidebar_position: 54 +slug: /en/sql-reference/data-types/nested-data-structures/nested +sidebar_position: 57 +sidebar_label: Nested(Name1 Type1, Name2 Type2, ...) --- -# Nested Data Structures +# Nested + +## Nested(name1 Type1, Name2 Type2, …) + +A nested data structure is like a table inside a cell. The parameters of a nested data structure – the column names and types – are specified the same way as in a [CREATE TABLE](../../../sql-reference/statements/create/table.md) query. Each table row can correspond to any number of rows in a nested data structure. + +Example: + +``` sql +CREATE TABLE test.visits +( + CounterID UInt32, + StartDate Date, + Sign Int8, + IsNew UInt8, + VisitID UInt64, + UserID UInt64, + ... + Goals Nested + ( + ID UInt32, + Serial UInt32, + EventTime DateTime, + Price Int64, + OrderID String, + CurrencyID UInt32 + ), + ... +) ENGINE = CollapsingMergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192, Sign) +``` + +This example declares the `Goals` nested data structure, which contains data about conversions (goals reached). Each row in the ‘visits’ table can correspond to zero or any number of conversions. + +When [flatten_nested](../../../operations/settings/settings.md#flatten-nested) is set to `0` (which is not by default), arbitrary levels of nesting are supported. + +In most cases, when working with a nested data structure, its columns are specified with column names separated by a dot. These columns make up an array of matching types. All the column arrays of a single nested data structure have the same length. + +Example: + +``` sql +SELECT + Goals.ID, + Goals.EventTime +FROM test.visits +WHERE CounterID = 101500 AND length(Goals.ID) < 5 +LIMIT 10 +``` + +``` text +┌─Goals.ID───────────────────────┬─Goals.EventTime───────────────────────────────────────────────────────────────────────────┠+│ [1073752,591325,591325] │ ['2014-03-17 16:38:10','2014-03-17 16:38:48','2014-03-17 16:42:27'] │ +│ [1073752] │ ['2014-03-17 00:28:25'] │ +│ [1073752] │ ['2014-03-17 10:46:20'] │ +│ [1073752,591325,591325,591325] │ ['2014-03-17 13:59:20','2014-03-17 22:17:55','2014-03-17 22:18:07','2014-03-17 22:18:51'] │ +│ [] │ [] │ +│ [1073752,591325,591325] │ ['2014-03-17 11:37:06','2014-03-17 14:07:47','2014-03-17 14:36:21'] │ +│ [] │ [] │ +│ [] │ [] │ +│ [591325,1073752] │ ['2014-03-17 00:46:05','2014-03-17 00:46:05'] │ +│ [1073752,591325,591325,591325] │ ['2014-03-17 13:28:33','2014-03-17 13:30:26','2014-03-17 18:51:21','2014-03-17 18:51:45'] │ +└────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +It is easiest to think of a nested data structure as a set of multiple column arrays of the same length. + +The only place where a SELECT query can specify the name of an entire nested data structure instead of individual columns is the ARRAY JOIN clause. For more information, see “ARRAY JOIN clauseâ€. 
Example: + +``` sql +SELECT + Goal.ID, + Goal.EventTime +FROM test.visits +ARRAY JOIN Goals AS Goal +WHERE CounterID = 101500 AND length(Goals.ID) < 5 +LIMIT 10 +``` + +``` text +┌─Goal.ID─┬──────Goal.EventTime─┠+│ 1073752 │ 2014-03-17 16:38:10 │ +│ 591325 │ 2014-03-17 16:38:48 │ +│ 591325 │ 2014-03-17 16:42:27 │ +│ 1073752 │ 2014-03-17 00:28:25 │ +│ 1073752 │ 2014-03-17 10:46:20 │ +│ 1073752 │ 2014-03-17 13:59:20 │ +│ 591325 │ 2014-03-17 22:17:55 │ +│ 591325 │ 2014-03-17 22:18:07 │ +│ 591325 │ 2014-03-17 22:18:51 │ +│ 1073752 │ 2014-03-17 11:37:06 │ +└─────────┴─────────────────────┘ +``` + +You can’t perform SELECT for an entire nested data structure. You can only explicitly list individual columns that are part of it. + +For an INSERT query, you should pass all the component column arrays of a nested data structure separately (as if they were individual column arrays). During insertion, the system checks that they have the same length. + +For a DESCRIBE query, the columns in a nested data structure are listed separately in the same way. + +The ALTER query for elements in a nested data structure has limitations. diff --git a/docs/en/sql-reference/data-types/nested-data-structures/nested.md b/docs/en/sql-reference/data-types/nested-data-structures/nested.md deleted file mode 100644 index d118170cd39..00000000000 --- a/docs/en/sql-reference/data-types/nested-data-structures/nested.md +++ /dev/null @@ -1,105 +0,0 @@ ---- -slug: /en/sql-reference/data-types/nested-data-structures/nested -sidebar_position: 57 -sidebar_label: Nested(Name1 Type1, Name2 Type2, ...) ---- - -# Nested - -## Nested(name1 Type1, Name2 Type2, …) - -A nested data structure is like a table inside a cell. The parameters of a nested data structure – the column names and types – are specified the same way as in a [CREATE TABLE](../../../sql-reference/statements/create/table.md) query. Each table row can correspond to any number of rows in a nested data structure. - -Example: - -``` sql -CREATE TABLE test.visits -( - CounterID UInt32, - StartDate Date, - Sign Int8, - IsNew UInt8, - VisitID UInt64, - UserID UInt64, - ... - Goals Nested - ( - ID UInt32, - Serial UInt32, - EventTime DateTime, - Price Int64, - OrderID String, - CurrencyID UInt32 - ), - ... -) ENGINE = CollapsingMergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192, Sign) -``` - -This example declares the `Goals` nested data structure, which contains data about conversions (goals reached). Each row in the ‘visits’ table can correspond to zero or any number of conversions. - -When [flatten_nested](../../../operations/settings/settings.md#flatten-nested) is set to `0` (which is not by default), arbitrary levels of nesting are supported. - -In most cases, when working with a nested data structure, its columns are specified with column names separated by a dot. These columns make up an array of matching types. All the column arrays of a single nested data structure have the same length. 
- -Example: - -``` sql -SELECT - Goals.ID, - Goals.EventTime -FROM test.visits -WHERE CounterID = 101500 AND length(Goals.ID) < 5 -LIMIT 10 -``` - -``` text -┌─Goals.ID───────────────────────┬─Goals.EventTime───────────────────────────────────────────────────────────────────────────┠-│ [1073752,591325,591325] │ ['2014-03-17 16:38:10','2014-03-17 16:38:48','2014-03-17 16:42:27'] │ -│ [1073752] │ ['2014-03-17 00:28:25'] │ -│ [1073752] │ ['2014-03-17 10:46:20'] │ -│ [1073752,591325,591325,591325] │ ['2014-03-17 13:59:20','2014-03-17 22:17:55','2014-03-17 22:18:07','2014-03-17 22:18:51'] │ -│ [] │ [] │ -│ [1073752,591325,591325] │ ['2014-03-17 11:37:06','2014-03-17 14:07:47','2014-03-17 14:36:21'] │ -│ [] │ [] │ -│ [] │ [] │ -│ [591325,1073752] │ ['2014-03-17 00:46:05','2014-03-17 00:46:05'] │ -│ [1073752,591325,591325,591325] │ ['2014-03-17 13:28:33','2014-03-17 13:30:26','2014-03-17 18:51:21','2014-03-17 18:51:45'] │ -└────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -It is easiest to think of a nested data structure as a set of multiple column arrays of the same length. - -The only place where a SELECT query can specify the name of an entire nested data structure instead of individual columns is the ARRAY JOIN clause. For more information, see “ARRAY JOIN clauseâ€. Example: - -``` sql -SELECT - Goal.ID, - Goal.EventTime -FROM test.visits -ARRAY JOIN Goals AS Goal -WHERE CounterID = 101500 AND length(Goals.ID) < 5 -LIMIT 10 -``` - -``` text -┌─Goal.ID─┬──────Goal.EventTime─┠-│ 1073752 │ 2014-03-17 16:38:10 │ -│ 591325 │ 2014-03-17 16:38:48 │ -│ 591325 │ 2014-03-17 16:42:27 │ -│ 1073752 │ 2014-03-17 00:28:25 │ -│ 1073752 │ 2014-03-17 10:46:20 │ -│ 1073752 │ 2014-03-17 13:59:20 │ -│ 591325 │ 2014-03-17 22:17:55 │ -│ 591325 │ 2014-03-17 22:18:07 │ -│ 591325 │ 2014-03-17 22:18:51 │ -│ 1073752 │ 2014-03-17 11:37:06 │ -└─────────┴─────────────────────┘ -``` - -You can’t perform SELECT for an entire nested data structure. You can only explicitly list individual columns that are part of it. - -For an INSERT query, you should pass all the component column arrays of a nested data structure separately (as if they were individual column arrays). During insertion, the system checks that they have the same length. - -For a DESCRIBE query, the columns in a nested data structure are listed separately in the same way. - -The ALTER query for elements in a nested data structure has limitations. 
diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md b/docs/en/sql-reference/dictionaries/_snippet_dictionary_in_cloud.md similarity index 100% rename from docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md rename to docs/en/sql-reference/dictionaries/_snippet_dictionary_in_cloud.md diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml b/docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml deleted file mode 100644 index af79ff9af23..00000000000 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml +++ /dev/null @@ -1,8 +0,0 @@ -position: 37 -label: 'Dictionaries' -collapsible: true -collapsed: true -link: - type: generated-index - title: Dictionaries - slug: /en/sql-reference/dictionaries/external-dictionaries diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md deleted file mode 100644 index ee9cd2c1f2e..00000000000 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md +++ /dev/null @@ -1,67 +0,0 @@ ---- -slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical -sidebar_position: 45 -sidebar_label: Hierarchical dictionaries ---- - -# Hierarchical Dictionaries - -ClickHouse supports hierarchical dictionaries with a [numeric key](../../dictionaries/external-dictionaries/external-dicts-dict-structure.md#numeric-key). - -Look at the following hierarchical structure: - -``` text -0 (Common parent) -│ -├── 1 (Russia) -│ │ -│ └── 2 (Moscow) -│ │ -│ └── 3 (Center) -│ -└── 4 (Great Britain) - │ - └── 5 (London) -``` - -This hierarchy can be expressed as the following dictionary table. - -| region_id | parent_region | region_name | -|------------|----------------|---------------| -| 1 | 0 | Russia | -| 2 | 1 | Moscow | -| 3 | 2 | Center | -| 4 | 0 | Great Britain | -| 5 | 4 | London | - -This table contains a column `parent_region` that contains the key of the nearest parent for the element. - -ClickHouse supports the [hierarchical](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#hierarchical-dict-attr) property for [external dictionary](../../../sql-reference/dictionaries/external-dictionaries/) attributes. This property allows you to configure the hierarchical dictionary similar to described above. - -The [dictGetHierarchy](../../../sql-reference/functions/ext-dict-functions.md#dictgethierarchy) function allows you to get the parent chain of an element. 
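Before the XML structure below, a hedged SQL sketch of how such a hierarchy is typically queried. The dictionary name `regions_dict` is assumed for illustration; only the documented hierarchical dictionary functions are used.

``` sql
-- Assuming a hierarchical dictionary named 'regions_dict' built over the table above.
-- dictGetHierarchy returns the key together with its chain of parents,
-- e.g. for 'Center' (id 3) something like [3, 2, 1].
SELECT dictGetHierarchy('regions_dict', toUInt64(3)) AS chain;

-- dictIsIn checks whether one key lies under another in the hierarchy.
SELECT dictIsIn('regions_dict', toUInt64(3), toUInt64(1)) AS center_is_in_russia;
```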
- -For our example, the structure of dictionary can be the following: - -``` xml - - - - region_id - - - - parent_region - UInt64 - 0 - true - - - - region_name - String - - - - - -``` diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md deleted file mode 100644 index 4dc6fd33849..00000000000 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md +++ /dev/null @@ -1,751 +0,0 @@ ---- -slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout -sidebar_position: 41 -sidebar_label: Storing Dictionaries in Memory ---- -import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; - -# Storing Dictionaries in Memory - -There are a variety of ways to store dictionaries in memory. - -We recommend [flat](#flat), [hashed](#dicts-external_dicts_dict_layout-hashed) and [complex_key_hashed](#complex-key-hashed), which provide optimal processing speed. - -Caching is not recommended because of potentially poor performance and difficulties in selecting optimal parameters. Read more in the section [cache](#cache). - -There are several ways to improve dictionary performance: - -- Call the function for working with the dictionary after `GROUP BY`. -- Mark attributes to extract as injective. An attribute is called injective if different attribute values correspond to different keys. So when `GROUP BY` uses a function that fetches an attribute value by the key, this function is automatically taken out of `GROUP BY`. - -ClickHouse generates an exception for errors with dictionaries. Examples of errors: - -- The dictionary being accessed could not be loaded. -- Error querying a `cached` dictionary. - -You can view the list of dictionaries and their statuses in the [system.dictionaries](../../../operations/system-tables/dictionaries.md) table. - - - -The configuration looks like this: - -``` xml - - - ... - - - - - - ... - - -``` - -Corresponding [DDL-query](../../../sql-reference/statements/create/dictionary.md): - -``` sql -CREATE DICTIONARY (...) -... -LAYOUT(LAYOUT_TYPE(param value)) -- layout settings -... -``` - -Dictionaries without word `complex-key*` in a layout have a key with [UInt64](../../../sql-reference/data-types/int-uint.md) type, `complex-key*` dictionaries have a composite key (complex, with arbitrary types). - -[UInt64](../../../sql-reference/data-types/int-uint.md) keys in XML dictionaries are defined with `` tag. - -Configuration example (column key_column has UInt64 type): -```xml -... - - - key_column - -... -``` - -Composite `complex` keys XML dictionaries are defined `` tag. - -Configuration example of a composite key (key has one element with [String](../../../sql-reference/data-types/string.md) type): -```xml -... - - - - country_code - String - - -... 
-``` - -## Ways to Store Dictionaries in Memory - -- [flat](#flat) -- [hashed](#dicts-external_dicts_dict_layout-hashed) -- [sparse_hashed](#dicts-external_dicts_dict_layout-sparse_hashed) -- [complex_key_hashed](#complex-key-hashed) -- [complex_key_sparse_hashed](#complex-key-sparse-hashed) -- [hashed_array](#dicts-external_dicts_dict_layout-hashed-array) -- [complex_key_hashed_array](#complex-key-hashed-array) -- [range_hashed](#range-hashed) -- [complex_key_range_hashed](#complex-key-range-hashed) -- [cache](#cache) -- [complex_key_cache](#complex-key-cache) -- [ssd_cache](#ssd-cache) -- [complex_key_ssd_cache](#complex-key-ssd-cache) -- [direct](#direct) -- [complex_key_direct](#complex-key-direct) -- [ip_trie](#ip-trie) - -### flat - -The dictionary is completely stored in memory in the form of flat arrays. How much memory does the dictionary use? The amount is proportional to the size of the largest key (in space used). - -The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type and the value is limited to `max_array_size` (by default — 500,000). If a larger key is discovered when creating the dictionary, ClickHouse throws an exception and does not create the dictionary. Dictionary flat arrays initial size is controlled by `initial_array_size` setting (by default — 1024). - -All types of sources are supported. When updating, data (from a file or from a table) is read in it entirety. - -This method provides the best performance among all available methods of storing the dictionary. - -Configuration example: - -``` xml - - - 50000 - 5000000 - - -``` - -or - -``` sql -LAYOUT(FLAT(INITIAL_ARRAY_SIZE 50000 MAX_ARRAY_SIZE 5000000)) -``` - -### hashed - -The dictionary is completely stored in memory in the form of a hash table. The dictionary can contain any number of elements with any identifiers In practice, the number of keys can reach tens of millions of items. - -The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type. - -All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety. - -Configuration example: - -``` xml - - - -``` - -or - -``` sql -LAYOUT(HASHED()) -``` - -If `shards` greater then 1 (default is `1`) the dictionary will load data in parallel, useful if you have huge amount of elements in one dictionary. - -Configuration example: - -``` xml - - - 10 - - 10000 - - -``` - -or - -``` sql -LAYOUT(HASHED(SHARDS 10 [SHARD_LOAD_QUEUE_BACKLOG 10000])) -``` - -### sparse_hashed - -Similar to `hashed`, but uses less memory in favor more CPU usage. - -The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type. - -Configuration example: - -``` xml - - - -``` - -or - -``` sql -LAYOUT(SPARSE_HASHED()) -``` - -It is also possible to use `shards` for this type of dictionary, and again it is more important for `sparse_hashed` then for `hashed`, since `sparse_hashed` is slower. - -### complex_key_hashed - -This type of storage is for use with composite [keys](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md). Similar to `hashed`. - -Configuration example: - -``` xml - - - 1 - - - -``` - -or - -``` sql -LAYOUT(COMPLEX_KEY_HASHED([SHARDS 1] [SHARD_LOAD_QUEUE_BACKLOG 10000])) -``` - -### complex_key_sparse_hashed - -This type of storage is for use with composite [keys](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md). 
Similar to [sparse_hashed](#dicts-external_dicts_dict_layout-sparse_hashed). - -Configuration example: - -``` xml - - - 1 - - -``` - -or - -``` sql -LAYOUT(COMPLEX_KEY_SPARSE_HASHED([SHARDS 1] [SHARD_LOAD_QUEUE_BACKLOG 10000])) -``` - -### hashed_array - -The dictionary is completely stored in memory. Each attribute is stored in an array. The key attribute is stored in the form of a hashed table where value is an index in the attributes array. The dictionary can contain any number of elements with any identifiers. In practice, the number of keys can reach tens of millions of items. - -The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type. - -All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety. - -Configuration example: - -``` xml - - - - -``` - -or - -``` sql -LAYOUT(HASHED_ARRAY()) -``` - -### complex_key_hashed_array - -This type of storage is for use with composite [keys](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md). Similar to [hashed_array](#dicts-external_dicts_dict_layout-hashed-array). - -Configuration example: - -``` xml - - - -``` - -or - -``` sql -LAYOUT(COMPLEX_KEY_HASHED_ARRAY()) -``` - -### range_hashed - -The dictionary is stored in memory in the form of a hash table with an ordered array of ranges and their corresponding values. - -The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type. -This storage method works the same way as hashed and allows using date/time (arbitrary numeric type) ranges in addition to the key. - -Example: The table contains discounts for each advertiser in the format: - -``` text -┌─advertiser_id─┬─discount_start_date─┬─discount_end_date─┬─amount─┠-│ 123 │ 2015-01-16 │ 2015-01-31 │ 0.25 │ -│ 123 │ 2015-01-01 │ 2015-01-15 │ 0.15 │ -│ 456 │ 2015-01-01 │ 2015-01-15 │ 0.05 │ -└───────────────┴─────────────────────┴───────────────────┴────────┘ -``` - -To use a sample for date ranges, define the `range_min` and `range_max` elements in the [structure](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md). These elements must contain elements `name` and `type` (if `type` is not specified, the default type will be used - Date). `type` can be any numeric type (Date / DateTime / UInt64 / Int32 / others). - -:::warning -Values of `range_min` and `range_max` should fit in `Int64` type. -::: - -Example: - -``` xml - - - - min - - - - - advertiser_id - - - discount_start_date - Date - - - discount_end_date - Date - - ... -``` - -or - -``` sql -CREATE DICTIONARY discounts_dict ( - advertiser_id UInt64, - discount_start_date Date, - discount_end_date Date, - amount Float64 -) -PRIMARY KEY id -SOURCE(CLICKHOUSE(TABLE 'discounts')) -LIFETIME(MIN 1 MAX 1000) -LAYOUT(RANGE_HASHED(range_lookup_strategy 'max')) -RANGE(MIN discount_start_date MAX discount_end_date) -``` - -To work with these dictionaries, you need to pass an additional argument to the `dictGet` function, for which a range is selected: - -``` sql -dictGet('dict_name', 'attr_name', id, date) -``` -Query example: - -``` sql -SELECT dictGet('discounts_dict', 'amount', 1, '2022-10-20'::Date); -``` - -This function returns the value for the specified `id`s and the date range that includes the passed date. - -Details of the algorithm: - -- If the `id` is not found or a range is not found for the `id`, it returns the default value of the attribute's type. 
-- If there are overlapping ranges and `range_lookup_strategy=min`, it returns a matching range with minimal `range_min`, if several ranges found, it returns a range with minimal `range_max`, if again several ranges found (several ranges had the same `range_min` and `range_max` it returns a random range of them. -- If there are overlapping ranges and `range_lookup_strategy=max`, it returns a matching range with maximal `range_min`, if several ranges found, it returns a range with maximal `range_max`, if again several ranges found (several ranges had the same `range_min` and `range_max` it returns a random range of them. -- If the `range_max` is `NULL`, the range is open. `NULL` is treated as maximal possible value. For the `range_min` `1970-01-01` or `0` (-MAX_INT) can be used as the open value. - -Configuration example: - -``` xml - - - ... - - - - - - - - Abcdef - - - StartTimeStamp - UInt64 - - - EndTimeStamp - UInt64 - - - XXXType - String - - - - - - -``` - -or - -``` sql -CREATE DICTIONARY somedict( - Abcdef UInt64, - StartTimeStamp UInt64, - EndTimeStamp UInt64, - XXXType String DEFAULT '' -) -PRIMARY KEY Abcdef -RANGE(MIN StartTimeStamp MAX EndTimeStamp) -``` - -Configuration example with overlapping ranges and open ranges: - -```sql -CREATE TABLE discounts -( - advertiser_id UInt64, - discount_start_date Date, - discount_end_date Nullable(Date), - amount Float64 -) -ENGINE = Memory; - -INSERT INTO discounts VALUES (1, '2015-01-01', Null, 0.1); -INSERT INTO discounts VALUES (1, '2015-01-15', Null, 0.2); -INSERT INTO discounts VALUES (2, '2015-01-01', '2015-01-15', 0.3); -INSERT INTO discounts VALUES (2, '2015-01-04', '2015-01-10', 0.4); -INSERT INTO discounts VALUES (3, '1970-01-01', '2015-01-15', 0.5); -INSERT INTO discounts VALUES (3, '1970-01-01', '2015-01-10', 0.6); - -SELECT * FROM discounts ORDER BY advertiser_id, discount_start_date; -┌─advertiser_id─┬─discount_start_date─┬─discount_end_date─┬─amount─┠-│ 1 │ 2015-01-01 │ á´ºáµá´¸á´¸ │ 0.1 │ -│ 1 │ 2015-01-15 │ á´ºáµá´¸á´¸ │ 0.2 │ -│ 2 │ 2015-01-01 │ 2015-01-15 │ 0.3 │ -│ 2 │ 2015-01-04 │ 2015-01-10 │ 0.4 │ -│ 3 │ 1970-01-01 │ 2015-01-15 │ 0.5 │ -│ 3 │ 1970-01-01 │ 2015-01-10 │ 0.6 │ -└───────────────┴─────────────────────┴───────────────────┴────────┘ - --- RANGE_LOOKUP_STRATEGY 'max' - -CREATE DICTIONARY discounts_dict -( - advertiser_id UInt64, - discount_start_date Date, - discount_end_date Nullable(Date), - amount Float64 -) -PRIMARY KEY advertiser_id -SOURCE(CLICKHOUSE(TABLE discounts)) -LIFETIME(MIN 600 MAX 900) -LAYOUT(RANGE_HASHED(RANGE_LOOKUP_STRATEGY 'max')) -RANGE(MIN discount_start_date MAX discount_end_date); - -select dictGet('discounts_dict', 'amount', 1, toDate('2015-01-14')) res; -┌─res─┠-│ 0.1 │ -- the only one range is matching: 2015-01-01 - Null -└─────┘ - -select dictGet('discounts_dict', 'amount', 1, toDate('2015-01-16')) res; -┌─res─┠-│ 0.2 │ -- two ranges are matching, range_min 2015-01-15 (0.2) is bigger than 2015-01-01 (0.1) -└─────┘ - -select dictGet('discounts_dict', 'amount', 2, toDate('2015-01-06')) res; -┌─res─┠-│ 0.4 │ -- two ranges are matching, range_min 2015-01-04 (0.4) is bigger than 2015-01-01 (0.3) -└─────┘ - -select dictGet('discounts_dict', 'amount', 3, toDate('2015-01-01')) res; -┌─res─┠-│ 0.5 │ -- two ranges are matching, range_min are equal, 2015-01-15 (0.5) is bigger than 2015-01-10 (0.6) -└─────┘ - -DROP DICTIONARY discounts_dict; - --- RANGE_LOOKUP_STRATEGY 'min' - -CREATE DICTIONARY discounts_dict -( - advertiser_id UInt64, - discount_start_date Date, - discount_end_date 
Nullable(Date), - amount Float64 -) -PRIMARY KEY advertiser_id -SOURCE(CLICKHOUSE(TABLE discounts)) -LIFETIME(MIN 600 MAX 900) -LAYOUT(RANGE_HASHED(RANGE_LOOKUP_STRATEGY 'min')) -RANGE(MIN discount_start_date MAX discount_end_date); - -select dictGet('discounts_dict', 'amount', 1, toDate('2015-01-14')) res; -┌─res─┠-│ 0.1 │ -- the only one range is matching: 2015-01-01 - Null -└─────┘ - -select dictGet('discounts_dict', 'amount', 1, toDate('2015-01-16')) res; -┌─res─┠-│ 0.1 │ -- two ranges are matching, range_min 2015-01-01 (0.1) is less than 2015-01-15 (0.2) -└─────┘ - -select dictGet('discounts_dict', 'amount', 2, toDate('2015-01-06')) res; -┌─res─┠-│ 0.3 │ -- two ranges are matching, range_min 2015-01-01 (0.3) is less than 2015-01-04 (0.4) -└─────┘ - -select dictGet('discounts_dict', 'amount', 3, toDate('2015-01-01')) res; -┌─res─┠-│ 0.6 │ -- two ranges are matching, range_min are equal, 2015-01-10 (0.6) is less than 2015-01-15 (0.5) -└─────┘ -``` - -### complex_key_range_hashed - -The dictionary is stored in memory in the form of a hash table with an ordered array of ranges and their corresponding values (see [range_hashed](#range-hashed)). This type of storage is for use with composite [keys](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md). - -Configuration example: - -``` sql -CREATE DICTIONARY range_dictionary -( - CountryID UInt64, - CountryKey String, - StartDate Date, - EndDate Date, - Tax Float64 DEFAULT 0.2 -) -PRIMARY KEY CountryID, CountryKey -SOURCE(CLICKHOUSE(TABLE 'date_table')) -LIFETIME(MIN 1 MAX 1000) -LAYOUT(COMPLEX_KEY_RANGE_HASHED()) -RANGE(MIN StartDate MAX EndDate); -``` - -### cache - -The dictionary is stored in a cache that has a fixed number of cells. These cells contain frequently used elements. - -The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type. - -When searching for a dictionary, the cache is searched first. For each block of data, all keys that are not found in the cache or are outdated are requested from the source using `SELECT attrs... FROM db.table WHERE id IN (k1, k2, ...)`. The received data is then written to the cache. - -If keys are not found in dictionary, then update cache task is created and added into update queue. Update queue properties can be controlled with settings `max_update_queue_size`, `update_queue_push_timeout_milliseconds`, `query_wait_timeout_milliseconds`, `max_threads_for_updates`. - -For cache dictionaries, the expiration [lifetime](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) of data in the cache can be set. If more time than `lifetime` has passed since loading the data in a cell, the cell’s value is not used and key becomes expired. The key is re-requested the next time it needs to be used. This behaviour can be configured with setting `allow_read_expired_keys`. - -This is the least effective of all the ways to store dictionaries. The speed of the cache depends strongly on correct settings and the usage scenario. A cache type dictionary performs well only when the hit rates are high enough (recommended 99% and higher). You can view the average hit rate in the [system.dictionaries](../../../operations/system-tables/dictionaries.md) table. - -If setting `allow_read_expired_keys` is set to 1, by default 0. Then dictionary can support asynchronous updates. 
If a client requests keys and all of them are in cache, but some of them are expired, then dictionary will return expired keys for a client and request them asynchronously from the source. - -To improve cache performance, use a subquery with `LIMIT`, and call the function with the dictionary externally. - -All types of sources are supported. - -Example of settings: - -``` xml - - - - 1000000000 - - 0 - - 100000 - - 10 - - 60000 - - 4 - - -``` - -or - -``` sql -LAYOUT(CACHE(SIZE_IN_CELLS 1000000000)) -``` - -Set a large enough cache size. You need to experiment to select the number of cells: - -1. Set some value. -2. Run queries until the cache is completely full. -3. Assess memory consumption using the `system.dictionaries` table. -4. Increase or decrease the number of cells until the required memory consumption is reached. - -:::warning -Do not use ClickHouse as a source, because it is slow to process queries with random reads. -::: - -### complex_key_cache - -This type of storage is for use with composite [keys](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md). Similar to `cache`. - -### ssd_cache - -Similar to `cache`, but stores data on SSD and index in RAM. All cache dictionary settings related to update queue can also be applied to SSD cache dictionaries. - -The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type. - -``` xml - - - - 4096 - - 16777216 - - 131072 - - 1048576 - - /var/lib/clickhouse/user_files/test_dict - - -``` - -or - -``` sql -LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 - PATH '/var/lib/clickhouse/user_files/test_dict')) -``` - -### complex_key_ssd_cache - -This type of storage is for use with composite [keys](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md). Similar to `ssd_cache`. - -### direct - -The dictionary is not stored in memory and directly goes to the source during the processing of a request. - -The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type. - -All types of [sources](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md), except local files, are supported. - -Configuration example: - -``` xml - - - -``` - -or - -``` sql -LAYOUT(DIRECT()) -``` - -### complex_key_direct - -This type of storage is for use with composite [keys](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md). Similar to `direct`. - -### ip_trie - -This type of storage is for mapping network prefixes (IP addresses) to metadata such as ASN. - -**Example** - -Suppose we have a table in ClickHouse that contains our IP prefixes and mappings: - -```sql -CREATE TABLE my_ip_addresses ( - prefix String, - asn UInt32, - cca2 String -) -ENGINE = MergeTree -PRIMARY KEY prefix; -``` - -```sql -INSERT INTO my_ip_addresses VALUES - ('202.79.32.0/20', 17501, 'NP'), - ('2620:0:870::/48', 3856, 'US'), - ('2a02:6b8:1::/48', 13238, 'RU'), - ('2001:db8::/32', 65536, 'ZZ') -; -``` - -Let's define an `ip_trie` dictionary for this table. The `ip_trie` layout requires a composite key: - -``` xml - - - - prefix - String - - - - asn - UInt32 - - - - cca2 - String - ?? - - ... - - - - - - true - - -``` - -or - -``` sql -CREATE DICTIONARY my_ip_trie_dictionary ( - prefix String, - asn UInt32, - cca2 String DEFAULT '??' 
-) -PRIMARY KEY prefix -SOURCE(CLICKHOUSE(TABLE 'my_ip_addresses')) -LAYOUT(IP_TRIE) -LIFETIME(3600); -``` - -The key must have only one `String` type attribute that contains an allowed IP prefix. Other types are not supported yet. - -For queries, you must use the same functions (`dictGetT` with a tuple) as for dictionaries with composite keys. The syntax is: - -``` sql -dictGetT('dict_name', 'attr_name', tuple(ip)) -``` - -The function takes either `UInt32` for IPv4, or `FixedString(16)` for IPv6. For example: - -``` sql -select dictGet('my_ip_trie_dictionary', 'asn', tuple(IPv6StringToNum('2001:db8::1'))) -``` - -Other types are not supported yet. The function returns the attribute for the prefix that corresponds to this IP address. If there are overlapping prefixes, the most specific one is returned. - -Data must completely fit into RAM. - -## Related Content - -- [Using dictionaries to accelerate queries](https://clickhouse.com/blog/faster-queries-dictionaries-clickhouse) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md deleted file mode 100644 index 8e9dbd392aa..00000000000 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md +++ /dev/null @@ -1,142 +0,0 @@ ---- -slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime -sidebar_position: 42 -sidebar_label: Dictionary Updates ---- -import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; - -# Dictionary Updates - -ClickHouse periodically updates the dictionaries. The update interval for fully downloaded dictionaries and the invalidation interval for cached dictionaries are defined in the `lifetime` tag in seconds. - -Dictionary updates (other than loading for first use) do not block queries. During updates, the old version of a dictionary is used. If an error occurs during an update, the error is written to the server log, and queries continue using the old version of dictionaries. - -Example of settings: - - - -``` xml - - ... - 300 - ... - -``` - -or - -``` sql -CREATE DICTIONARY (...) -... -LIFETIME(300) -... -``` - -Setting `0` (`LIFETIME(0)`) prevents dictionaries from updating. - -You can set a time interval for updates, and ClickHouse will choose a uniformly random time within this range. This is necessary in order to distribute the load on the dictionary source when updating on a large number of servers. - -Example of settings: - -``` xml - - ... - - 300 - 360 - - ... - -``` - -or - -``` sql -LIFETIME(MIN 300 MAX 360) -``` - -If `0` and `0`, ClickHouse does not reload the dictionary by timeout. -In this case, ClickHouse can reload the dictionary earlier if the dictionary configuration file was changed or the `SYSTEM RELOAD DICTIONARY` command was executed. - -When updating the dictionaries, the ClickHouse server applies different logic depending on the type of [source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md): - -- For a text file, it checks the time of modification. If the time differs from the previously recorded time, the dictionary is updated. -- For MySQL source, the time of modification is checked using a `SHOW TABLE STATUS` query (in case of MySQL 8 you need to disable meta-information caching in MySQL by `set global information_schema_stats_expiry=0`). 
-- Dictionaries from other sources are updated every time by default. - -For other sources (ODBC, PostgreSQL, ClickHouse, etc), you can set up a query that will update the dictionaries only if they really changed, rather than each time. To do this, follow these steps: - -- The dictionary table must have a field that always changes when the source data is updated. -- The settings of the source must specify a query that retrieves the changing field. The ClickHouse server interprets the query result as a row, and if this row has changed relative to its previous state, the dictionary is updated. Specify the query in the `` field in the settings for the [source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md). - -Example of settings: - -``` xml - - ... - - ... - SELECT update_time FROM dictionary_source where id = 1 - - ... - -``` - -or - -``` sql -... -SOURCE(ODBC(... invalidate_query 'SELECT update_time FROM dictionary_source where id = 1')) -... -``` - -For `Cache`, `ComplexKeyCache`, `SSDCache`, and `SSDComplexKeyCache` dictionaries both synchronious and asynchronious updates are supported. - -It is also possible for `Flat`, `Hashed`, `ComplexKeyHashed` dictionaries to only request data that was changed after the previous update. If `update_field` is specified as part of the dictionary source configuration, value of the previous update time in seconds will be added to the data request. Depends on source type (Executable, HTTP, MySQL, PostgreSQL, ClickHouse, or ODBC) different logic will be applied to `update_field` before request data from an external source. - -- If the source is HTTP then `update_field` will be added as a query parameter with the last update time as the parameter value. -- If the source is Executable then `update_field` will be added as an executable script argument with the last update time as the argument value. -- If the source is ClickHouse, MySQL, PostgreSQL, ODBC there will be an additional part of `WHERE`, where `update_field` is compared as greater or equal with the last update time. - - Per default, this `WHERE`-condition is checked at the highest level of the SQL-Query. Alternatively, the condition can be checked in any other `WHERE`-clause within the query using the `{condition}`-keyword. Example: - ```sql - ... - SOURCE(CLICKHOUSE(... - update_field 'added_time' - QUERY ' - SELECT my_arr.1 AS x, my_arr.2 AS y, creation_time - FROM ( - SELECT arrayZip(x_arr, y_arr) AS my_arr, creation_time - FROM dictionary_source - WHERE {condition} - )' - )) - ... - ``` - -If `update_field` option is set, additional option `update_lag` can be set. Value of `update_lag` option is subtracted from previous update time before request updated data. - -Example of settings: - -``` xml - - ... - - ... - added_time - 15 - - ... - -``` - -or - -``` sql -... -SOURCE(CLICKHOUSE(... update_field 'added_time' update_lag 15)) -... 
-``` - -## Related Content - -- [Using dictionaries to accelerate queries](https://clickhouse.com/blog/faster-queries-dictionaries-clickhouse) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md deleted file mode 100644 index 8ef19a181e7..00000000000 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md +++ /dev/null @@ -1,140 +0,0 @@ ---- -slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon -sidebar_position: 46 -sidebar_label: Polygon Dictionaries With Grids -title: "Polygon dictionaries" ---- -import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; - -Polygon dictionaries allow you to efficiently search for the polygon containing specified points. -For example: defining a city area by geographical coordinates. - -Example of a polygon dictionary configuration: - - - -``` xml - - - - - key - Array(Array(Array(Array(Float64)))) - - - - - name - String - - - - - value - UInt64 - 0 - - - - - - 1 - - - - ... - -``` - -The corresponding [DDL-query](../../../sql-reference/statements/create/dictionary.md#create-dictionary-query): -``` sql -CREATE DICTIONARY polygon_dict_name ( - key Array(Array(Array(Array(Float64)))), - name String, - value UInt64 -) -PRIMARY KEY key -LAYOUT(POLYGON(STORE_POLYGON_KEY_COLUMN 1)) -... -``` - -When configuring the polygon dictionary, the key must have one of two types: - -- A simple polygon. It is an array of points. -- MultiPolygon. It is an array of polygons. Each polygon is a two-dimensional array of points. The first element of this array is the outer boundary of the polygon, and subsequent elements specify areas to be excluded from it. - -Points can be specified as an array or a tuple of their coordinates. In the current implementation, only two-dimensional points are supported. - -The user can [upload their own data](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md) in all formats supported by ClickHouse. - -There are 3 types of [in-memory storage](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md) available: - -- `POLYGON_SIMPLE`. This is a naive implementation, where a linear pass through all polygons is made for each query, and membership is checked for each one without using additional indexes. - -- `POLYGON_INDEX_EACH`. A separate index is built for each polygon, which allows you to quickly check whether it belongs in most cases (optimized for geographical regions). -Also, a grid is superimposed on the area under consideration, which significantly narrows the number of polygons under consideration. -The grid is created by recursively dividing the cell into 16 equal parts and is configured with two parameters. -The division stops when the recursion depth reaches `MAX_DEPTH` or when the cell crosses no more than `MIN_INTERSECTIONS` polygons. -To respond to the query, there is a corresponding cell, and the index for the polygons stored in it is accessed alternately. - -- `POLYGON_INDEX_CELL`. This placement also creates the grid described above. The same options are available. For each sheet cell, an index is built on all pieces of polygons that fall into it, which allows you to quickly respond to a request. - -- `POLYGON`. Synonym to `POLYGON_INDEX_CELL`. 
- -Dictionary queries are carried out using standard [functions](../../../sql-reference/functions/ext-dict-functions.md) for working with dictionaries. -An important difference is that here the keys will be the points for which you want to find the polygon containing them. - -**Example** - -Example of working with the dictionary defined above: - -``` sql -CREATE TABLE points ( - x Float64, - y Float64 -) -... -SELECT tuple(x, y) AS key, dictGet(dict_name, 'name', key), dictGet(dict_name, 'value', key) FROM points ORDER BY x, y; -``` - -As a result of executing the last command for each point in the 'points' table, a minimum area polygon containing this point will be found, and the requested attributes will be output. - -**Example** - -You can read columns from polygon dictionaries via SELECT query, just turn on the `store_polygon_key_column = 1` in the dictionary configuration or corresponding DDL-query. - -Query: - -``` sql -CREATE TABLE polygons_test_table -( - key Array(Array(Array(Tuple(Float64, Float64)))), - name String -) ENGINE = TinyLog; - -INSERT INTO polygons_test_table VALUES ([[[(3, 1), (0, 1), (0, -1), (3, -1)]]], 'Value'); - -CREATE DICTIONARY polygons_test_dictionary -( - key Array(Array(Array(Tuple(Float64, Float64)))), - name String -) -PRIMARY KEY key -SOURCE(CLICKHOUSE(TABLE 'polygons_test_table')) -LAYOUT(POLYGON(STORE_POLYGON_KEY_COLUMN 1)) -LIFETIME(0); - -SELECT * FROM polygons_test_dictionary; -``` - -Result: - -``` text -┌─key─────────────────────────────┬─name──┠-│ [[[(3,1),(0,1),(0,-1),(3,-1)]]] │ Value │ -└─────────────────────────────────┴───────┘ -``` - -## Related Content - -- [Exploring massive, real-world data sets: 100+ Years of Weather Records in ClickHouse](https://clickhouse.com/blog/real-world-data-noaa-climate-data) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md deleted file mode 100644 index 897945a6d9d..00000000000 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ /dev/null @@ -1,847 +0,0 @@ ---- -slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources -sidebar_position: 43 -sidebar_label: Dictionary Sources ---- -import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; - -# Dictionary Sources - - - -A dictionary can be connected to ClickHouse from many different sources. - -If the dictionary is configured using an xml-file, the configuration looks like this: - -``` xml - - - ... - - - - - - ... - - ... - -``` - -In case of [DDL-query](../../../sql-reference/statements/create/dictionary.md), the configuration described above will look like: - -``` sql -CREATE DICTIONARY dict_name (...) -... -SOURCE(SOURCE_TYPE(param1 val1 ... paramN valN)) -- Source configuration -... -``` - -The source is configured in the `source` section. 
- -For source types [Local file](#dicts-external_dicts_dict_sources-local_file), [Executable file](#dicts-external_dicts_dict_sources-executable), [HTTP(s)](#dicts-external_dicts_dict_sources-http), [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse) -optional settings are available: - -``` xml - - - /opt/dictionaries/os.tsv - TabSeparated - - - 0 - - -``` - -or - -``` sql -SOURCE(FILE(path './user_files/os.tsv' format 'TabSeparated')) -SETTINGS(format_csv_allow_single_quotes = 0) -``` - -Types of sources (`source_type`): - -- [Local file](#dicts-external_dicts_dict_sources-local_file) -- [Executable File](#dicts-external_dicts_dict_sources-executable) -- [Executable Pool](#dicts-external_dicts_dict_sources-executable_pool) -- [HTTP(s)](#dicts-external_dicts_dict_sources-http) -- DBMS - - [ODBC](#odbc) - - [MySQL](#mysql) - - [ClickHouse](#clickhouse) - - [MongoDB](#mongodb) - - [Redis](#redis) - - [Cassandra](#cassandra) - - [PostgreSQL](#postgresql) - -## Local File - -Example of settings: - -``` xml - - - /opt/dictionaries/os.tsv - TabSeparated - - -``` - -or - -``` sql -SOURCE(FILE(path './user_files/os.tsv' format 'TabSeparated')) -``` - -Setting fields: - -- `path` – The absolute path to the file. -- `format` – The file format. All the formats described in [Formats](../../../interfaces/formats.md#formats) are supported. - -When a dictionary with source `FILE` is created via DDL command (`CREATE DICTIONARY ...`), the source file needs to be located in the `user_files` directory to prevent DB users from accessing arbitrary files on the ClickHouse node. - -**See Also** - -- [Dictionary function](../../../sql-reference/table-functions/dictionary.md#dictionary-function) - -## Executable File - -Working with executable files depends on [how the dictionary is stored in memory](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request to the executable file’s STDIN. Otherwise, ClickHouse starts the executable file and treats its output as dictionary data. - -Example of settings: - -``` xml - - - cat /opt/dictionaries/os.tsv - TabSeparated - false - - -``` - -Setting fields: - -- `command` — The absolute path to the executable file, or the file name (if the command's directory is in the `PATH`). -- `format` — The file format. All the formats described in [Formats](../../../interfaces/formats.md#formats) are supported. -- `command_termination_timeout` — The executable script should contain a main read-write loop. After the dictionary is destroyed, the pipe is closed, and the executable file will have `command_termination_timeout` seconds to shutdown before ClickHouse will send a SIGTERM signal to the child process. `command_termination_timeout` is specified in seconds. Default value is 10. Optional parameter. -- `command_read_timeout` - Timeout for reading data from command stdout in milliseconds. Default value 10000. Optional parameter. -- `command_write_timeout` - Timeout for writing data to command stdin in milliseconds. Default value 10000. Optional parameter. -- `implicit_key` — The executable source file can return only values, and the correspondence to the requested keys is determined implicitly — by the order of rows in the result. Default value is false. 
-- `execute_direct` - If `execute_direct` = `1`, then `command` will be searched inside user_scripts folder specified by [user_scripts_path](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_scripts_path). Additional script arguments can be specified using a whitespace separator. Example: `script_name arg1 arg2`. If `execute_direct` = `0`, `command` is passed as argument for `bin/sh -c`. Default value is `0`. Optional parameter. -- `send_chunk_header` - controls whether to send row count before sending a chunk of data to process. Optional. Default value is `false`. - -That dictionary source can be configured only via XML configuration. Creating dictionaries with executable source via DDL is disabled; otherwise, the DB user would be able to execute arbitrary binaries on the ClickHouse node. - -## Executable Pool - -Executable pool allows loading data from pool of processes. This source does not work with dictionary layouts that need to load all data from source. Executable pool works if the dictionary [is stored](external-dicts-dict-layout.md#ways-to-store-dictionaries-in-memory) using `cache`, `complex_key_cache`, `ssd_cache`, `complex_key_ssd_cache`, `direct`, or `complex_key_direct` layouts. - -Executable pool will spawn a pool of processes with the specified command and keep them running until they exit. The program should read data from STDIN while it is available and output the result to STDOUT. It can wait for the next block of data on STDIN. ClickHouse will not close STDIN after processing a block of data, but will pipe another chunk of data when needed. The executable script should be ready for this way of data processing — it should poll STDIN and flush data to STDOUT early. - -Example of settings: - -``` xml - - - while read key; do printf "$key\tData for key $key\n"; done - TabSeparated - 10 - 10 - false - - -``` - -Setting fields: - -- `command` — The absolute path to the executable file, or the file name (if the program directory is written to `PATH`). -- `format` — The file format. All the formats described in “[Formats](../../../interfaces/formats.md#formats)†are supported. -- `pool_size` — Size of pool. If 0 is specified as `pool_size` then there is no pool size restrictions. Default value is `16`. -- `command_termination_timeout` — executable script should contain main read-write loop. After dictionary is destroyed, pipe is closed, and executable file will have `command_termination_timeout` seconds to shutdown, before ClickHouse will send SIGTERM signal to child process. Specified in seconds. Default value is 10. Optional parameter. -- `max_command_execution_time` — Maximum executable script command execution time for processing block of data. Specified in seconds. Default value is 10. Optional parameter. -- `command_read_timeout` - timeout for reading data from command stdout in milliseconds. Default value 10000. Optional parameter. -- `command_write_timeout` - timeout for writing data to command stdin in milliseconds. Default value 10000. Optional parameter. -- `implicit_key` — The executable source file can return only values, and the correspondence to the requested keys is determined implicitly — by the order of rows in the result. Default value is false. Optional parameter. 
-- `execute_direct` - If `execute_direct` = `1`, then `command` will be searched inside user_scripts folder specified by [user_scripts_path](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_scripts_path). Additional script arguments can be specified using whitespace separator. Example: `script_name arg1 arg2`. If `execute_direct` = `0`, `command` is passed as argument for `bin/sh -c`. Default value is `1`. Optional parameter. -- `send_chunk_header` - controls whether to send row count before sending a chunk of data to process. Optional. Default value is `false`. - -That dictionary source can be configured only via XML configuration. Creating dictionaries with executable source via DDL is disabled, otherwise, the DB user would be able to execute arbitrary binary on ClickHouse node. - -## Http(s) - -Working with an HTTP(s) server depends on [how the dictionary is stored in memory](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request via the `POST` method. - -Example of settings: - -``` xml - - - http://[::1]/os.tsv - TabSeparated - - user - password - - -
- API-KEY - key -
-
-
- -``` - -or - -``` sql -SOURCE(HTTP( - url 'http://[::1]/os.tsv' - format 'TabSeparated' - credentials(user 'user' password 'password') - headers(header(name 'API-KEY' value 'key')) -)) -``` - -In order for ClickHouse to access an HTTPS resource, you must [configure openSSL](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-openssl) in the server configuration. - -Setting fields: - -- `url` – The source URL. -- `format` – The file format. All the formats described in “[Formats](../../../interfaces/formats.md#formats)†are supported. -- `credentials` – Basic HTTP authentication. Optional parameter. -- `user` – Username required for the authentication. -- `password` – Password required for the authentication. -- `headers` – All custom HTTP headers entries used for the HTTP request. Optional parameter. -- `header` – Single HTTP header entry. -- `name` – Identifiant name used for the header send on the request. -- `value` – Value set for a specific identifiant name. - -When creating a dictionary using the DDL command (`CREATE DICTIONARY ...`) remote hosts for HTTP dictionaries are checked against the contents of `remote_url_allow_hosts` section from config to prevent database users to access arbitrary HTTP server. - -### Known Vulnerability of the ODBC Dictionary Functionality - -:::note -When connecting to the database through the ODBC driver connection parameter `Servername` can be substituted. In this case values of `USERNAME` and `PASSWORD` from `odbc.ini` are sent to the remote server and can be compromised. -::: - -**Example of insecure use** - -Let’s configure unixODBC for PostgreSQL. Content of `/etc/odbc.ini`: - -``` text -[gregtest] -Driver = /usr/lib/psqlodbca.so -Servername = localhost -PORT = 5432 -DATABASE = test_db -#OPTION = 3 -USERNAME = test -PASSWORD = test -``` - -If you then make a query such as - -``` sql -SELECT * FROM odbc('DSN=gregtest;Servername=some-server.com', 'test_db'); -``` - -ODBC driver will send values of `USERNAME` and `PASSWORD` from `odbc.ini` to `some-server.com`. - -### Example of Connecting Postgresql - -Ubuntu OS. - -Installing unixODBC and the ODBC driver for PostgreSQL: - -``` bash -$ sudo apt-get install -y unixodbc odbcinst odbc-postgresql -``` - -Configuring `/etc/odbc.ini` (or `~/.odbc.ini` if you signed in under a user that runs ClickHouse): - -``` text - [DEFAULT] - Driver = myconnection - - [myconnection] - Description = PostgreSQL connection to my_db - Driver = PostgreSQL Unicode - Database = my_db - Servername = 127.0.0.1 - UserName = username - Password = password - Port = 5432 - Protocol = 9.3 - ReadOnly = No - RowVersioning = No - ShowSystemTables = No - ConnSettings = -``` - -The dictionary configuration in ClickHouse: - -``` xml - - - table_name - - - - - DSN=myconnection - postgresql_table
-
- - - 300 - 360 - - - - - - - id - - - some_column - UInt64 - 0 - - -
-
-``` - -or - -``` sql -CREATE DICTIONARY table_name ( - id UInt64, - some_column UInt64 DEFAULT 0 -) -PRIMARY KEY id -SOURCE(ODBC(connection_string 'DSN=myconnection' table 'postgresql_table')) -LAYOUT(HASHED()) -LIFETIME(MIN 300 MAX 360) -``` - -You may need to edit `odbc.ini` to specify the full path to the library with the driver `DRIVER=/usr/local/lib/psqlodbcw.so`. - -### Example of Connecting MS SQL Server - -Ubuntu OS. - -Installing the ODBC driver for connecting to MS SQL: - -``` bash -$ sudo apt-get install tdsodbc freetds-bin sqsh -``` - -Configuring the driver: - -```bash - $ cat /etc/freetds/freetds.conf - ... - - [MSSQL] - host = 192.168.56.101 - port = 1433 - tds version = 7.0 - client charset = UTF-8 - - # test TDS connection - $ sqsh -S MSSQL -D database -U user -P password - - - $ cat /etc/odbcinst.ini - - [FreeTDS] - Description = FreeTDS - Driver = /usr/lib/x86_64-linux-gnu/odbc/libtdsodbc.so - Setup = /usr/lib/x86_64-linux-gnu/odbc/libtdsS.so - FileUsage = 1 - UsageCount = 5 - - $ cat /etc/odbc.ini - # $ cat ~/.odbc.ini # if you signed in under a user that runs ClickHouse - - [MSSQL] - Description = FreeTDS - Driver = FreeTDS - Servername = MSSQL - Database = test - UID = test - PWD = test - Port = 1433 - - - # (optional) test ODBC connection (to use isql-tool install the [unixodbc](https://packages.debian.org/sid/unixodbc)-package) - $ isql -v MSSQL "user" "password" -``` - -Remarks: -- to determine the earliest TDS version that is supported by a particular SQL Server version, refer to the product documentation or look at [MS-TDS Product Behavior](https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-tds/135d0ebe-5c4c-4a94-99bf-1811eccb9f4a) - -Configuring the dictionary in ClickHouse: - -``` xml - - - test - - - dict
- DSN=MSSQL;UID=test;PWD=test -
- - - - 300 - 360 - - - - - - - - - k - - - s - String - - - -
-
-``` - -or - -``` sql -CREATE DICTIONARY test ( - k UInt64, - s String DEFAULT '' -) -PRIMARY KEY k -SOURCE(ODBC(table 'dict' connection_string 'DSN=MSSQL;UID=test;PWD=test')) -LAYOUT(FLAT()) -LIFETIME(MIN 300 MAX 360) -``` - -## DBMS - -### ODBC - -You can use this method to connect any database that has an ODBC driver. - -Example of settings: - -``` xml - - - DatabaseName - ShemaName.TableName
- DSN=some_parameters - SQL_QUERY - SELECT id, value_1, value_2 FROM ShemaName.TableName -
- -``` - -or - -``` sql -SOURCE(ODBC( - db 'DatabaseName' - table 'SchemaName.TableName' - connection_string 'DSN=some_parameters' - invalidate_query 'SQL_QUERY' - query 'SELECT id, value_1, value_2 FROM db_name.table_name' -)) -``` - -Setting fields: - -- `db` – Name of the database. Omit it if the database name is set in the `` parameters. -- `table` – Name of the table and schema if exists. -- `connection_string` – Connection string. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md). -- `query` – The custom query. Optional parameter. - -:::note -The `table` and `query` fields cannot be used together. And either one of the `table` or `query` fields must be declared. -::: - -ClickHouse receives quoting symbols from ODBC-driver and quote all settings in queries to driver, so it’s necessary to set table name accordingly to table name case in database. - -If you have a problems with encodings when using Oracle, see the corresponding [FAQ](../../../faq/integration/oracle-odbc.md) item. - -### Mysql - -Example of settings: - -``` xml - - - 3306 - clickhouse - qwerty - - example01-1 - 1 - - - example01-2 - 1 - - db_name - table_name
- id=10 - SQL_QUERY - true - SELECT id, value_1, value_2 FROM db_name.table_name -
- -``` - -or - -``` sql -SOURCE(MYSQL( - port 3306 - user 'clickhouse' - password 'qwerty' - replica(host 'example01-1' priority 1) - replica(host 'example01-2' priority 1) - db 'db_name' - table 'table_name' - where 'id=10' - invalidate_query 'SQL_QUERY' - fail_on_connection_loss 'true' - query 'SELECT id, value_1, value_2 FROM db_name.table_name' -)) -``` - -Setting fields: - -- `port` – The port on the MySQL server. You can specify it for all replicas, or for each one individually (inside ``). - -- `user` – Name of the MySQL user. You can specify it for all replicas, or for each one individually (inside ``). - -- `password` – Password of the MySQL user. You can specify it for all replicas, or for each one individually (inside ``). - -- `replica` – Section of replica configurations. There can be multiple sections. - - - `replica/host` – The MySQL host. - - `replica/priority` – The replica priority. When attempting to connect, ClickHouse traverses the replicas in order of priority. The lower the number, the higher the priority. - -- `db` – Name of the database. - -- `table` – Name of the table. - -- `where` – The selection criteria. The syntax for conditions is the same as for `WHERE` clause in MySQL, for example, `id > 10 AND id < 20`. Optional parameter. - -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md). - -- `fail_on_connection_loss` – The configuration parameter that controls behavior of the server on connection loss. If `true`, an exception is thrown immediately if the connection between client and server was lost. If `false`, the ClickHouse server retries to execute the query three times before throwing an exception. Note that retrying leads to increased response times. Default value: `false`. - -- `query` – The custom query. Optional parameter. - -:::note -The `table` or `where` fields cannot be used together with the `query` field. And either one of the `table` or `query` fields must be declared. -::: - -:::note -There is no explicit parameter `secure`. When establishing an SSL-connection security is mandatory. -::: - -MySQL can be connected to on a local host via sockets. To do this, set `host` and `socket`. - -Example of settings: - -``` xml - - - localhost - /path/to/socket/file.sock - clickhouse - qwerty - db_name - table_name
- id=10 - SQL_QUERY - true - SELECT id, value_1, value_2 FROM db_name.table_name -
- -``` - -or - -``` sql -SOURCE(MYSQL( - host 'localhost' - socket '/path/to/socket/file.sock' - user 'clickhouse' - password 'qwerty' - db 'db_name' - table 'table_name' - where 'id=10' - invalidate_query 'SQL_QUERY' - fail_on_connection_loss 'true' - query 'SELECT id, value_1, value_2 FROM db_name.table_name' -)) -``` - -### ClickHouse - -Example of settings: - -``` xml - - - example01-01-1 - 9000 - default - - default - ids
- id=10 - 1 - SELECT id, value_1, value_2 FROM default.ids -
- -``` - -or - -``` sql -SOURCE(CLICKHOUSE( - host 'example01-01-1' - port 9000 - user 'default' - password '' - db 'default' - table 'ids' - where 'id=10' - secure 1 - query 'SELECT id, value_1, value_2 FROM default.ids' -)); -``` - -Setting fields: - -- `host` – The ClickHouse host. If it is a local host, the query is processed without any network activity. To improve fault tolerance, you can create a [Distributed](../../../engines/table-engines/special/distributed.md) table and enter it in subsequent configurations. -- `port` – The port on the ClickHouse server. -- `user` – Name of the ClickHouse user. -- `password` – Password of the ClickHouse user. -- `db` – Name of the database. -- `table` – Name of the table. -- `where` – The selection criteria. May be omitted. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md). -- `secure` - Use ssl for connection. -- `query` – The custom query. Optional parameter. - -:::note -The `table` or `where` fields cannot be used together with the `query` field. And either one of the `table` or `query` fields must be declared. -::: - -### Mongodb - -Example of settings: - -``` xml - - - localhost - 27017 - - - test - dictionary_source - - -``` - -or - -``` sql -SOURCE(MONGODB( - host 'localhost' - port 27017 - user '' - password '' - db 'test' - collection 'dictionary_source' -)) -``` - -Setting fields: - -- `host` – The MongoDB host. -- `port` – The port on the MongoDB server. -- `user` – Name of the MongoDB user. -- `password` – Password of the MongoDB user. -- `db` – Name of the database. -- `collection` – Name of the collection. - -### Redis - -Example of settings: - -``` xml - - - localhost - 6379 - simple - 0 - - -``` - -or - -``` sql -SOURCE(REDIS( - host 'localhost' - port 6379 - storage_type 'simple' - db_index 0 -)) -``` - -Setting fields: - -- `host` – The Redis host. -- `port` – The port on the Redis server. -- `storage_type` – The structure of internal Redis storage using for work with keys. `simple` is for simple sources and for hashed single key sources, `hash_map` is for hashed sources with two keys. Ranged sources and cache sources with complex key are unsupported. May be omitted, default value is `simple`. -- `db_index` – The specific numeric index of Redis logical database. May be omitted, default value is 0. - -### Cassandra - -Example of settings: - -``` xml - - - localhost - 9042 - username - qwerty123 - database_name - table_name - 1 - 1 - One - "SomeColumn" = 42 - 8 - SELECT id, value_1, value_2 FROM database_name.table_name - - -``` - -Setting fields: - -- `host` – The Cassandra host or comma-separated list of hosts. -- `port` – The port on the Cassandra servers. If not specified, default port 9042 is used. -- `user` – Name of the Cassandra user. -- `password` – Password of the Cassandra user. -- `keyspace` – Name of the keyspace (database). -- `column_family` – Name of the column family (table). -- `allow_filering` – Flag to allow or not potentially expensive conditions on clustering key columns. Default value is 1. -- `partition_key_prefix` – Number of partition key columns in primary key of the Cassandra table. Required for compose key dictionaries. Order of key columns in the dictionary definition must be the same as in Cassandra. Default value is 1 (the first key column is a partition key and other key columns are clustering key). 
-- `consistency` – Consistency level. Possible values: `One`, `Two`, `Three`, `All`, `EachQuorum`, `Quorum`, `LocalQuorum`, `LocalOne`, `Serial`, `LocalSerial`. Default value is `One`. -- `where` – Optional selection criteria. -- `max_threads` – The maximum number of threads to use for loading data from multiple partitions in compose key dictionaries. -- `query` – The custom query. Optional parameter. - -:::note -The `column_family` or `where` fields cannot be used together with the `query` field. And either one of the `column_family` or `query` fields must be declared. -::: - -### PostgreSQL - -Example of settings: - -``` xml - - - 5432 - clickhouse - qwerty - db_name - table_name
- id=10 - SQL_QUERY - SELECT id, value_1, value_2 FROM db_name.table_name -
- -``` - -or - -``` sql -SOURCE(POSTGRESQL( - port 5432 - host 'postgresql-hostname' - user 'postgres_user' - password 'postgres_password' - db 'db_name' - table 'table_name' - replica(host 'example01-1' port 5432 priority 1) - replica(host 'example01-2' port 5432 priority 2) - where 'id=10' - invalidate_query 'SQL_QUERY' - query 'SELECT id, value_1, value_2 FROM db_name.table_name' -)) -``` - -Setting fields: - -- `host` – The host on the PostgreSQL server. You can specify it for all replicas, or for each one individually (inside ``). -- `port` – The port on the PostgreSQL server. You can specify it for all replicas, or for each one individually (inside ``). -- `user` – Name of the PostgreSQL user. You can specify it for all replicas, or for each one individually (inside ``). -- `password` – Password of the PostgreSQL user. You can specify it for all replicas, or for each one individually (inside ``). -- `replica` – Section of replica configurations. There can be multiple sections: - - `replica/host` – The PostgreSQL host. - - `replica/port` – The PostgreSQL port. - - `replica/priority` – The replica priority. When attempting to connect, ClickHouse traverses the replicas in order of priority. The lower the number, the higher the priority. -- `db` – Name of the database. -- `table` – Name of the table. -- `where` – The selection criteria. The syntax for conditions is the same as for `WHERE` clause in PostgreSQL. For example, `id > 10 AND id < 20`. Optional parameter. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md). -- `query` – The custom query. Optional parameter. - -:::note -The `table` or `where` fields cannot be used together with the `query` field. And either one of the `table` or `query` fields must be declared. -::: - -## Null - -A special source that can be used to create dummy (empty) dictionaries. Such dictionaries can useful for tests or with setups with separated data and query nodes at nodes with Distributed tables. - -``` sql -CREATE DICTIONARY null_dict ( - id UInt64, - val UInt8, - default_val UInt8 DEFAULT 123, - nullable_val Nullable(UInt8) -) -PRIMARY KEY id -SOURCE(NULL()) -LAYOUT(FLAT()) -LIFETIME(0); -``` - -## Related Content - -- [Using dictionaries to accelerate queries](https://clickhouse.com/blog/faster-queries-dictionaries-clickhouse) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md deleted file mode 100644 index 8271a342941..00000000000 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md +++ /dev/null @@ -1,181 +0,0 @@ ---- -slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure -sidebar_position: 44 -sidebar_label: Dictionary Key and Fields ---- -import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; - -# Dictionary Key and Fields - - - -The `structure` clause describes the dictionary key and fields available for queries. - -XML description: - -``` xml - - - - Id - - - - - - - ... - - - -``` - -Attributes are described in the elements: - -- `` — [Key column](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-key). 
-- `` — [Data column](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes). There can be a multiple number of attributes. - -DDL query: - -``` sql -CREATE DICTIONARY dict_name ( - Id UInt64, - -- attributes -) -PRIMARY KEY Id -... -``` - -Attributes are described in the query body: - -- `PRIMARY KEY` — [Key column](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-key) -- `AttrName AttrType` — [Data column](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes). There can be a multiple number of attributes. - -## Key - -ClickHouse supports the following types of keys: - -- Numeric key. `UInt64`. Defined in the `` tag or using `PRIMARY KEY` keyword. -- Composite key. Set of values of different types. Defined in the tag `` or `PRIMARY KEY` keyword. - -An xml structure can contain either `` or ``. DDL-query must contain single `PRIMARY KEY`. - -:::warning -You must not describe key as an attribute. -::: - -### Numeric Key - -Type: `UInt64`. - -Configuration example: - -``` xml - - Id - -``` - -Configuration fields: - -- `name` – The name of the column with keys. - -For DDL-query: - -``` sql -CREATE DICTIONARY ( - Id UInt64, - ... -) -PRIMARY KEY Id -... -``` - -- `PRIMARY KEY` – The name of the column with keys. - -### Composite Key - -The key can be a `tuple` from any types of fields. The [layout](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md) in this case must be `complex_key_hashed` or `complex_key_cache`. - -:::tip -A composite key can consist of a single element. This makes it possible to use a string as the key, for instance. -::: - -The key structure is set in the element ``. Key fields are specified in the same format as the dictionary [attributes](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md). Example: - -``` xml - - - - field1 - String - - - field2 - UInt32 - - ... - -... -``` - -or - -``` sql -CREATE DICTIONARY ( - field1 String, - field2 String - ... -) -PRIMARY KEY field1, field2 -... -``` - -For a query to the `dictGet*` function, a tuple is passed as the key. Example: `dictGetString('dict_name', 'attr_name', tuple('string for field1', num_for_field2))`. - -## Attributes - -Configuration example: - -``` xml - - ... - - Name - ClickHouseDataType - - rand64() - true - true - true - - -``` - -or - -``` sql -CREATE DICTIONARY somename ( - Name ClickHouseDataType DEFAULT '' EXPRESSION rand64() HIERARCHICAL INJECTIVE IS_OBJECT_ID -) -``` - -Configuration fields: - -| Tag | Description | Required | -|------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------| -| `name` | Column name. 
| Yes | -| `type` | ClickHouse data type: [UInt8](../../../sql-reference/data-types/int-uint.md), [UInt16](../../../sql-reference/data-types/int-uint.md), [UInt32](../../../sql-reference/data-types/int-uint.md), [UInt64](../../../sql-reference/data-types/int-uint.md), [Int8](../../../sql-reference/data-types/int-uint.md), [Int16](../../../sql-reference/data-types/int-uint.md), [Int32](../../../sql-reference/data-types/int-uint.md), [Int64](../../../sql-reference/data-types/int-uint.md), [Float32](../../../sql-reference/data-types/float.md), [Float64](../../../sql-reference/data-types/float.md), [UUID](../../../sql-reference/data-types/uuid.md), [Decimal32](../../../sql-reference/data-types/decimal.md), [Decimal64](../../../sql-reference/data-types/decimal.md), [Decimal128](../../../sql-reference/data-types/decimal.md), [Decimal256](../../../sql-reference/data-types/decimal.md),[Date](../../../sql-reference/data-types/date), [Date32](../../../sql-reference/data-types/date32.md), [DateTime](../../../sql-reference/data-types/datetime.md), [DateTime64](../../../sql-reference/data-types/datetime64.md), [String](../../../sql-reference/data-types/string.md), [Array](../../../sql-reference/data-types/array.md).
ClickHouse tries to cast the value from the dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.
[Nullable](../../../sql-reference/data-types/nullable.md) is currently supported for [Flat](external-dicts-dict-layout.md#flat), [Hashed](external-dicts-dict-layout.md#dicts-external_dicts_dict_layout-hashed), [ComplexKeyHashed](external-dicts-dict-layout.md#complex-key-hashed), [Direct](external-dicts-dict-layout.md#direct), [ComplexKeyDirect](external-dicts-dict-layout.md#complex-key-direct), [RangeHashed](external-dicts-dict-layout.md#range-hashed), [Polygon](external-dicts-dict-polygon.md), [Cache](external-dicts-dict-layout.md#cache), [ComplexKeyCache](external-dicts-dict-layout.md#complex-key-cache), [SSDCache](external-dicts-dict-layout.md#ssd-cache), [SSDComplexKeyCache](external-dicts-dict-layout.md#complex-key-ssd-cache) dictionaries. In [IPTrie](external-dicts-dict-layout.md#ip-trie) dictionaries `Nullable` types are not supported. | Yes | -| `null_value` | Default value for a non-existing element.
In the example, it is an empty string. A [NULL](../../syntax.md#null-literal) value can be used only for the `Nullable` types (see the previous row with the type description). | Yes | -| `expression` | [Expression](../../../sql-reference/syntax.md#syntax-expressions) that ClickHouse executes on the value.
The expression can be a column name in the remote SQL database. Thus, you can use it to create an alias for the remote column.

Default value: no expression. | No | -| `hierarchical` | If `true`, the attribute contains the value of a parent key for the current key. See [Hierarchical Dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md).

Default value: `false`. | No | -| `injective` | Flag that shows whether the `id -> attribute` mapping is [injective](https://en.wikipedia.org/wiki/Injective_function).
If `true`, ClickHouse can automatically apply requests to dictionaries with injective attributes after the `GROUP BY` clause rather than before it. Usually this significantly reduces the number of such requests.

Default value: `false`. | No | -| `is_object_id` | Flag that shows whether the query is executed for a MongoDB document by `ObjectID`.

Default value: `false`. | No | - -**See Also** - -- [Functions for working with dictionaries](../../../sql-reference/functions/ext-dict-functions.md). - -## Related Content - -- [Using dictionaries to accelerate queries](https://clickhouse.com/blog/faster-queries-dictionaries-clickhouse) \ No newline at end of file diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md deleted file mode 100644 index a923511ca5e..00000000000 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md +++ /dev/null @@ -1,57 +0,0 @@ ---- -slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict -sidebar_position: 40 -sidebar_label: Configuring a Dictionary ---- -import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; - -# Configuring a Dictionary - - - -If dictionary is configured using xml file, than dictionary configuration has the following structure: - -``` xml - - dict_name - - - - - - - - - - - - - - - - - -``` - -Corresponding [DDL-query](../../../sql-reference/statements/create/dictionary.md) has the following structure: - -``` sql -CREATE DICTIONARY dict_name -( - ... -- attributes -) -PRIMARY KEY ... -- complex or single key configuration -SOURCE(...) -- Source configuration -LAYOUT(...) -- Memory layout configuration -LIFETIME(...) -- Lifetime of dictionary in memory -``` - -- `name` – The identifier that can be used to access the dictionary. Use the characters `[a-zA-Z0-9_\-]`. -- [source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md) — Source of the dictionary. -- [layout](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md) — Dictionary layout in memory. -- [structure](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md) — Structure of the dictionary . A key and attributes that can be retrieved by this key. -- [lifetime](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) — Frequency of dictionary updates. - -## Related Content - -- [Using dictionaries to accelerate queries](https://clickhouse.com/blog/faster-queries-dictionaries-clickhouse) \ No newline at end of file diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md deleted file mode 100644 index 8621c68b428..00000000000 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md +++ /dev/null @@ -1,84 +0,0 @@ ---- -slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts -sidebar_position: 39 -sidebar_label: General Description ---- -import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; - -# Dictionaries - -:::tip Tutorial -If you are getting started with Dictionaries in ClickHouse we have a tutorial that covers that topic. Take a look [here](/docs/en/tutorial.md). -::: - -You can add your own dictionaries from various data sources. The source for a dictionary can be a ClickHouse table, a local text or executable file, an HTTP(s) resource, or another DBMS. For more information, see “[Dictionary Sources](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md)â€. 
- -ClickHouse: - -- Fully or partially stores dictionaries in RAM. -- Periodically updates dictionaries and dynamically loads missing values. In other words, dictionaries can be loaded dynamically. -- Allows creating dictionaries with xml files or [DDL queries](../../../sql-reference/statements/create/dictionary.md). - -The configuration of dictionaries can be located in one or more xml-files. The path to the configuration is specified in the [dictionaries_config](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-dictionaries_config) parameter. - -Dictionaries can be loaded at server startup or at first use, depending on the [dictionaries_lazy_load](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-dictionaries_lazy_load) setting. - -The [dictionaries](../../../operations/system-tables/dictionaries.md#system_tables-dictionaries) system table contains information about dictionaries configured at server. For each dictionary you can find there: - -- Status of the dictionary. -- Configuration parameters. -- Metrics like amount of RAM allocated for the dictionary or a number of queries since the dictionary was successfully loaded. - - - -## Creating a dictionary with a DDL query - -Dictionaries can be created with [DDL queries](../../../sql-reference/statements/create/dictionary.md), and this is the recommended method because with DDL created dictionaries: -- No additional records are added to server configuration files -- The dictionaries can be worked with as first-class entities, like tables or views -- Data can be read directly, using familiar SELECT rather than dictionary table functions -- The dictionaries can be easily renamed - -## Creating a dictionary with a configuration file - -:::note -Creating a dictionary with a configuration file is not applicable to ClickHouse Cloud. Please use DDL (see above), and create your dictionary as user `default`. -::: - -The dictionary configuration file has the following format: - -``` xml - - An optional element with any content. Ignored by the ClickHouse server. - - - /etc/metrika.xml - - - - - - - - -``` - -You can [configure](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md) any number of dictionaries in the same file. - - -:::note -You can convert values for a small dictionary by describing it in a `SELECT` query (see the [transform](../../../sql-reference/functions/other-functions.md) function). This functionality is not related to dictionaries. 
-::: - -## See Also - -- [Configuring a Dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md) -- [Storing Dictionaries in Memory](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md) -- [Dictionary Updates](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) -- [Dictionary Sources](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md) -- [Dictionary Key and Fields](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md) -- [Functions for Working with Dictionaries](../../../sql-reference/functions/ext-dict-functions.md) - -## Related Content - -- [Using dictionaries to accelerate queries](https://clickhouse.com/blog/faster-queries-dictionaries-clickhouse) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/regexp-tree.md b/docs/en/sql-reference/dictionaries/external-dictionaries/regexp-tree.md deleted file mode 100644 index 5ad15b11d07..00000000000 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/regexp-tree.md +++ /dev/null @@ -1,76 +0,0 @@ ---- -slug: /en/sql-reference/dictionaries/external-dictionaries/regexp-tree -sidebar_position: 47 -sidebar_label: RegExp Tree Dictionary -title: "RegExp Tree Dictionary" ---- -import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; - -Regexp Tree dictionary stores multiple trees of regular expressions with attributions. Users can retrieve strings in the dictionary. If a string matches the root of the regexp tree, we will collect the corresponding attributes of the matched root and continue to walk the children. If any of the children matches the string, we will collect attributes and rewrite the old ones if conflicts occur, then continue the traverse until we reach leaf nodes. - -Example of the ddl query for creating Regexp Tree dictionary: - - - -```sql -create dictionary regexp_dict -( - regexp String, - name String, - version String -) -PRIMARY KEY(regexp) -SOURCE(YAMLRegExpTree(PATH '/var/lib/clickhouse/user_files/regexp_tree.yaml')) -LAYOUT(regexp_tree) -... -``` - -We only allow `YAMLRegExpTree` to work with regexp_tree dicitionary layout. If you want to use other sources, please set variable `regexp_dict_allow_other_sources` true. - -**Source** - -We introduce a type of source called `YAMLRegExpTree` representing the structure of Regexp Tree dictionary. An Example of a valid yaml config is like: - -```xml -- regexp: 'Linux/(\d+[\.\d]*).+tlinux' - name: 'TencentOS' - version: '\1' - -- regexp: '\d+/tclwebkit(?:\d+[\.\d]*)' - name: 'Andriod' - versions: - - regexp: '33/tclwebkit' - version: '13' - - regexp: '3[12]/tclwebkit' - version: '12' - - regexp: '30/tclwebkit' - version: '11' - - regexp: '29/tclwebkit' - version: '10' -``` - -The key `regexp` represents the regular expression of a tree node. The name of key is same as the dictionary key. The `name` and `version` is user-defined attributions in the dicitionary. The `versions` (which can be any name that not appear in attributions or the key) indicates the children nodes of this tree. - -**Back Reference** - -The value of an attribution could contain a back reference which refers to a capture group of the matched regular expression. Reference number ranges from 1 to 9 and writes as `$1` or `\1`. - -During the query execution, the back reference in the value will be replaced by the matched capture group. 
- -**Query** - -Due to the specialty of Regexp Tree dictionary, we only allow functions `dictGet`, `dictGetOrDefault` and `dictGetOrNull` work with it. - -Example: - -```sql -SELECT dictGet('regexp_dict', ('name', 'version'), '31/tclwebkit1024'); -``` - -Result: - -``` -┌─dictGet('regexp_dict', ('name', 'version'), '31/tclwebkit1024')─┠-│ ('Andriod','12') │ -└─────────────────────────────────────────────────────────────────┘ -``` diff --git a/docs/en/sql-reference/dictionaries/index.md b/docs/en/sql-reference/dictionaries/index.md index 9e6eed47d4a..2185e2b31c1 100644 --- a/docs/en/sql-reference/dictionaries/index.md +++ b/docs/en/sql-reference/dictionaries/index.md @@ -1,9 +1,12 @@ --- -slug: /en/sql-reference/dictionaries/ -sidebar_label: Dictionaries +slug: /en/sql-reference/dictionaries +sidebar_label: Defining Dictionaries sidebar_position: 35 --- +import SelfManaged from '@site/docs/en/_snippets/_self_managed_only_no_roadmap.md'; +import CloudDetails from '@site/docs/en/sql-reference/dictionaries/_snippet_dictionary_in_cloud.md'; + # Dictionaries A dictionary is a mapping (`key -> attributes`) that is convenient for various types of reference lists. @@ -12,5 +15,2349 @@ ClickHouse supports special functions for working with dictionaries that can be ClickHouse supports: -- [Dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md#dicts-external-dicts) with a [set of functions](../../sql-reference/functions/ext-dict-functions.md). -- [Embedded dictionaries](../../sql-reference/dictionaries/internal-dicts.md#internal_dicts) with a specific [set of functions](../../sql-reference/functions/ym-dict-functions.md). +- Dictionaries with a [set of functions](../../sql-reference/functions/ext-dict-functions.md). +- [Embedded dictionaries](#embedded_dictionaries) with a specific [set of functions](../../sql-reference/functions/ym-dict-functions.md). + + +:::tip Tutorial +If you are getting started with Dictionaries in ClickHouse we have a tutorial that covers that topic. Take a look [here](/docs/en/tutorial.md). +::: + +You can add your own dictionaries from various data sources. The source for a dictionary can be a ClickHouse table, a local text or executable file, an HTTP(s) resource, or another DBMS. For more information, see “[Dictionary Sources](#dictionary-sources)â€. + +ClickHouse: + +- Fully or partially stores dictionaries in RAM. +- Periodically updates dictionaries and dynamically loads missing values. In other words, dictionaries can be loaded dynamically. +- Allows creating dictionaries with xml files or [DDL queries](../../sql-reference/statements/create/dictionary.md). + +The configuration of dictionaries can be located in one or more xml-files. The path to the configuration is specified in the [dictionaries_config](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-dictionaries_config) parameter. + +Dictionaries can be loaded at server startup or at first use, depending on the [dictionaries_lazy_load](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-dictionaries_lazy_load) setting. + +The [dictionaries](../../operations/system-tables/dictionaries.md#system_tables-dictionaries) system table contains information about dictionaries configured at server. For each dictionary you can find there: + +- Status of the dictionary. +- Configuration parameters. 
+- Metrics like amount of RAM allocated for the dictionary or a number of queries since the dictionary was successfully loaded. + + + +## Creating a dictionary with a DDL query {#creating-a-dictionary-with-a-ddl-query} + +Dictionaries can be created with [DDL queries](../../sql-reference/statements/create/dictionary.md), and this is the recommended method because with DDL created dictionaries: +- No additional records are added to server configuration files +- The dictionaries can be worked with as first-class entities, like tables or views +- Data can be read directly, using familiar SELECT rather than dictionary table functions +- The dictionaries can be easily renamed + +## Creating a dictionary with a configuration file + +:::note +Creating a dictionary with a configuration file is not applicable to ClickHouse Cloud. Please use DDL (see above), and create your dictionary as user `default`. +::: + +The dictionary configuration file has the following format: + +``` xml + + An optional element with any content. Ignored by the ClickHouse server. + + + /etc/metrika.xml + + + + + + + + +``` + +You can [configure](#configuring-a-dictionary) any number of dictionaries in the same file. + + +:::note +You can convert values for a small dictionary by describing it in a `SELECT` query (see the [transform](../../sql-reference/functions/other-functions.md) function). This functionality is not related to dictionaries. +::: + +## Configuring a Dictionary {#configuring-a-dictionary} + + + +If dictionary is configured using xml file, than dictionary configuration has the following structure: + +``` xml + + dict_name + + + + + + + + + + + + + + + + + +``` + +Corresponding [DDL-query](../../sql-reference/statements/create/dictionary.md) has the following structure: + +``` sql +CREATE DICTIONARY dict_name +( + ... -- attributes +) +PRIMARY KEY ... -- complex or single key configuration +SOURCE(...) -- Source configuration +LAYOUT(...) -- Memory layout configuration +LIFETIME(...) -- Lifetime of dictionary in memory +``` + +## Storing Dictionaries in Memory {#storig-dictionaries-in-memory} + +There are a variety of ways to store dictionaries in memory. + +We recommend [flat](#flat), [hashed](#hashed) and [complex_key_hashed](#complex_key_hashed), which provide optimal processing speed. + +Caching is not recommended because of potentially poor performance and difficulties in selecting optimal parameters. Read more in the section [cache](#cache). + +There are several ways to improve dictionary performance: + +- Call the function for working with the dictionary after `GROUP BY`. +- Mark attributes to extract as injective. An attribute is called injective if different attribute values correspond to different keys. So when `GROUP BY` uses a function that fetches an attribute value by the key, this function is automatically taken out of `GROUP BY`. + +ClickHouse generates an exception for errors with dictionaries. Examples of errors: + +- The dictionary being accessed could not be loaded. +- Error querying a `cached` dictionary. + +You can view the list of dictionaries and their statuses in the [system.dictionaries](../../operations/system-tables/dictionaries.md) table. + + + +The configuration looks like this: + +``` xml + + + ... + + + + + + ... + + +``` + +Corresponding [DDL-query](../../sql-reference/statements/create/dictionary.md): + +``` sql +CREATE DICTIONARY (...) +... +LAYOUT(LAYOUT_TYPE(param value)) -- layout settings +... 
+``` + +Dictionaries without word `complex-key*` in a layout have a key with [UInt64](../../sql-reference/data-types/int-uint.md) type, `complex-key*` dictionaries have a composite key (complex, with arbitrary types). + +[UInt64](../../sql-reference/data-types/int-uint.md) keys in XML dictionaries are defined with `` tag. + +Configuration example (column key_column has UInt64 type): +```xml +... + + + key_column + +... +``` + +Composite `complex` keys XML dictionaries are defined `` tag. + +Configuration example of a composite key (key has one element with [String](../../sql-reference/data-types/string.md) type): +```xml +... + + + + country_code + String + + +... +``` + +## Ways to Store Dictionaries in Memory + +- [flat](#flat) +- [hashed](#hashed) +- [sparse_hashed](#sparse_hashed) +- [complex_key_hashed](#complex_key_hashed) +- [complex_key_sparse_hashed](#complex_key_sparse_hashed) +- [hashed_array](#hashed_array) +- [complex_key_hashed_array](#complex_key_hashed_array) +- [range_hashed](#range_hashed) +- [complex_key_range_hashed](#complex_key_range_hashed) +- [cache](#cache) +- [complex_key_cache](#complex_key_cache) +- [ssd_cache](#ssd_cache) +- [complex_key_ssd_cache](#complex_key_ssd_cache) +- [direct](#direct) +- [complex_key_direct](#complex_key_direct) +- [ip_trie](#ip_trie) + +### flat + +The dictionary is completely stored in memory in the form of flat arrays. How much memory does the dictionary use? The amount is proportional to the size of the largest key (in space used). + +The dictionary key has the [UInt64](../../sql-reference/data-types/int-uint.md) type and the value is limited to `max_array_size` (by default — 500,000). If a larger key is discovered when creating the dictionary, ClickHouse throws an exception and does not create the dictionary. Dictionary flat arrays initial size is controlled by `initial_array_size` setting (by default — 1024). + +All types of sources are supported. When updating, data (from a file or from a table) is read in it entirety. + +This method provides the best performance among all available methods of storing the dictionary. + +Configuration example: + +``` xml + + + 50000 + 5000000 + + +``` + +or + +``` sql +LAYOUT(FLAT(INITIAL_ARRAY_SIZE 50000 MAX_ARRAY_SIZE 5000000)) +``` + +### hashed + +The dictionary is completely stored in memory in the form of a hash table. The dictionary can contain any number of elements with any identifiers In practice, the number of keys can reach tens of millions of items. + +The dictionary key has the [UInt64](../../sql-reference/data-types/int-uint.md) type. + +All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety. + +Configuration example: + +``` xml + + + +``` + +or + +``` sql +LAYOUT(HASHED()) +``` + +If `shards` greater then 1 (default is `1`) the dictionary will load data in parallel, useful if you have huge amount of elements in one dictionary. + +Configuration example: + +``` xml + + + 10 + + 10000 + + +``` + +or + +``` sql +LAYOUT(HASHED(SHARDS 10 [SHARD_LOAD_QUEUE_BACKLOG 10000])) +``` + +### sparse_hashed + +Similar to `hashed`, but uses less memory in favor more CPU usage. + +The dictionary key has the [UInt64](../../sql-reference/data-types/int-uint.md) type. + +Configuration example: + +``` xml + + + +``` + +or + +``` sql +LAYOUT(SPARSE_HASHED()) +``` + +It is also possible to use `shards` for this type of dictionary, and again it is more important for `sparse_hashed` then for `hashed`, since `sparse_hashed` is slower. 
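For example, a sharded `sparse_hashed` layout could be declared as follows (a minimal sketch; the shard count of 4 and the queue backlog value are only illustrative, assuming `sparse_hashed` accepts the same `SHARDS`/`SHARD_LOAD_QUEUE_BACKLOG` parameters shown for `hashed` above):

``` sql
LAYOUT(SPARSE_HASHED(SHARDS 4 SHARD_LOAD_QUEUE_BACKLOG 10000))
```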
+ +### complex_key_hashed + +This type of storage is for use with composite [keys](#dictionary-key-and-fields). Similar to `hashed`. + +Configuration example: + +``` xml + + + 1 + + + +``` + +or + +``` sql +LAYOUT(COMPLEX_KEY_HASHED([SHARDS 1] [SHARD_LOAD_QUEUE_BACKLOG 10000])) +``` + +### complex_key_sparse_hashed + +This type of storage is for use with composite [keys](#dictionary-key-and-fields). Similar to [sparse_hashed](#sparse_hashed). + +Configuration example: + +``` xml + + + 1 + + +``` + +or + +``` sql +LAYOUT(COMPLEX_KEY_SPARSE_HASHED([SHARDS 1] [SHARD_LOAD_QUEUE_BACKLOG 10000])) +``` + +### hashed_array + +The dictionary is completely stored in memory. Each attribute is stored in an array. The key attribute is stored in the form of a hashed table where value is an index in the attributes array. The dictionary can contain any number of elements with any identifiers. In practice, the number of keys can reach tens of millions of items. + +The dictionary key has the [UInt64](../../sql-reference/data-types/int-uint.md) type. + +All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety. + +Configuration example: + +``` xml + + + + +``` + +or + +``` sql +LAYOUT(HASHED_ARRAY()) +``` + +### complex_key_hashed_array + +This type of storage is for use with composite [keys](#dictionary-key-and-fields). Similar to [hashed_array](#hashed_array). + +Configuration example: + +``` xml + + + +``` + +or + +``` sql +LAYOUT(COMPLEX_KEY_HASHED_ARRAY()) +``` + +### range_hashed {#range_hashed} + +The dictionary is stored in memory in the form of a hash table with an ordered array of ranges and their corresponding values. + +The dictionary key has the [UInt64](../../sql-reference/data-types/int-uint.md) type. +This storage method works the same way as hashed and allows using date/time (arbitrary numeric type) ranges in addition to the key. + +Example: The table contains discounts for each advertiser in the format: + +``` text +┌─advertiser_id─┬─discount_start_date─┬─discount_end_date─┬─amount─┠+│ 123 │ 2015-01-16 │ 2015-01-31 │ 0.25 │ +│ 123 │ 2015-01-01 │ 2015-01-15 │ 0.15 │ +│ 456 │ 2015-01-01 │ 2015-01-15 │ 0.05 │ +└───────────────┴─────────────────────┴───────────────────┴────────┘ +``` + +To use a sample for date ranges, define the `range_min` and `range_max` elements in the [structure](#dictionary-key-and-fields). These elements must contain elements `name` and `type` (if `type` is not specified, the default type will be used - Date). `type` can be any numeric type (Date / DateTime / UInt64 / Int32 / others). + +:::warning +Values of `range_min` and `range_max` should fit in `Int64` type. +::: + +Example: + +``` xml + + + + min + + + + + advertiser_id + + + discount_start_date + Date + + + discount_end_date + Date + + ... 
+``` + +or + +``` sql +CREATE DICTIONARY discounts_dict ( + advertiser_id UInt64, + discount_start_date Date, + discount_end_date Date, + amount Float64 +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(TABLE 'discounts')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(RANGE_HASHED(range_lookup_strategy 'max')) +RANGE(MIN discount_start_date MAX discount_end_date) +``` + +To work with these dictionaries, you need to pass an additional argument to the `dictGet` function, for which a range is selected: + +``` sql +dictGet('dict_name', 'attr_name', id, date) +``` +Query example: + +``` sql +SELECT dictGet('discounts_dict', 'amount', 1, '2022-10-20'::Date); +``` + +This function returns the value for the specified `id`s and the date range that includes the passed date. + +Details of the algorithm: + +- If the `id` is not found or a range is not found for the `id`, it returns the default value of the attribute's type. +- If there are overlapping ranges and `range_lookup_strategy=min`, it returns a matching range with minimal `range_min`, if several ranges found, it returns a range with minimal `range_max`, if again several ranges found (several ranges had the same `range_min` and `range_max` it returns a random range of them. +- If there are overlapping ranges and `range_lookup_strategy=max`, it returns a matching range with maximal `range_min`, if several ranges found, it returns a range with maximal `range_max`, if again several ranges found (several ranges had the same `range_min` and `range_max` it returns a random range of them. +- If the `range_max` is `NULL`, the range is open. `NULL` is treated as maximal possible value. For the `range_min` `1970-01-01` or `0` (-MAX_INT) can be used as the open value. + +Configuration example: + +``` xml + + + ... + + + + + + + + Abcdef + + + StartTimeStamp + UInt64 + + + EndTimeStamp + UInt64 + + + XXXType + String + + + + + + +``` + +or + +``` sql +CREATE DICTIONARY somedict( + Abcdef UInt64, + StartTimeStamp UInt64, + EndTimeStamp UInt64, + XXXType String DEFAULT '' +) +PRIMARY KEY Abcdef +RANGE(MIN StartTimeStamp MAX EndTimeStamp) +``` + +Configuration example with overlapping ranges and open ranges: + +```sql +CREATE TABLE discounts +( + advertiser_id UInt64, + discount_start_date Date, + discount_end_date Nullable(Date), + amount Float64 +) +ENGINE = Memory; + +INSERT INTO discounts VALUES (1, '2015-01-01', Null, 0.1); +INSERT INTO discounts VALUES (1, '2015-01-15', Null, 0.2); +INSERT INTO discounts VALUES (2, '2015-01-01', '2015-01-15', 0.3); +INSERT INTO discounts VALUES (2, '2015-01-04', '2015-01-10', 0.4); +INSERT INTO discounts VALUES (3, '1970-01-01', '2015-01-15', 0.5); +INSERT INTO discounts VALUES (3, '1970-01-01', '2015-01-10', 0.6); + +SELECT * FROM discounts ORDER BY advertiser_id, discount_start_date; +┌─advertiser_id─┬─discount_start_date─┬─discount_end_date─┬─amount─┠+│ 1 │ 2015-01-01 │ á´ºáµá´¸á´¸ │ 0.1 │ +│ 1 │ 2015-01-15 │ á´ºáµá´¸á´¸ │ 0.2 │ +│ 2 │ 2015-01-01 │ 2015-01-15 │ 0.3 │ +│ 2 │ 2015-01-04 │ 2015-01-10 │ 0.4 │ +│ 3 │ 1970-01-01 │ 2015-01-15 │ 0.5 │ +│ 3 │ 1970-01-01 │ 2015-01-10 │ 0.6 │ +└───────────────┴─────────────────────┴───────────────────┴────────┘ + +-- RANGE_LOOKUP_STRATEGY 'max' + +CREATE DICTIONARY discounts_dict +( + advertiser_id UInt64, + discount_start_date Date, + discount_end_date Nullable(Date), + amount Float64 +) +PRIMARY KEY advertiser_id +SOURCE(CLICKHOUSE(TABLE discounts)) +LIFETIME(MIN 600 MAX 900) +LAYOUT(RANGE_HASHED(RANGE_LOOKUP_STRATEGY 'max')) +RANGE(MIN discount_start_date MAX discount_end_date); + +select 
dictGet('discounts_dict', 'amount', 1, toDate('2015-01-14')) res; +┌─res─┠+│ 0.1 │ -- the only one range is matching: 2015-01-01 - Null +└─────┘ + +select dictGet('discounts_dict', 'amount', 1, toDate('2015-01-16')) res; +┌─res─┠+│ 0.2 │ -- two ranges are matching, range_min 2015-01-15 (0.2) is bigger than 2015-01-01 (0.1) +└─────┘ + +select dictGet('discounts_dict', 'amount', 2, toDate('2015-01-06')) res; +┌─res─┠+│ 0.4 │ -- two ranges are matching, range_min 2015-01-04 (0.4) is bigger than 2015-01-01 (0.3) +└─────┘ + +select dictGet('discounts_dict', 'amount', 3, toDate('2015-01-01')) res; +┌─res─┠+│ 0.5 │ -- two ranges are matching, range_min are equal, 2015-01-15 (0.5) is bigger than 2015-01-10 (0.6) +└─────┘ + +DROP DICTIONARY discounts_dict; + +-- RANGE_LOOKUP_STRATEGY 'min' + +CREATE DICTIONARY discounts_dict +( + advertiser_id UInt64, + discount_start_date Date, + discount_end_date Nullable(Date), + amount Float64 +) +PRIMARY KEY advertiser_id +SOURCE(CLICKHOUSE(TABLE discounts)) +LIFETIME(MIN 600 MAX 900) +LAYOUT(RANGE_HASHED(RANGE_LOOKUP_STRATEGY 'min')) +RANGE(MIN discount_start_date MAX discount_end_date); + +select dictGet('discounts_dict', 'amount', 1, toDate('2015-01-14')) res; +┌─res─┠+│ 0.1 │ -- the only one range is matching: 2015-01-01 - Null +└─────┘ + +select dictGet('discounts_dict', 'amount', 1, toDate('2015-01-16')) res; +┌─res─┠+│ 0.1 │ -- two ranges are matching, range_min 2015-01-01 (0.1) is less than 2015-01-15 (0.2) +└─────┘ + +select dictGet('discounts_dict', 'amount', 2, toDate('2015-01-06')) res; +┌─res─┠+│ 0.3 │ -- two ranges are matching, range_min 2015-01-01 (0.3) is less than 2015-01-04 (0.4) +└─────┘ + +select dictGet('discounts_dict', 'amount', 3, toDate('2015-01-01')) res; +┌─res─┠+│ 0.6 │ -- two ranges are matching, range_min are equal, 2015-01-10 (0.6) is less than 2015-01-15 (0.5) +└─────┘ +``` + +### complex_key_range_hashed + +The dictionary is stored in memory in the form of a hash table with an ordered array of ranges and their corresponding values (see [range_hashed](#range_hashed)). This type of storage is for use with composite [keys](#dictionary-key-and-fields). + +Configuration example: + +``` sql +CREATE DICTIONARY range_dictionary +( + CountryID UInt64, + CountryKey String, + StartDate Date, + EndDate Date, + Tax Float64 DEFAULT 0.2 +) +PRIMARY KEY CountryID, CountryKey +SOURCE(CLICKHOUSE(TABLE 'date_table')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(COMPLEX_KEY_RANGE_HASHED()) +RANGE(MIN StartDate MAX EndDate); +``` + +### cache + +The dictionary is stored in a cache that has a fixed number of cells. These cells contain frequently used elements. + +The dictionary key has the [UInt64](../../sql-reference/data-types/int-uint.md) type. + +When searching for a dictionary, the cache is searched first. For each block of data, all keys that are not found in the cache or are outdated are requested from the source using `SELECT attrs... FROM db.table WHERE id IN (k1, k2, ...)`. The received data is then written to the cache. + +If keys are not found in dictionary, then update cache task is created and added into update queue. Update queue properties can be controlled with settings `max_update_queue_size`, `update_queue_push_timeout_milliseconds`, `query_wait_timeout_milliseconds`, `max_threads_for_updates`. + +For cache dictionaries, the expiration [lifetime](#dictionary-updates) of data in the cache can be set. If more time than `lifetime` has passed since loading the data in a cell, the cell’s value is not used and key becomes expired. 
The key is re-requested the next time it needs to be used. This behaviour can be configured with setting `allow_read_expired_keys`. + +This is the least effective of all the ways to store dictionaries. The speed of the cache depends strongly on correct settings and the usage scenario. A cache type dictionary performs well only when the hit rates are high enough (recommended 99% and higher). You can view the average hit rate in the [system.dictionaries](../../operations/system-tables/dictionaries.md) table. + +If setting `allow_read_expired_keys` is set to 1, by default 0. Then dictionary can support asynchronous updates. If a client requests keys and all of them are in cache, but some of them are expired, then dictionary will return expired keys for a client and request them asynchronously from the source. + +To improve cache performance, use a subquery with `LIMIT`, and call the function with the dictionary externally. + +All types of sources are supported. + +Example of settings: + +``` xml + + + + 1000000000 + + 0 + + 100000 + + 10 + + 60000 + + 4 + + +``` + +or + +``` sql +LAYOUT(CACHE(SIZE_IN_CELLS 1000000000)) +``` + +Set a large enough cache size. You need to experiment to select the number of cells: + +1. Set some value. +2. Run queries until the cache is completely full. +3. Assess memory consumption using the `system.dictionaries` table. +4. Increase or decrease the number of cells until the required memory consumption is reached. + +:::warning +Do not use ClickHouse as a source, because it is slow to process queries with random reads. +::: + +### complex_key_cache + +This type of storage is for use with composite [keys](#dictionary-key-and-fields). Similar to `cache`. + +### ssd_cache + +Similar to `cache`, but stores data on SSD and index in RAM. All cache dictionary settings related to update queue can also be applied to SSD cache dictionaries. + +The dictionary key has the [UInt64](../../sql-reference/data-types/int-uint.md) type. + +``` xml + + + + 4096 + + 16777216 + + 131072 + + 1048576 + + /var/lib/clickhouse/user_files/test_dict + + +``` + +or + +``` sql +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 + PATH '/var/lib/clickhouse/user_files/test_dict')) +``` + +### complex_key_ssd_cache + +This type of storage is for use with composite [keys](#dictionary-key-and-fields). Similar to `ssd_cache`. + +### direct + +The dictionary is not stored in memory and directly goes to the source during the processing of a request. + +The dictionary key has the [UInt64](../../sql-reference/data-types/int-uint.md) type. + +All types of [sources](#dictionary-sources), except local files, are supported. + +Configuration example: + +``` xml + + + +``` + +or + +``` sql +LAYOUT(DIRECT()) +``` + +### complex_key_direct + +This type of storage is for use with composite [keys](#dictionary-key-and-fields). Similar to `direct`. + +### ip_trie + +This type of storage is for mapping network prefixes (IP addresses) to metadata such as ASN. + +**Example** + +Suppose we have a table in ClickHouse that contains our IP prefixes and mappings: + +```sql +CREATE TABLE my_ip_addresses ( + prefix String, + asn UInt32, + cca2 String +) +ENGINE = MergeTree +PRIMARY KEY prefix; +``` + +```sql +INSERT INTO my_ip_addresses VALUES + ('202.79.32.0/20', 17501, 'NP'), + ('2620:0:870::/48', 3856, 'US'), + ('2a02:6b8:1::/48', 13238, 'RU'), + ('2001:db8::/32', 65536, 'ZZ') +; +``` + +Let's define an `ip_trie` dictionary for this table. 
The `ip_trie` layout requires a composite key: + +``` xml + + + + prefix + String + + + + asn + UInt32 + + + + cca2 + String + ?? + + ... + + + + + + true + + +``` + +or + +``` sql +CREATE DICTIONARY my_ip_trie_dictionary ( + prefix String, + asn UInt32, + cca2 String DEFAULT '??' +) +PRIMARY KEY prefix +SOURCE(CLICKHOUSE(TABLE 'my_ip_addresses')) +LAYOUT(IP_TRIE) +LIFETIME(3600); +``` + +The key must have only one `String` type attribute that contains an allowed IP prefix. Other types are not supported yet. + +For queries, you must use the same functions (`dictGetT` with a tuple) as for dictionaries with composite keys. The syntax is: + +``` sql +dictGetT('dict_name', 'attr_name', tuple(ip)) +``` + +The function takes either `UInt32` for IPv4, or `FixedString(16)` for IPv6. For example: + +``` sql +select dictGet('my_ip_trie_dictionary', 'asn', tuple(IPv6StringToNum('2001:db8::1'))) +``` + +Other types are not supported yet. The function returns the attribute for the prefix that corresponds to this IP address. If there are overlapping prefixes, the most specific one is returned. + +Data must completely fit into RAM. + +## Dictionary Updates {#dictionary-updates} + +ClickHouse periodically updates the dictionaries. The update interval for fully downloaded dictionaries and the invalidation interval for cached dictionaries are defined in the `lifetime` tag in seconds. + +Dictionary updates (other than loading for first use) do not block queries. During updates, the old version of a dictionary is used. If an error occurs during an update, the error is written to the server log, and queries continue using the old version of dictionaries. + +Example of settings: + + + +``` xml + + ... + 300 + ... + +``` + +or + +``` sql +CREATE DICTIONARY (...) +... +LIFETIME(300) +... +``` + +Setting `0` (`LIFETIME(0)`) prevents dictionaries from updating. + +You can set a time interval for updates, and ClickHouse will choose a uniformly random time within this range. This is necessary in order to distribute the load on the dictionary source when updating on a large number of servers. + +Example of settings: + +``` xml + + ... + + 300 + 360 + + ... + +``` + +or + +``` sql +LIFETIME(MIN 300 MAX 360) +``` + +If `0` and `0`, ClickHouse does not reload the dictionary by timeout. +In this case, ClickHouse can reload the dictionary earlier if the dictionary configuration file was changed or the `SYSTEM RELOAD DICTIONARY` command was executed. + +When updating the dictionaries, the ClickHouse server applies different logic depending on the type of [source](#dictionary-sources): + +- For a text file, it checks the time of modification. If the time differs from the previously recorded time, the dictionary is updated. +- For MySQL source, the time of modification is checked using a `SHOW TABLE STATUS` query (in case of MySQL 8 you need to disable meta-information caching in MySQL by `set global information_schema_stats_expiry=0`). +- Dictionaries from other sources are updated every time by default. + +For other sources (ODBC, PostgreSQL, ClickHouse, etc), you can set up a query that will update the dictionaries only if they really changed, rather than each time. To do this, follow these steps: + +- The dictionary table must have a field that always changes when the source data is updated. +- The settings of the source must specify a query that retrieves the changing field. The ClickHouse server interprets the query result as a row, and if this row has changed relative to its previous state, the dictionary is updated. 
Specify the query in the `` field in the settings for the [source](#dictionary-sources). + +Example of settings: + +``` xml + + ... + + ... + SELECT update_time FROM dictionary_source where id = 1 + + ... + +``` + +or + +``` sql +... +SOURCE(ODBC(... invalidate_query 'SELECT update_time FROM dictionary_source where id = 1')) +... +``` + +For `Cache`, `ComplexKeyCache`, `SSDCache`, and `SSDComplexKeyCache` dictionaries both synchronious and asynchronious updates are supported. + +It is also possible for `Flat`, `Hashed`, `ComplexKeyHashed` dictionaries to only request data that was changed after the previous update. If `update_field` is specified as part of the dictionary source configuration, value of the previous update time in seconds will be added to the data request. Depends on source type (Executable, HTTP, MySQL, PostgreSQL, ClickHouse, or ODBC) different logic will be applied to `update_field` before request data from an external source. + +- If the source is HTTP then `update_field` will be added as a query parameter with the last update time as the parameter value. +- If the source is Executable then `update_field` will be added as an executable script argument with the last update time as the argument value. +- If the source is ClickHouse, MySQL, PostgreSQL, ODBC there will be an additional part of `WHERE`, where `update_field` is compared as greater or equal with the last update time. + - Per default, this `WHERE`-condition is checked at the highest level of the SQL-Query. Alternatively, the condition can be checked in any other `WHERE`-clause within the query using the `{condition}`-keyword. Example: + ```sql + ... + SOURCE(CLICKHOUSE(... + update_field 'added_time' + QUERY ' + SELECT my_arr.1 AS x, my_arr.2 AS y, creation_time + FROM ( + SELECT arrayZip(x_arr, y_arr) AS my_arr, creation_time + FROM dictionary_source + WHERE {condition} + )' + )) + ... + ``` + +If `update_field` option is set, additional option `update_lag` can be set. Value of `update_lag` option is subtracted from previous update time before request updated data. + +Example of settings: + +``` xml + + ... + + ... + added_time + 15 + + ... + +``` + +or + +``` sql +... +SOURCE(CLICKHOUSE(... update_field 'added_time' update_lag 15)) +... +``` + +## Dictionary Sources {#dictionary-sources} + + + +A dictionary can be connected to ClickHouse from many different sources. + +If the dictionary is configured using an xml-file, the configuration looks like this: + +``` xml + + + ... + + + + + + ... + + ... + +``` + +In case of [DDL-query](../../sql-reference/statements/create/dictionary.md), the configuration described above will look like: + +``` sql +CREATE DICTIONARY dict_name (...) +... +SOURCE(SOURCE_TYPE(param1 val1 ... paramN valN)) -- Source configuration +... +``` + +The source is configured in the `source` section. 
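+
+For instance, a complete DDL definition that combines a source with the layout, lifetime, and `invalidate_query` options discussed above might look like the following sketch (the dictionary, table, column, and connection parameters here are hypothetical placeholders):
+
+``` sql
+CREATE DICTIONARY example_dict
+(
+    id UInt64,
+    value String DEFAULT ''
+)
+PRIMARY KEY id
+SOURCE(CLICKHOUSE(
+    host 'localhost'
+    port 9000
+    user 'default'
+    password ''
+    db 'db_name'
+    table 'source_table'
+    invalidate_query 'SELECT max(modified_at) FROM db_name.source_table'
+))
+LAYOUT(HASHED())
+LIFETIME(MIN 300 MAX 360);
+```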
+ +For source types [Local file](#local_file), [Executable file](#executable), [HTTP(s)](#https), [ClickHouse](#clickhouse) +optional settings are available: + +``` xml + + + /opt/dictionaries/os.tsv + TabSeparated + + + 0 + + +``` + +or + +``` sql +SOURCE(FILE(path './user_files/os.tsv' format 'TabSeparated')) +SETTINGS(format_csv_allow_single_quotes = 0) +``` + +Types of sources (`source_type`): + +- [Local file](#local_file) +- [Executable File](#executable) +- [Executable Pool](#executable_pool) +- [HTTP(s)](#http) +- DBMS + - [ODBC](#odbc) + - [MySQL](#mysql) + - [ClickHouse](#clickhouse) + - [MongoDB](#mongodb) + - [Redis](#redis) + - [Cassandra](#cassandra) + - [PostgreSQL](#postgresql) + +## Local File {#local_file} + +Example of settings: + +``` xml + + + /opt/dictionaries/os.tsv + TabSeparated + + +``` + +or + +``` sql +SOURCE(FILE(path './user_files/os.tsv' format 'TabSeparated')) +``` + +Setting fields: + +- `path` – The absolute path to the file. +- `format` – The file format. All the formats described in [Formats](../../interfaces/formats.md#formats) are supported. + +When a dictionary with source `FILE` is created via DDL command (`CREATE DICTIONARY ...`), the source file needs to be located in the `user_files` directory to prevent DB users from accessing arbitrary files on the ClickHouse node. + +**See Also** + +- [Dictionary function](../../sql-reference/table-functions/dictionary.md#dictionary-function) + +## Executable File {#executable} + +Working with executable files depends on [how the dictionary is stored in memory](#storig-dictionaries-in-memory). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request to the executable file’s STDIN. Otherwise, ClickHouse starts the executable file and treats its output as dictionary data. + +Example of settings: + +``` xml + + + cat /opt/dictionaries/os.tsv + TabSeparated + false + + +``` + +Setting fields: + +- `command` — The absolute path to the executable file, or the file name (if the command's directory is in the `PATH`). +- `format` — The file format. All the formats described in [Formats](../../interfaces/formats.md#formats) are supported. +- `command_termination_timeout` — The executable script should contain a main read-write loop. After the dictionary is destroyed, the pipe is closed, and the executable file will have `command_termination_timeout` seconds to shutdown before ClickHouse will send a SIGTERM signal to the child process. `command_termination_timeout` is specified in seconds. Default value is 10. Optional parameter. +- `command_read_timeout` - Timeout for reading data from command stdout in milliseconds. Default value 10000. Optional parameter. +- `command_write_timeout` - Timeout for writing data to command stdin in milliseconds. Default value 10000. Optional parameter. +- `implicit_key` — The executable source file can return only values, and the correspondence to the requested keys is determined implicitly — by the order of rows in the result. Default value is false. +- `execute_direct` - If `execute_direct` = `1`, then `command` will be searched inside user_scripts folder specified by [user_scripts_path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_scripts_path). Additional script arguments can be specified using a whitespace separator. Example: `script_name arg1 arg2`. If `execute_direct` = `0`, `command` is passed as argument for `bin/sh -c`. Default value is `0`. 
Optional parameter.
+- `send_chunk_header` - Controls whether to send the row count before sending a chunk of data to process. Optional. Default value is `false`.
+
+That dictionary source can be configured only via XML configuration. Creating dictionaries with executable source via DDL is disabled; otherwise, the DB user would be able to execute arbitrary binaries on the ClickHouse node.
+
+## Executable Pool {#executable_pool}
+
+Executable pool allows loading data from a pool of processes. This source does not work with dictionary layouts that need to load all data from the source. Executable pool works if the dictionary [is stored](#ways-to-store-dictionaries-in-memory) using `cache`, `complex_key_cache`, `ssd_cache`, `complex_key_ssd_cache`, `direct`, or `complex_key_direct` layouts.
+
+Executable pool will spawn a pool of processes with the specified command and keep them running until they exit. The program should read data from STDIN while it is available and output the result to STDOUT. It can wait for the next block of data on STDIN. ClickHouse will not close STDIN after processing a block of data, but will pipe another chunk of data when needed. The executable script should be ready for this way of data processing — it should poll STDIN and flush data to STDOUT early.
+
+Example of settings:
+
+``` xml
+
+    
+        while read key; do printf "$key\tData for key $key\n"; done
+        TabSeparated
+        10
+        10
+        false
+    
+
+```
+
+Setting fields:
+
+- `command` — The absolute path to the executable file, or the file name (if the program directory is written to `PATH`).
+- `format` — The file format. All the formats described in [Formats](../../interfaces/formats.md#formats) are supported.
+- `pool_size` — Size of the pool. If 0 is specified as `pool_size` then there are no pool size restrictions. Default value is `16`.
+- `command_termination_timeout` — The executable script should contain a main read-write loop. After the dictionary is destroyed, the pipe is closed, and the executable file will have `command_termination_timeout` seconds to shut down before ClickHouse sends a SIGTERM signal to the child process. Specified in seconds. Default value is 10. Optional parameter.
+- `max_command_execution_time` — Maximum executable script command execution time for processing a block of data. Specified in seconds. Default value is 10. Optional parameter.
+- `command_read_timeout` - Timeout for reading data from command stdout in milliseconds. Default value 10000. Optional parameter.
+- `command_write_timeout` - Timeout for writing data to command stdin in milliseconds. Default value 10000. Optional parameter.
+- `implicit_key` — The executable source file can return only values, and the correspondence to the requested keys is determined implicitly — by the order of rows in the result. Default value is false. Optional parameter.
+- `execute_direct` - If `execute_direct` = `1`, then `command` will be searched inside the user_scripts folder specified by [user_scripts_path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_scripts_path). Additional script arguments can be specified using a whitespace separator. Example: `script_name arg1 arg2`. If `execute_direct` = `0`, `command` is passed as an argument to `bin/sh -c`. Default value is `1`. Optional parameter.
+- `send_chunk_header` - Controls whether to send the row count before sending a chunk of data to process. Optional. Default value is `false`.
+
+That dictionary source can be configured only via XML configuration.
Creating dictionaries with executable source via DDL is disabled, otherwise, the DB user would be able to execute arbitrary binary on ClickHouse node. + +## Http(s) {#https} + +Working with an HTTP(s) server depends on [how the dictionary is stored in memory](#storig-dictionaries-in-memory). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request via the `POST` method. + +Example of settings: + +``` xml + + + http://[::1]/os.tsv + TabSeparated + + user + password + + +
+ API-KEY + key +
+
+
+ +``` + +or + +``` sql +SOURCE(HTTP( + url 'http://[::1]/os.tsv' + format 'TabSeparated' + credentials(user 'user' password 'password') + headers(header(name 'API-KEY' value 'key')) +)) +``` + +In order for ClickHouse to access an HTTPS resource, you must [configure openSSL](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-openssl) in the server configuration. + +Setting fields: + +- `url` – The source URL. +- `format` – The file format. All the formats described in “[Formats](../../interfaces/formats.md#formats)†are supported. +- `credentials` – Basic HTTP authentication. Optional parameter. +- `user` – Username required for the authentication. +- `password` – Password required for the authentication. +- `headers` – All custom HTTP headers entries used for the HTTP request. Optional parameter. +- `header` – Single HTTP header entry. +- `name` – Identifiant name used for the header send on the request. +- `value` – Value set for a specific identifiant name. + +When creating a dictionary using the DDL command (`CREATE DICTIONARY ...`) remote hosts for HTTP dictionaries are checked against the contents of `remote_url_allow_hosts` section from config to prevent database users to access arbitrary HTTP server. + +### Known Vulnerability of the ODBC Dictionary Functionality + +:::note +When connecting to the database through the ODBC driver connection parameter `Servername` can be substituted. In this case values of `USERNAME` and `PASSWORD` from `odbc.ini` are sent to the remote server and can be compromised. +::: + +**Example of insecure use** + +Let’s configure unixODBC for PostgreSQL. Content of `/etc/odbc.ini`: + +``` text +[gregtest] +Driver = /usr/lib/psqlodbca.so +Servername = localhost +PORT = 5432 +DATABASE = test_db +#OPTION = 3 +USERNAME = test +PASSWORD = test +``` + +If you then make a query such as + +``` sql +SELECT * FROM odbc('DSN=gregtest;Servername=some-server.com', 'test_db'); +``` + +ODBC driver will send values of `USERNAME` and `PASSWORD` from `odbc.ini` to `some-server.com`. + +### Example of Connecting Postgresql + +Ubuntu OS. + +Installing unixODBC and the ODBC driver for PostgreSQL: + +``` bash +$ sudo apt-get install -y unixodbc odbcinst odbc-postgresql +``` + +Configuring `/etc/odbc.ini` (or `~/.odbc.ini` if you signed in under a user that runs ClickHouse): + +``` text + [DEFAULT] + Driver = myconnection + + [myconnection] + Description = PostgreSQL connection to my_db + Driver = PostgreSQL Unicode + Database = my_db + Servername = 127.0.0.1 + UserName = username + Password = password + Port = 5432 + Protocol = 9.3 + ReadOnly = No + RowVersioning = No + ShowSystemTables = No + ConnSettings = +``` + +The dictionary configuration in ClickHouse: + +``` xml + + + table_name + + + + + DSN=myconnection + postgresql_table
+
+ + + 300 + 360 + + + + + + + id + + + some_column + UInt64 + 0 + + +
+
+``` + +or + +``` sql +CREATE DICTIONARY table_name ( + id UInt64, + some_column UInt64 DEFAULT 0 +) +PRIMARY KEY id +SOURCE(ODBC(connection_string 'DSN=myconnection' table 'postgresql_table')) +LAYOUT(HASHED()) +LIFETIME(MIN 300 MAX 360) +``` + +You may need to edit `odbc.ini` to specify the full path to the library with the driver `DRIVER=/usr/local/lib/psqlodbcw.so`. + +### Example of Connecting MS SQL Server + +Ubuntu OS. + +Installing the ODBC driver for connecting to MS SQL: + +``` bash +$ sudo apt-get install tdsodbc freetds-bin sqsh +``` + +Configuring the driver: + +```bash + $ cat /etc/freetds/freetds.conf + ... + + [MSSQL] + host = 192.168.56.101 + port = 1433 + tds version = 7.0 + client charset = UTF-8 + + # test TDS connection + $ sqsh -S MSSQL -D database -U user -P password + + + $ cat /etc/odbcinst.ini + + [FreeTDS] + Description = FreeTDS + Driver = /usr/lib/x86_64-linux-gnu/odbc/libtdsodbc.so + Setup = /usr/lib/x86_64-linux-gnu/odbc/libtdsS.so + FileUsage = 1 + UsageCount = 5 + + $ cat /etc/odbc.ini + # $ cat ~/.odbc.ini # if you signed in under a user that runs ClickHouse + + [MSSQL] + Description = FreeTDS + Driver = FreeTDS + Servername = MSSQL + Database = test + UID = test + PWD = test + Port = 1433 + + + # (optional) test ODBC connection (to use isql-tool install the [unixodbc](https://packages.debian.org/sid/unixodbc)-package) + $ isql -v MSSQL "user" "password" +``` + +Remarks: +- to determine the earliest TDS version that is supported by a particular SQL Server version, refer to the product documentation or look at [MS-TDS Product Behavior](https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-tds/135d0ebe-5c4c-4a94-99bf-1811eccb9f4a) + +Configuring the dictionary in ClickHouse: + +``` xml + + + test + + + dict
+ DSN=MSSQL;UID=test;PWD=test +
+ + + + 300 + 360 + + + + + + + + + k + + + s + String + + + +
+
+``` + +or + +``` sql +CREATE DICTIONARY test ( + k UInt64, + s String DEFAULT '' +) +PRIMARY KEY k +SOURCE(ODBC(table 'dict' connection_string 'DSN=MSSQL;UID=test;PWD=test')) +LAYOUT(FLAT()) +LIFETIME(MIN 300 MAX 360) +``` + +## DBMS + +### ODBC + +You can use this method to connect any database that has an ODBC driver. + +Example of settings: + +``` xml + + + DatabaseName + ShemaName.TableName
+ DSN=some_parameters + SQL_QUERY + SELECT id, value_1, value_2 FROM ShemaName.TableName +
+ +``` + +or + +``` sql +SOURCE(ODBC( + db 'DatabaseName' + table 'SchemaName.TableName' + connection_string 'DSN=some_parameters' + invalidate_query 'SQL_QUERY' + query 'SELECT id, value_1, value_2 FROM db_name.table_name' +)) +``` + +Setting fields: + +- `db` – Name of the database. Omit it if the database name is set in the `` parameters. +- `table` – Name of the table and schema if exists. +- `connection_string` – Connection string. +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](#dictionary-updates). +- `query` – The custom query. Optional parameter. + +:::note +The `table` and `query` fields cannot be used together. And either one of the `table` or `query` fields must be declared. +::: + +ClickHouse receives quoting symbols from ODBC-driver and quote all settings in queries to driver, so it’s necessary to set table name accordingly to table name case in database. + +If you have a problems with encodings when using Oracle, see the corresponding [FAQ](/knowledgebase/oracle-odbc) item. + +### Mysql + +Example of settings: + +``` xml + + + 3306 + clickhouse + qwerty + + example01-1 + 1 + + + example01-2 + 1 + + db_name + table_name
+ id=10 + SQL_QUERY + true + SELECT id, value_1, value_2 FROM db_name.table_name +
+ +``` + +or + +``` sql +SOURCE(MYSQL( + port 3306 + user 'clickhouse' + password 'qwerty' + replica(host 'example01-1' priority 1) + replica(host 'example01-2' priority 1) + db 'db_name' + table 'table_name' + where 'id=10' + invalidate_query 'SQL_QUERY' + fail_on_connection_loss 'true' + query 'SELECT id, value_1, value_2 FROM db_name.table_name' +)) +``` + +Setting fields: + +- `port` – The port on the MySQL server. You can specify it for all replicas, or for each one individually (inside ``). + +- `user` – Name of the MySQL user. You can specify it for all replicas, or for each one individually (inside ``). + +- `password` – Password of the MySQL user. You can specify it for all replicas, or for each one individually (inside ``). + +- `replica` – Section of replica configurations. There can be multiple sections. + + - `replica/host` – The MySQL host. + - `replica/priority` – The replica priority. When attempting to connect, ClickHouse traverses the replicas in order of priority. The lower the number, the higher the priority. + +- `db` – Name of the database. + +- `table` – Name of the table. + +- `where` – The selection criteria. The syntax for conditions is the same as for `WHERE` clause in MySQL, for example, `id > 10 AND id < 20`. Optional parameter. + +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](#dictionary-updates). + +- `fail_on_connection_loss` – The configuration parameter that controls behavior of the server on connection loss. If `true`, an exception is thrown immediately if the connection between client and server was lost. If `false`, the ClickHouse server retries to execute the query three times before throwing an exception. Note that retrying leads to increased response times. Default value: `false`. + +- `query` – The custom query. Optional parameter. + +:::note +The `table` or `where` fields cannot be used together with the `query` field. And either one of the `table` or `query` fields must be declared. +::: + +:::note +There is no explicit parameter `secure`. When establishing an SSL-connection security is mandatory. +::: + +MySQL can be connected to on a local host via sockets. To do this, set `host` and `socket`. + +Example of settings: + +``` xml + + + localhost + /path/to/socket/file.sock + clickhouse + qwerty + db_name + table_name
+ id=10 + SQL_QUERY + true + SELECT id, value_1, value_2 FROM db_name.table_name +
+ +``` + +or + +``` sql +SOURCE(MYSQL( + host 'localhost' + socket '/path/to/socket/file.sock' + user 'clickhouse' + password 'qwerty' + db 'db_name' + table 'table_name' + where 'id=10' + invalidate_query 'SQL_QUERY' + fail_on_connection_loss 'true' + query 'SELECT id, value_1, value_2 FROM db_name.table_name' +)) +``` + +### ClickHouse + +Example of settings: + +``` xml + + + example01-01-1 + 9000 + default + + default + ids
+ id=10 + 1 + SELECT id, value_1, value_2 FROM default.ids +
+ +``` + +or + +``` sql +SOURCE(CLICKHOUSE( + host 'example01-01-1' + port 9000 + user 'default' + password '' + db 'default' + table 'ids' + where 'id=10' + secure 1 + query 'SELECT id, value_1, value_2 FROM default.ids' +)); +``` + +Setting fields: + +- `host` – The ClickHouse host. If it is a local host, the query is processed without any network activity. To improve fault tolerance, you can create a [Distributed](../../engines/table-engines/special/distributed.md) table and enter it in subsequent configurations. +- `port` – The port on the ClickHouse server. +- `user` – Name of the ClickHouse user. +- `password` – Password of the ClickHouse user. +- `db` – Name of the database. +- `table` – Name of the table. +- `where` – The selection criteria. May be omitted. +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](#dictionary-updates). +- `secure` - Use ssl for connection. +- `query` – The custom query. Optional parameter. + +:::note +The `table` or `where` fields cannot be used together with the `query` field. And either one of the `table` or `query` fields must be declared. +::: + +### Mongodb + +Example of settings: + +``` xml + + + localhost + 27017 + + + test + dictionary_source + + +``` + +or + +``` sql +SOURCE(MONGODB( + host 'localhost' + port 27017 + user '' + password '' + db 'test' + collection 'dictionary_source' +)) +``` + +Setting fields: + +- `host` – The MongoDB host. +- `port` – The port on the MongoDB server. +- `user` – Name of the MongoDB user. +- `password` – Password of the MongoDB user. +- `db` – Name of the database. +- `collection` – Name of the collection. + +### Redis + +Example of settings: + +``` xml + + + localhost + 6379 + simple + 0 + + +``` + +or + +``` sql +SOURCE(REDIS( + host 'localhost' + port 6379 + storage_type 'simple' + db_index 0 +)) +``` + +Setting fields: + +- `host` – The Redis host. +- `port` – The port on the Redis server. +- `storage_type` – The structure of internal Redis storage using for work with keys. `simple` is for simple sources and for hashed single key sources, `hash_map` is for hashed sources with two keys. Ranged sources and cache sources with complex key are unsupported. May be omitted, default value is `simple`. +- `db_index` – The specific numeric index of Redis logical database. May be omitted, default value is 0. + +### Cassandra + +Example of settings: + +``` xml + + + localhost + 9042 + username + qwerty123 + database_name + table_name + 1 + 1 + One + "SomeColumn" = 42 + 8 + SELECT id, value_1, value_2 FROM database_name.table_name + + +``` + +Setting fields: + +- `host` – The Cassandra host or comma-separated list of hosts. +- `port` – The port on the Cassandra servers. If not specified, default port 9042 is used. +- `user` – Name of the Cassandra user. +- `password` – Password of the Cassandra user. +- `keyspace` – Name of the keyspace (database). +- `column_family` – Name of the column family (table). +- `allow_filering` – Flag to allow or not potentially expensive conditions on clustering key columns. Default value is 1. +- `partition_key_prefix` – Number of partition key columns in primary key of the Cassandra table. Required for compose key dictionaries. Order of key columns in the dictionary definition must be the same as in Cassandra. Default value is 1 (the first key column is a partition key and other key columns are clustering key). +- `consistency` – Consistency level. 
Possible values: `One`, `Two`, `Three`, `All`, `EachQuorum`, `Quorum`, `LocalQuorum`, `LocalOne`, `Serial`, `LocalSerial`. Default value is `One`. +- `where` – Optional selection criteria. +- `max_threads` – The maximum number of threads to use for loading data from multiple partitions in compose key dictionaries. +- `query` – The custom query. Optional parameter. + +:::note +The `column_family` or `where` fields cannot be used together with the `query` field. And either one of the `column_family` or `query` fields must be declared. +::: + +### PostgreSQL + +Example of settings: + +``` xml + + + 5432 + clickhouse + qwerty + db_name + table_name
+ id=10 + SQL_QUERY + SELECT id, value_1, value_2 FROM db_name.table_name +
+ +``` + +or + +``` sql +SOURCE(POSTGRESQL( + port 5432 + host 'postgresql-hostname' + user 'postgres_user' + password 'postgres_password' + db 'db_name' + table 'table_name' + replica(host 'example01-1' port 5432 priority 1) + replica(host 'example01-2' port 5432 priority 2) + where 'id=10' + invalidate_query 'SQL_QUERY' + query 'SELECT id, value_1, value_2 FROM db_name.table_name' +)) +``` + +Setting fields: + +- `host` – The host on the PostgreSQL server. You can specify it for all replicas, or for each one individually (inside ``). +- `port` – The port on the PostgreSQL server. You can specify it for all replicas, or for each one individually (inside ``). +- `user` – Name of the PostgreSQL user. You can specify it for all replicas, or for each one individually (inside ``). +- `password` – Password of the PostgreSQL user. You can specify it for all replicas, or for each one individually (inside ``). +- `replica` – Section of replica configurations. There can be multiple sections: + - `replica/host` – The PostgreSQL host. + - `replica/port` – The PostgreSQL port. + - `replica/priority` – The replica priority. When attempting to connect, ClickHouse traverses the replicas in order of priority. The lower the number, the higher the priority. +- `db` – Name of the database. +- `table` – Name of the table. +- `where` – The selection criteria. The syntax for conditions is the same as for `WHERE` clause in PostgreSQL. For example, `id > 10 AND id < 20`. Optional parameter. +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](#dictionary-updates). +- `query` – The custom query. Optional parameter. + +:::note +The `table` or `where` fields cannot be used together with the `query` field. And either one of the `table` or `query` fields must be declared. +::: + +## Null + +A special source that can be used to create dummy (empty) dictionaries. Such dictionaries can useful for tests or with setups with separated data and query nodes at nodes with Distributed tables. + +``` sql +CREATE DICTIONARY null_dict ( + id UInt64, + val UInt8, + default_val UInt8 DEFAULT 123, + nullable_val Nullable(UInt8) +) +PRIMARY KEY id +SOURCE(NULL()) +LAYOUT(FLAT()) +LIFETIME(0); +``` + +## Dictionary Key and Fields {#dictionary-key-and-fields} + + + +The `structure` clause describes the dictionary key and fields available for queries. + +XML description: + +``` xml + + + + Id + + + + + + + ... + + + +``` + +Attributes are described in the elements: + +- `` — Key column +- `` — Data column: there can be a multiple number of attributes. + +DDL query: + +``` sql +CREATE DICTIONARY dict_name ( + Id UInt64, + -- attributes +) +PRIMARY KEY Id +... +``` + +Attributes are described in the query body: + +- `PRIMARY KEY` — Key column +- `AttrName AttrType` — Data column. There can be a multiple number of attributes. + +## Key + +ClickHouse supports the following types of keys: + +- Numeric key. `UInt64`. Defined in the `` tag or using `PRIMARY KEY` keyword. +- Composite key. Set of values of different types. Defined in the tag `` or `PRIMARY KEY` keyword. + +An xml structure can contain either `` or ``. DDL-query must contain single `PRIMARY KEY`. + +:::warning +You must not describe key as an attribute. +::: + +### Numeric Key + +Type: `UInt64`. + +Configuration example: + +``` xml + + Id + +``` + +Configuration fields: + +- `name` – The name of the column with keys. + +For DDL-query: + +``` sql +CREATE DICTIONARY ( + Id UInt64, + ... 
+) +PRIMARY KEY Id +... +``` + +- `PRIMARY KEY` – The name of the column with keys. + +### Composite Key + +The key can be a `tuple` from any types of fields. The [layout](#storig-dictionaries-in-memory) in this case must be `complex_key_hashed` or `complex_key_cache`. + +:::tip +A composite key can consist of a single element. This makes it possible to use a string as the key, for instance. +::: + +The key structure is set in the element ``. Key fields are specified in the same format as the dictionary [attributes](#dictionary-key-and-fields). Example: + +``` xml + + + + field1 + String + + + field2 + UInt32 + + ... + +... +``` + +or + +``` sql +CREATE DICTIONARY ( + field1 String, + field2 String + ... +) +PRIMARY KEY field1, field2 +... +``` + +For a query to the `dictGet*` function, a tuple is passed as the key. Example: `dictGetString('dict_name', 'attr_name', tuple('string for field1', num_for_field2))`. + +## Attributes + +Configuration example: + +``` xml + + ... + + Name + ClickHouseDataType + + rand64() + true + true + true + + +``` + +or + +``` sql +CREATE DICTIONARY somename ( + Name ClickHouseDataType DEFAULT '' EXPRESSION rand64() HIERARCHICAL INJECTIVE IS_OBJECT_ID +) +``` + +Configuration fields: + +| Tag | Description | Required | +|------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------| +| `name` | Column name. | Yes | +| `type` | ClickHouse data type: [UInt8](../../sql-reference/data-types/int-uint.md), [UInt16](../../sql-reference/data-types/int-uint.md), [UInt32](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md), [Int8](../../sql-reference/data-types/int-uint.md), [Int16](../../sql-reference/data-types/int-uint.md), [Int32](../../sql-reference/data-types/int-uint.md), [Int64](../../sql-reference/data-types/int-uint.md), [Float32](../../sql-reference/data-types/float.md), [Float64](../../sql-reference/data-types/float.md), [UUID](../../sql-reference/data-types/uuid.md), [Decimal32](../../sql-reference/data-types/decimal.md), [Decimal64](../../sql-reference/data-types/decimal.md), [Decimal128](../../sql-reference/data-types/decimal.md), [Decimal256](../../sql-reference/data-types/decimal.md),[Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md), [DateTime64](../../sql-reference/data-types/datetime64.md), [String](../../sql-reference/data-types/string.md), [Array](../../sql-reference/data-types/array.md).
ClickHouse tries to cast the value from the dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.
[Nullable](../../sql-reference/data-types/nullable.md) is currently supported for [Flat](#flat), [Hashed](#hashed), [ComplexKeyHashed](#complex_key_hashed), [Direct](#direct), [ComplexKeyDirect](#complex_key_direct), [RangeHashed](#range_hashed), Polygon, [Cache](#cache), [ComplexKeyCache](#complex_key_cache), [SSDCache](#ssd_cache), [SSDComplexKeyCache](#complex_key_ssd_cache) dictionaries. In [IPTrie](#ip_trie) dictionaries `Nullable` types are not supported. | Yes | +| `null_value` | Default value for a non-existing element.
In the example, it is an empty string. [NULL](../syntax.md#null) value can be used only for the `Nullable` types (see the previous line with types description). | Yes | +| `expression` | [Expression](../../sql-reference/syntax.md#expressions) that ClickHouse executes on the value.
The expression can be a column name in the remote SQL database. Thus, you can use it to create an alias for the remote column.

Default value: no expression. | No | +| `hierarchical` | If `true`, the attribute contains the value of a parent key for the current key. See [Hierarchical Dictionaries](#hierarchical-dictionaries).

Default value: `false`. | No | +| `injective` | Flag that shows whether the `id -> attribute` image is [injective](https://en.wikipedia.org/wiki/Injective_function).
If `true`, ClickHouse can automatically place requests to dictionaries with injective attributes after the `GROUP BY` clause. Usually this significantly reduces the number of such requests.

Default value: `false`. | No | +| `is_object_id` | Flag that shows whether the query is executed for a MongoDB document by `ObjectID`.

Default value: `false`. + +## Hierarchical Dictionaries {#hierarchical-dictionaries} + +ClickHouse supports hierarchical dictionaries with a [numeric key](#numeric-key). + +Look at the following hierarchical structure: + +``` text +0 (Common parent) +│ +├── 1 (Russia) +│ │ +│ └── 2 (Moscow) +│ │ +│ └── 3 (Center) +│ +└── 4 (Great Britain) + │ + └── 5 (London) +``` + +This hierarchy can be expressed as the following dictionary table. + +| region_id | parent_region | region_name | +|------------|----------------|---------------| +| 1 | 0 | Russia | +| 2 | 1 | Moscow | +| 3 | 2 | Center | +| 4 | 0 | Great Britain | +| 5 | 4 | London | + +This table contains a column `parent_region` that contains the key of the nearest parent for the element. + +ClickHouse supports the hierarchical property for external dictionary attributes. This property allows you to configure the hierarchical dictionary similar to described above. + +The [dictGetHierarchy](../../sql-reference/functions/ext-dict-functions.md#dictgethierarchy) function allows you to get the parent chain of an element. + +For our example, the structure of dictionary can be the following: + +``` xml + + + + region_id + + + + parent_region + UInt64 + 0 + true + + + + region_name + String + + + + + +``` + +## Polygon dictionaries {#polygon-dictionaries} + +Polygon dictionaries allow you to efficiently search for the polygon containing specified points. +For example: defining a city area by geographical coordinates. + +Example of a polygon dictionary configuration: + + + +``` xml + + + + + key + Array(Array(Array(Array(Float64)))) + + + + + name + String + + + + + value + UInt64 + 0 + + + + + + 1 + + + + ... + +``` + +The corresponding [DDL-query](../../sql-reference/statements/create/dictionary.md#create-dictionary-query): +``` sql +CREATE DICTIONARY polygon_dict_name ( + key Array(Array(Array(Array(Float64)))), + name String, + value UInt64 +) +PRIMARY KEY key +LAYOUT(POLYGON(STORE_POLYGON_KEY_COLUMN 1)) +... +``` + +When configuring the polygon dictionary, the key must have one of two types: + +- A simple polygon. It is an array of points. +- MultiPolygon. It is an array of polygons. Each polygon is a two-dimensional array of points. The first element of this array is the outer boundary of the polygon, and subsequent elements specify areas to be excluded from it. + +Points can be specified as an array or a tuple of their coordinates. In the current implementation, only two-dimensional points are supported. + +The user can upload their own data in all formats supported by ClickHouse. + +There are 3 types of [in-memory storage](#storig-dictionaries-in-memory) available: + +- `POLYGON_SIMPLE`. This is a naive implementation, where a linear pass through all polygons is made for each query, and membership is checked for each one without using additional indexes. + +- `POLYGON_INDEX_EACH`. A separate index is built for each polygon, which allows you to quickly check whether it belongs in most cases (optimized for geographical regions). +Also, a grid is superimposed on the area under consideration, which significantly narrows the number of polygons under consideration. +The grid is created by recursively dividing the cell into 16 equal parts and is configured with two parameters. +The division stops when the recursion depth reaches `MAX_DEPTH` or when the cell crosses no more than `MIN_INTERSECTIONS` polygons. +To respond to the query, there is a corresponding cell, and the index for the polygons stored in it is accessed alternately. 
+ +- `POLYGON_INDEX_CELL`. This placement also creates the grid described above. The same options are available. For each sheet cell, an index is built on all pieces of polygons that fall into it, which allows you to quickly respond to a request. + +- `POLYGON`. Synonym to `POLYGON_INDEX_CELL`. + +Dictionary queries are carried out using standard [functions](../../sql-reference/functions/ext-dict-functions.md) for working with dictionaries. +An important difference is that here the keys will be the points for which you want to find the polygon containing them. + +**Example** + +Example of working with the dictionary defined above: + +``` sql +CREATE TABLE points ( + x Float64, + y Float64 +) +... +SELECT tuple(x, y) AS key, dictGet(dict_name, 'name', key), dictGet(dict_name, 'value', key) FROM points ORDER BY x, y; +``` + +As a result of executing the last command for each point in the 'points' table, a minimum area polygon containing this point will be found, and the requested attributes will be output. + +**Example** + +You can read columns from polygon dictionaries via SELECT query, just turn on the `store_polygon_key_column = 1` in the dictionary configuration or corresponding DDL-query. + +Query: + +``` sql +CREATE TABLE polygons_test_table +( + key Array(Array(Array(Tuple(Float64, Float64)))), + name String +) ENGINE = TinyLog; + +INSERT INTO polygons_test_table VALUES ([[[(3, 1), (0, 1), (0, -1), (3, -1)]]], 'Value'); + +CREATE DICTIONARY polygons_test_dictionary +( + key Array(Array(Array(Tuple(Float64, Float64)))), + name String +) +PRIMARY KEY key +SOURCE(CLICKHOUSE(TABLE 'polygons_test_table')) +LAYOUT(POLYGON(STORE_POLYGON_KEY_COLUMN 1)) +LIFETIME(0); + +SELECT * FROM polygons_test_dictionary; +``` + +Result: + +``` text +┌─key─────────────────────────────┬─name──┠+│ [[[(3,1),(0,1),(0,-1),(3,-1)]]] │ Value │ +└─────────────────────────────────┴───────┘ +``` + +## RegExp Tree Dictionary {#regexp-tree-dictionary} + +Regexp Tree dictionary stores multiple trees of regular expressions with attributions. Users can retrieve strings in the dictionary. If a string matches the root of the regexp tree, we will collect the corresponding attributes of the matched root and continue to walk the children. If any of the children matches the string, we will collect attributes and rewrite the old ones if conflicts occur, then continue the traverse until we reach leaf nodes. + +Example of the ddl query for creating Regexp Tree dictionary: + + + +```sql +create dictionary regexp_dict +( + regexp String, + name String, + version String +) +PRIMARY KEY(regexp) +SOURCE(YAMLRegExpTree(PATH '/var/lib/clickhouse/user_files/regexp_tree.yaml')) +LAYOUT(regexp_tree) +... +``` + +We only allow `YAMLRegExpTree` to work with regexp_tree dicitionary layout. If you want to use other sources, please set variable `regexp_dict_allow_other_sources` true. + +**Source** + +We introduce a type of source called `YAMLRegExpTree` representing the structure of Regexp Tree dictionary. An Example of a valid yaml config is like: + +```xml +- regexp: 'Linux/(\d+[\.\d]*).+tlinux' + name: 'TencentOS' + version: '\1' + +- regexp: '\d+/tclwebkit(?:\d+[\.\d]*)' + name: 'Andriod' + versions: + - regexp: '33/tclwebkit' + version: '13' + - regexp: '3[12]/tclwebkit' + version: '12' + - regexp: '30/tclwebkit' + version: '11' + - regexp: '29/tclwebkit' + version: '10' +``` + +The key `regexp` represents the regular expression of a tree node. The name of key is same as the dictionary key. 
The `name` and `version` is user-defined attributions in the dicitionary. The `versions` (which can be any name that not appear in attributions or the key) indicates the children nodes of this tree. + +**Back Reference** + +The value of an attribution could contain a back reference which refers to a capture group of the matched regular expression. Reference number ranges from 1 to 9 and writes as `$1` or `\1`. + +During the query execution, the back reference in the value will be replaced by the matched capture group. + +**Query** + +Due to the specialty of Regexp Tree dictionary, we only allow functions `dictGet`, `dictGetOrDefault` and `dictGetOrNull` work with it. + +Example: + +```sql +SELECT dictGet('regexp_dict', ('name', 'version'), '31/tclwebkit1024'); +``` + +Result: + +``` +┌─dictGet('regexp_dict', ('name', 'version'), '31/tclwebkit1024')─┠+│ ('Andriod','12') │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Embedded Dictionaries {#embedded-dictionaries} + + + +ClickHouse contains a built-in feature for working with a geobase. + +This allows you to: + +- Use a region’s ID to get its name in the desired language. +- Use a region’s ID to get the ID of a city, area, federal district, country, or continent. +- Check whether a region is part of another region. +- Get a chain of parent regions. + +All the functions support “translocality,†the ability to simultaneously use different perspectives on region ownership. For more information, see the section “Functions for working with web analytics dictionariesâ€. + +The internal dictionaries are disabled in the default package. +To enable them, uncomment the parameters `path_to_regions_hierarchy_file` and `path_to_regions_names_files` in the server configuration file. + +The geobase is loaded from text files. + +Place the `regions_hierarchy*.txt` files into the `path_to_regions_hierarchy_file` directory. This configuration parameter must contain the path to the `regions_hierarchy.txt` file (the default regional hierarchy), and the other files (`regions_hierarchy_ua.txt`) must be located in the same directory. + +Put the `regions_names_*.txt` files in the `path_to_regions_names_files` directory. + +You can also create these files yourself. The file format is as follows: + +`regions_hierarchy*.txt`: TabSeparated (no header), columns: + +- region ID (`UInt32`) +- parent region ID (`UInt32`) +- region type (`UInt8`): 1 - continent, 3 - country, 4 - federal district, 5 - region, 6 - city; other types do not have values +- population (`UInt32`) — optional column + +`regions_names_*.txt`: TabSeparated (no header), columns: + +- region ID (`UInt32`) +- region name (`String`) — Can’t contain tabs or line feeds, even escaped ones. + +A flat array is used for storing in RAM. For this reason, IDs shouldn’t be more than a million. + +Dictionaries can be updated without restarting the server. However, the set of available dictionaries is not updated. +For updates, the file modification times are checked. If a file has changed, the dictionary is updated. +The interval to check for changes is configured in the `builtin_dictionaries_reload_interval` parameter. +Dictionary updates (other than loading at first use) do not block queries. During updates, queries use the old versions of dictionaries. If an error occurs during an update, the error is written to the server log, and queries continue using the old version of dictionaries. + +We recommend periodically updating the dictionaries with the geobase. 
During an update, generate new files and write them to a separate location. When everything is ready, rename them to the files used by the server.
+
+There are also functions for working with OS identifiers and search engines, but they shouldn't be used.
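+
+As a quick check that the geobase has been picked up, the region functions can be queried directly. The following is only a sketch: the region IDs shown (213, 225) are placeholders whose meaning depends entirely on the geobase files you load.
+
+``` sql
+SELECT
+    regionToName(toUInt32(213))            AS region_name,  -- name of region 213 in the default language
+    regionToCountry(toUInt32(213))         AS country_id,   -- ID of the country that contains region 213
+    regionIn(toUInt32(213), toUInt32(225)) AS is_inside     -- 1 if region 213 belongs to region 225
+```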
diff --git a/docs/en/sql-reference/dictionaries/internal-dicts.md b/docs/en/sql-reference/dictionaries/internal-dicts.md deleted file mode 100644 index 11c6ee93aa6..00000000000 --- a/docs/en/sql-reference/dictionaries/internal-dicts.md +++ /dev/null @@ -1,55 +0,0 @@ ---- -slug: /en/sql-reference/dictionaries/internal-dicts -sidebar_position: 39 -sidebar_label: Embedded Dictionaries ---- -import SelfManaged from '@site/docs/en/_snippets/_self_managed_only_no_roadmap.md'; - -# Embedded Dictionaries - - - -ClickHouse contains a built-in feature for working with a geobase. - -This allows you to: - -- Use a region’s ID to get its name in the desired language. -- Use a region’s ID to get the ID of a city, area, federal district, country, or continent. -- Check whether a region is part of another region. -- Get a chain of parent regions. - -All the functions support “translocality,†the ability to simultaneously use different perspectives on region ownership. For more information, see the section “Functions for working with web analytics dictionariesâ€. - -The internal dictionaries are disabled in the default package. -To enable them, uncomment the parameters `path_to_regions_hierarchy_file` and `path_to_regions_names_files` in the server configuration file. - -The geobase is loaded from text files. - -Place the `regions_hierarchy*.txt` files into the `path_to_regions_hierarchy_file` directory. This configuration parameter must contain the path to the `regions_hierarchy.txt` file (the default regional hierarchy), and the other files (`regions_hierarchy_ua.txt`) must be located in the same directory. - -Put the `regions_names_*.txt` files in the `path_to_regions_names_files` directory. - -You can also create these files yourself. The file format is as follows: - -`regions_hierarchy*.txt`: TabSeparated (no header), columns: - -- region ID (`UInt32`) -- parent region ID (`UInt32`) -- region type (`UInt8`): 1 - continent, 3 - country, 4 - federal district, 5 - region, 6 - city; other types do not have values -- population (`UInt32`) — optional column - -`regions_names_*.txt`: TabSeparated (no header), columns: - -- region ID (`UInt32`) -- region name (`String`) — Can’t contain tabs or line feeds, even escaped ones. - -A flat array is used for storing in RAM. For this reason, IDs shouldn’t be more than a million. - -Dictionaries can be updated without restarting the server. However, the set of available dictionaries is not updated. -For updates, the file modification times are checked. If a file has changed, the dictionary is updated. -The interval to check for changes is configured in the `builtin_dictionaries_reload_interval` parameter. -Dictionary updates (other than loading at first use) do not block queries. During updates, queries use the old versions of dictionaries. If an error occurs during an update, the error is written to the server log, and queries continue using the old version of dictionaries. - -We recommend periodically updating the dictionaries with the geobase. During an update, generate new files and write them to a separate location. When everything is ready, rename them to the files used by the server. - -There are also functions for working with OS identifiers and search engines, but they shouldn’t be used. 
diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index ef0475027dd..d06ab253cf7 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -283,7 +283,7 @@ Result: ``` :::note -The return type of `toStartOf*`, `toLastDayOfMonth`, `toMonday`, `timeSlot` functions described below is determined by the configuration parameter [enable_extended_results_for_datetime_functions](../../operations/settings/settings#enable-extended-results-for-datetime-functions) which is `0` by default. +The return type of `toStartOf*`, `toLastDayOfMonth`, `toMonday`, `timeSlot` functions described below is determined by the configuration parameter [enable_extended_results_for_datetime_functions](../../operations/settings/settings.md#enable-extended-results-for-datetime-functions) which is `0` by default. Behavior for * `enable_extended_results_for_datetime_functions = 0`: Functions `toStartOfYear`, `toStartOfISOYear`, `toStartOfQuarter`, `toStartOfMonth`, `toStartOfWeek`, `toLastDayOfMonth`, `toMonday` return `Date` or `DateTime`. Functions `toStartOfDay`, `toStartOfHour`, `toStartOfFifteenMinutes`, `toStartOfTenMinutes`, `toStartOfFiveMinutes`, `toStartOfMinute`, `timeSlot` return `DateTime`. Though these functions can take values of the extended types `Date32` and `DateTime64` as an argument, passing them a time outside the normal range (year 1970 to 2149 for `Date` / 2106 for `DateTime`) will produce wrong results. @@ -1135,7 +1135,7 @@ SELECT ``` ```response ┌─toYYYYMM(now(), 'US/Eastern')─┠-│ 202303 │ +│ 202303 │ └───────────────────────────────┘ ``` @@ -1232,12 +1232,14 @@ SELECT timeSlots(toDateTime64('1980-12-12 21:01:02.1234', 4, 'UTC'), toDecimal64 └───────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` -## formatDateTime +## formatDateTime {#date_time_functions-formatDateTime} Formats a Time according to the given Format string. Format is a constant expression, so you cannot have multiple formats for a single result column. formatDateTime uses MySQL datetime format style, refer to https://dev.mysql.com/doc/refman/8.0/en/date-and-time-functions.html#function_date-format. +The opposite operation of this function is [parseDateTime](/docs/en/sql-reference/functions/type-conversion-functions.md#type_conversion_functions-parseDateTime). + Alias: `DATE_FORMAT`. **Syntax** @@ -1257,7 +1259,7 @@ Using replacement fields, you can define a pattern for the resulting string. “ |----------|---------------------------------------------------------|------------| | %a | abbreviated weekday name (Mon-Sun) | Mon | | %b | abbreviated month name (Jan-Dec) | Jan | -| %c | month as a decimal number (01-12) | 01 | +| %c | month as an integer number (01-12) | 01 | | %C | year divided by 100 and truncated to integer (00-99) | 20 | | %d | day of the month, zero-padded (01-31) | 02 | | %D | Short MM/DD/YY date, equivalent to %m/%d/%y | 01/02/18 | @@ -1273,7 +1275,7 @@ Using replacement fields, you can define a pattern for the resulting string. 
“ | %j | day of the year (001-366) | 002 | | %k | hour in 24h format (00-23) | 22 | | %l | hour in 12h format (01-12) | 09 | -| %m | month as a decimal number (01-12) | 01 | +| %m | month as an integer number (01-12) | 01 | | %M | minute (00-59) | 33 | | %n | new-line character (‘’) | | | %p | AM or PM designation | PM | @@ -1286,7 +1288,7 @@ Using replacement fields, you can define a pattern for the resulting string. “ | %T | ISO 8601 time format (HH:MM:SS), equivalent to %H:%M:%S | 22:33:44 | | %u | ISO 8601 weekday as number with Monday as 1 (1-7) | 2 | | %V | ISO 8601 week number (01-53) | 01 | -| %w | weekday as a decimal number with Sunday as 0 (0-6) | 2 | +| %w | weekday as a integer number with Sunday as 0 (0-6) | 2 | | %W | full weekday name (Monday-Sunday) | Monday | | %y | Year, last two digits (00-99) | 18 | | %Y | Year | 2018 | @@ -1328,14 +1330,15 @@ Result: - [formatDateTimeInJodaSyntax](##formatDateTimeInJodaSyntax) -## formatDateTimeInJodaSyntax +## formatDateTimeInJodaSyntax {#date_time_functions-formatDateTimeInJodaSyntax} Similar to formatDateTime, except that it formats datetime in Joda style instead of MySQL style. Refer to https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html. +The opposite operation of this function is [parseDateTimeInJodaSyntax](/docs/en/sql-reference/functions/type-conversion-functions.md#type_conversion_functions-parseDateTimeInJodaSyntax). **Replacement fields** -Using replacement fields, you can define a pattern for the resulting string. +Using replacement fields, you can define a pattern for the resulting string. | Placeholder | Description | Presentation | Examples | diff --git a/docs/en/sql-reference/functions/ext-dict-functions.md b/docs/en/sql-reference/functions/ext-dict-functions.md index b4b7ec5ab21..07226b67601 100644 --- a/docs/en/sql-reference/functions/ext-dict-functions.md +++ b/docs/en/sql-reference/functions/ext-dict-functions.md @@ -6,11 +6,11 @@ sidebar_label: Dictionaries # Functions for Working with Dictionaries -:::note +:::note For dictionaries created with [DDL queries](../../sql-reference/statements/create/dictionary.md), the `dict_name` parameter must be fully specified, like `.`. Otherwise, the current database is used. ::: -For information on connecting and configuring dictionaries, see [Dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). +For information on connecting and configuring dictionaries, see [Dictionaries](../../sql-reference/dictionaries/index.md). ## dictGet, dictGetOrDefault, dictGetOrNull @@ -31,7 +31,7 @@ dictGetOrNull('dict_name', attr_name, id_expr) **Returned value** -- If ClickHouse parses the attribute successfully in the [attribute’s data type](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes), functions return the value of the dictionary attribute that corresponds to `id_expr`. +- If ClickHouse parses the attribute successfully in the [attribute’s data type](../../sql-reference/dictionaries/index.md#dictionary-key-and-fields#ext_dict_structure-attributes), functions return the value of the dictionary attribute that corresponds to `id_expr`. - If there is no the key, corresponding to `id_expr`, in the dictionary, then: @@ -226,7 +226,7 @@ Result: **See Also** -- [Dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) +- [Dictionaries](../../sql-reference/dictionaries/index.md) ## dictHas @@ -250,7 +250,7 @@ Type: `UInt8`. 
## dictGetHierarchy -Creates an array, containing all the parents of a key in the [hierarchical dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md). +Creates an array, containing all the parents of a key in the [hierarchical dictionary](../../sql-reference/dictionaries/index.md#hierarchical-dictionaries). **Syntax** @@ -436,7 +436,7 @@ dictGet[Type]OrDefault('dict_name', 'attr_name', id_expr, default_value_expr) **Returned value** -- If ClickHouse parses the attribute successfully in the [attribute’s data type](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes), functions return the value of the dictionary attribute that corresponds to `id_expr`. +- If ClickHouse parses the attribute successfully in the [attribute’s data type](../../sql-reference/dictionaries/index.md#dictionary-key-and-fields#ext_dict_structure-attributes), functions return the value of the dictionary attribute that corresponds to `id_expr`. - If there is no requested `id_expr` in the dictionary then: diff --git a/docs/en/sql-reference/functions/geo/coordinates.md b/docs/en/sql-reference/functions/geo/coordinates.md index 1e023415890..01802e336bf 100644 --- a/docs/en/sql-reference/functions/geo/coordinates.md +++ b/docs/en/sql-reference/functions/geo/coordinates.md @@ -31,13 +31,13 @@ Generates an exception when the input parameter values fall outside of the range **Example** ``` sql -SELECT greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673) +SELECT greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673) AS greatCircleDistance ``` ``` text -┌─greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673)─┠-│ 14132374.194975413 │ -└───────────────────────────────────────────────────────────────────┘ +┌─greatCircleDistance─┠+│ 14128352 │ +└─────────────────────┘ ``` ## geoDistance @@ -47,6 +47,37 @@ The performance is the same as for `greatCircleDistance` (no performance drawbac Technical note: for close enough points we calculate the distance using planar approximation with the metric on the tangent plane at the midpoint of the coordinates. +``` sql +geoDistance(lon1Deg, lat1Deg, lon2Deg, lat2Deg) +``` + +**Input parameters** + +- `lon1Deg` — Longitude of the first point in degrees. Range: `[-180°, 180°]`. +- `lat1Deg` — Latitude of the first point in degrees. Range: `[-90°, 90°]`. +- `lon2Deg` — Longitude of the second point in degrees. Range: `[-180°, 180°]`. +- `lat2Deg` — Latitude of the second point in degrees. Range: `[-90°, 90°]`. + +Positive values correspond to North latitude and East longitude, and negative values correspond to South latitude and West longitude. + +**Returned value** + +The distance between two points on the Earth’s surface, in meters. + +Generates an exception when the input parameter values fall outside of the range. + +**Example** + +``` sql +SELECT geoDistance(38.8976, -77.0366, 39.9496, -75.1503) AS geoDistance +``` + +``` text +┌─geoDistance─┠+│ 212458.73 │ +└─────────────┘ +``` + ## greatCircleAngle Calculates the central angle between two points on the Earth’s surface using [the great-circle formula](https://en.wikipedia.org/wiki/Great-circle_distance). 
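+
+A minimal usage sketch of `greatCircleAngle` (two points on the equator, 45 degrees of longitude apart; the expected result is `45`):
+
+``` sql
+SELECT greatCircleAngle(0, 0, 45, 0) AS arc
+```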
diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 7146484361e..011b73405c5 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -792,7 +792,7 @@ neighbor(column, offset[, default_value]) The result of the function depends on the affected data blocks and the order of data in the block. -:::warning +:::warning It can reach the neighbor rows only inside the currently processed data block. ::: @@ -902,7 +902,7 @@ Result: Calculates the difference between successive row values ​​in the data block. Returns 0 for the first row and the difference from the previous row for each subsequent row. -:::warning +:::warning It can reach the previous row only inside the currently processed data block. ::: @@ -986,7 +986,7 @@ Each event has a start time and an end time. The start time is included in the e The function calculates the total number of active (concurrent) events for each event start time. -:::warning +:::warning Events must be ordered by the start time in ascending order. If this requirement is violated the function raises an exception. Every data block is processed separately. If events from different data blocks overlap then they can not be processed correctly. ::: @@ -1674,7 +1674,7 @@ Result: Accumulates states of an aggregate function for each row of a data block. -:::warning +:::warning The state is reset for each new data block. ::: @@ -2177,7 +2177,7 @@ Number of digits. Type: [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges). -:::note +:::note For `Decimal` values takes into account their scales: calculates result over underlying integer type which is `(value * scale)`. For example: `countDigits(42) = 2`, `countDigits(42.000) = 5`, `countDigits(0.04200) = 4`. I.e. you may check decimal overflow for `Decimal64` with `countDecimal(x) > 18`. It's a slow variant of [isDecimalOverflow](#is-decimal-overflow). ::: @@ -2260,7 +2260,7 @@ Result: ## currentProfiles -Returns a list of the current [settings profiles](../../operations/access-rights.md#settings-profiles-management) for the current user. +Returns a list of the current [settings profiles](../../guides/sre/user-management/index.md#settings-profiles-management) for the current user. The command [SET PROFILE](../../sql-reference/statements/set.md#query-set) could be used to change the current setting profile. If the command `SET PROFILE` was not used the function returns the profiles specified at the current user's definition (see [CREATE USER](../../sql-reference/statements/create/user.md#create-user-statement)). @@ -2272,7 +2272,7 @@ currentProfiles() **Returned value** -- List of the current user settings profiles. +- List of the current user settings profiles. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). @@ -2288,7 +2288,7 @@ enabledProfiles() **Returned value** -- List of the enabled settings profiles. +- List of the enabled settings profiles. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). @@ -2304,7 +2304,7 @@ defaultProfiles() **Returned value** -- List of the default settings profiles. +- List of the default settings profiles. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). @@ -2320,7 +2320,7 @@ currentRoles() **Returned value** -- List of the current roles for the current user. 
+- List of the current roles for the current user. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). @@ -2336,13 +2336,13 @@ enabledRoles() **Returned value** -- List of the enabled roles for the current user. +- List of the enabled roles for the current user. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). ## defaultRoles -Returns the names of the roles which are enabled by default for the current user when he logins. Initially these are all roles granted to the current user (see [GRANT](../../sql-reference/statements/grant/#grant-select)), but that can be changed with the [SET DEFAULT ROLE](../../sql-reference/statements/set-role.md#set-default-role-statement) statement. +Returns the names of the roles which are enabled by default for the current user when he logins. Initially these are all roles granted to the current user (see [GRANT](../../sql-reference/statements/grant.md#grant-select)), but that can be changed with the [SET DEFAULT ROLE](../../sql-reference/statements/set-role.md#set-default-role-statement) statement. **Syntax** @@ -2352,7 +2352,7 @@ defaultRoles() **Returned value** -- List of the default roles for the current user. +- List of the default roles for the current user. Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). @@ -2499,7 +2499,7 @@ In the following example a configuration with two shards is used. The query is e Query: ``` sql -CREATE TABLE shard_num_example (dummy UInt8) +CREATE TABLE shard_num_example (dummy UInt8) ENGINE=Distributed(test_cluster_two_shards_localhost, system, one, dummy); SELECT dummy, shardNum(), shardCount() FROM shard_num_example; ``` diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index 2f660d820d1..38ccb43cbc9 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -375,7 +375,7 @@ For a case-insensitive search or/and in UTF-8 format use functions `multiSearchA In all `multiSearch*` functions the number of needles should be less than 28 because of implementation specification. ::: -## match(haystack, pattern) +## match(haystack, pattern), haystack REGEXP pattern operator Checks whether string `haystack` matches the regular expression `pattern`. The pattern is an [re2 regular expression](https://github.com/google/re2/wiki/Syntax) which has a more limited syntax than Perl regular expressions. @@ -551,7 +551,7 @@ Other LIKE expressions are internally converted to a regular expression and exec ## notLike(haystack, pattern), haystack NOT LIKE pattern operator -The same thing as ‘like’, but negative. +The same thing as `like`, but negative. ## ilike @@ -611,9 +611,9 @@ Result: └────┴─────────┴──────┘ ``` -**See Also** - +## notILike(haystack, pattern), haystack NOT ILIKE pattern operator +The same thing as `ilike`, but negative. 
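A short sketch of the operator forms described above, using constants so the snippet is self-contained (`1` means the predicate matched):

```sql
SELECT
    'ClickHouse' REGEXP '^Click'    AS regexp_match,    -- 1: matches the re2 pattern
    'ClickHouse' ILIKE '%HOUSE'     AS ilike_match,     -- 1: case-insensitive LIKE
    'ClickHouse' NOT ILIKE '%HOUSE' AS not_ilike_match; -- 0: negation of the above
```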
## ngramDistance(haystack, needle) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 5d96113fe50..90f6cf0aa7d 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1148,6 +1148,85 @@ Result: └───────────────────────────┴──────────────────────────────┘ ``` +## parseDateTime {#type_conversion_functions-parseDateTime} + +Converts a [String](/docs/en/sql-reference/data-types/string.md) to [DateTime](/docs/en/sql-reference/data-types/datetime.md) according to a [MySQL format string](https://dev.mysql.com/doc/refman/8.0/en/date-and-time-functions.html#function_date-format). + +This function is the opposite operation of function [formatDateTime](/docs/en/sql-reference/functions/date-time-functions.md#date_time_functions-formatDateTime). + +**Syntax** + +``` sql +parseDateTime(str, format[, timezone]) +``` + +**Arguments** + +- `str` — the String to be parsed +- `format` — the format string +- `timezone` — [Timezone](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-timezone). Optional. + +**Returned value(s)** + +Returns DateTime values parsed from input string according to a MySQL style format string. + +**Supported format specifiers** + +All format specifiers listed in [formatDateTime](/docs/en/sql-reference/functions/date-time-functions.md#date_time_functions-formatDateTime) except: +- %f: fractional second +- %Q: Quarter (1-4) + +**Example** + +``` sql +SELECT parseDateTime('2021-01-04+23:00:00', '%Y-%m-%d+%H:%i:%s') + +┌─parseDateTime('2021-01-04+23:00:00', '%Y-%m-%d+%H:%i:%s')─┠+│ 2021-01-04 23:00:00 │ +└───────────────────────────────────────────────────────────┘ +``` + +Alias: `TO_TIMESTAMP`. + +## parseDateTimeInJodaSyntax {#type_conversion_functions-parseDateTimeInJodaSyntax} + +Similar to [parseDateTime](#parsedatetime), except that the format string is in [Joda](https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html) instead of MySQL syntax. + +This function is the opposite operation of function [formatDateTimeInJodaSyntax](/docs/en/sql-reference/functions/date-time-functions.md#date_time_functions-formatDateTimeInJodaSyntax). + +**Syntax** + +``` sql +parseDateTimeInJodaSyntax(str, format[, timezone]) +``` + +**Arguments** + +- `str` — the String to be parsed +- `format` — the format string +- `timezone` — [Timezone](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-timezone). Optional. + +**Returned value(s)** + +Returns DateTime values parsed from input string according to a Joda style format. 
+ +**Supported format specifiers** + +All format specifiers listed in [formatDateTimeInJoda](/docs/en/sql-reference/functions/date-time-functions.md#date_time_functions-formatDateTime) are supported, except: +- S: fraction of second +- z: time zone +- Z: time zone offset/id + +**Example** + +``` sql +SELECT parseDateTimeInJodaSyntax('2023-02-24 14:53:31', 'yyyy-MM-dd HH:mm:ss', 'Europe/Minsk') + +┌─parseDateTimeInJodaSyntax('2023-02-24 14:53:31', 'yyyy-MM-dd HH:mm:ss', 'Europe/Minsk')─┠+│ 2023-02-24 14:53:31 │ +└─────────────────────────────────────────────────────────────────────────────────────────┘ +``` + ## parseDateTimeBestEffort ## parseDateTime32BestEffort @@ -1351,7 +1430,6 @@ Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort), except that Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort), except that this function prefers US date format (`MM/DD/YYYY` etc.) in case of ambiguity and returns zero date or zero date time when it encounters a date format that cannot be processed. - ## toLowCardinality Converts input parameter to the [LowCardinality](/docs/en/sql-reference/data-types/lowcardinality.md) version of same data type. diff --git a/docs/en/sql-reference/index.md b/docs/en/sql-reference/index.md new file mode 100644 index 00000000000..eddc5b204d9 --- /dev/null +++ b/docs/en/sql-reference/index.md @@ -0,0 +1,22 @@ +--- +keywords: [clickhouse, docs, sql reference, sql statements, sql, syntax] +title: SQL Reference +--- + +import { TwoColumnList } from '/src/components/two_column_list' +import { ClickableSquare } from '/src/components/clickable_square' +import { HorizontalDivide } from '/src/components/horizontal_divide' +import { ViewAllLink } from '/src/components/view_all_link' +import { VideoContainer } from '/src/components/video_container' + +import LinksDeployment from './sql-reference-links.json' + +# ClickHouse SQL Reference + +ClickHouse supports a declarative query language based on SQL that is identical to the ANSI SQL standard in many cases. + +Supported queries include GROUP BY, ORDER BY, subqueries in FROM, JOIN clause, IN operator, window functions and scalar subqueries. + + + + \ No newline at end of file diff --git a/docs/en/sql-reference/sql-reference-links.json b/docs/en/sql-reference/sql-reference-links.json new file mode 100644 index 00000000000..3811ad18462 --- /dev/null +++ b/docs/en/sql-reference/sql-reference-links.json @@ -0,0 +1,12 @@ +[ + { + "title": "Statements", + "description": "A list of available SQL statements in ClickHouse", + "url": "/docs/en/sql-reference/statements/" + }, + { + "title": "Database and Table Engines", + "description": "Engines determine where and how your data is stored", + "url": "/docs/en/engines/table-engines" + } +] diff --git a/docs/en/sql-reference/statements/alter/projection.md b/docs/en/sql-reference/statements/alter/projection.md index 908d28d7ab1..030e9352a00 100644 --- a/docs/en/sql-reference/statements/alter/projection.md +++ b/docs/en/sql-reference/statements/alter/projection.md @@ -17,7 +17,7 @@ Projections will create internally a new hidden table, this means that more IO a Example, If the projection has defined a different primary key, all the data from the original table will be duplicated. ::: -You can see more technical details about how projections work internally on this [page](/docs/en/guides/improving-query-performance/sparse-primary-indexes/sparse-primary-indexes-multiple.md/#option-3-projections). 
+You can see more technical details about how projections work internally on this [page](/docs/en/guides/best-practices/sparse-primary-indexes.md/#option-3-projections). ## Example filtering without using primary keys @@ -37,7 +37,7 @@ Using `ALTER TABLE`, we could add the Projection to an existing table: ``` ALTER TABLE visits_order ADD PROJECTION user_name_projection ( SELECT -* +* ORDER BY user_name ) @@ -128,7 +128,7 @@ SELECT user_agent, sum(pages_visited) FROM visits -GROUP BY user_id +GROUP BY user_agent ``` As mentioned before, we could review the `system.query_log` table. On the `projections` field we have the name of the projection used or empty if none has been used: @@ -161,6 +161,6 @@ The commands `ADD`, `DROP` and `CLEAR` are lightweight in a sense that they only Also, they are replicated, syncing projections metadata via ClickHouse Keeper or ZooKeeper. -:::note +:::note Projection manipulation is supported only for tables with [`*MergeTree`](/docs/en/engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](/docs/en/engines/table-engines/mergetree-family/replication.md) variants). ::: diff --git a/docs/en/sql-reference/statements/alter/view.md b/docs/en/sql-reference/statements/alter/view.md index e382cdace30..5c5bf0355f6 100644 --- a/docs/en/sql-reference/statements/alter/view.md +++ b/docs/en/sql-reference/statements/alter/view.md @@ -6,11 +6,166 @@ sidebar_label: VIEW # ALTER TABLE … MODIFY QUERY Statement -You can modify `SELECT` query that was specified when a [materialized view](../create/view.md#materialized) was created with the `ALTER TABLE … MODIFY QUERY` statement. Use it when the materialized view was created without the `TO [db.]name` clause. The `allow_experimental_alter_materialized_view_structure` setting must be enabled. +You can modify the `SELECT` query that was specified when a [materialized view](../create/view.md#materialized) was created with the `ALTER TABLE … MODIFY QUERY` statement, without interrupting the ingestion process. -If a materialized view uses the `TO [db.]name` construction, you must [DETACH](../detach.md) the view, run [ALTER TABLE](index.md) query for the target table, and then [ATTACH](../attach.md) the previously detached (`DETACH`) view. +The `allow_experimental_alter_materialized_view_structure` setting must be enabled. -**Example** +This command is intended for materialized views created with the `TO [db.]name` clause. It does not change the structure of the underlying storage table, and it does not change the column definitions of the materialized view; because of this, the command is of very limited use for materialized views created without the `TO [db.]name` clause.
+ +**Example with TO table** + +```sql +CREATE TABLE events (ts DateTime, event_type String) +ENGINE = MergeTree ORDER BY (event_type, ts); + +CREATE TABLE events_by_day (ts DateTime, event_type String, events_cnt UInt64) +ENGINE = SummingMergeTree ORDER BY (event_type, ts); + +CREATE MATERIALIZED VIEW mv TO events_by_day AS +SELECT toStartOfDay(ts) ts, event_type, count() events_cnt +FROM events +GROUP BY ts, event_type; + +INSERT INTO events +SELECT Date '2020-01-01' + interval number * 900 second, + ['imp', 'click'][number%2+1] +FROM numbers(100); + +SELECT ts, event_type, sum(events_cnt) +FROM events_by_day +GROUP BY ts, event_type +ORDER BY ts, event_type; + +┌──────────────────ts─┬─event_type─┬─sum(events_cnt)─┠+│ 2020-01-01 00:00:00 │ click │ 48 │ +│ 2020-01-01 00:00:00 │ imp │ 48 │ +│ 2020-01-02 00:00:00 │ click │ 2 │ +│ 2020-01-02 00:00:00 │ imp │ 2 │ +└─────────────────────┴────────────┴─────────────────┘ + +-- Let's add the new measurment `cost` +-- and the new dimension `browser`. + +ALTER TABLE events + ADD COLUMN browser String, + ADD COLUMN cost Float64; + +-- Column do not have to match in a materialized view and TO +-- (destination table), so the next alter does not break insertion. + +ALTER TABLE events_by_day + ADD COLUMN cost Float64, + ADD COLUMN browser String after event_type, + MODIFY ORDER BY (event_type, ts, browser); + +INSERT INTO events +SELECT Date '2020-01-02' + interval number * 900 second, + ['imp', 'click'][number%2+1], + ['firefox', 'safary', 'chrome'][number%3+1], + 10/(number+1)%33 +FROM numbers(100); + +-- New columns `browser` and `cost` are empty because we did not change Materialized View yet. + +SELECT ts, event_type, browser, sum(events_cnt) events_cnt, round(sum(cost),2) cost +FROM events_by_day +GROUP BY ts, event_type, browser +ORDER BY ts, event_type; + +┌──────────────────ts─┬─event_type─┬─browser─┬─events_cnt─┬─cost─┠+│ 2020-01-01 00:00:00 │ click │ │ 48 │ 0 │ +│ 2020-01-01 00:00:00 │ imp │ │ 48 │ 0 │ +│ 2020-01-02 00:00:00 │ click │ │ 50 │ 0 │ +│ 2020-01-02 00:00:00 │ imp │ │ 50 │ 0 │ +│ 2020-01-03 00:00:00 │ click │ │ 2 │ 0 │ +│ 2020-01-03 00:00:00 │ imp │ │ 2 │ 0 │ +└─────────────────────┴────────────┴─────────┴────────────┴──────┘ + +SET allow_experimental_alter_materialized_view_structure=1; + +ALTER TABLE mv MODIFY QUERY + SELECT toStartOfDay(ts) ts, event_type, browser, + count() events_cnt, + sum(cost) cost + FROM events + GROUP BY ts, event_type, browser; + +INSERT INTO events +SELECT Date '2020-01-03' + interval number * 900 second, + ['imp', 'click'][number%2+1], + ['firefox', 'safary', 'chrome'][number%3+1], + 10/(number+1)%33 +FROM numbers(100); + +SELECT ts, event_type, browser, sum(events_cnt) events_cnt, round(sum(cost),2) cost +FROM events_by_day +GROUP BY ts, event_type, browser +ORDER BY ts, event_type; + +┌──────────────────ts─┬─event_type─┬─browser─┬─events_cnt─┬──cost─┠+│ 2020-01-01 00:00:00 │ click │ │ 48 │ 0 │ +│ 2020-01-01 00:00:00 │ imp │ │ 48 │ 0 │ +│ 2020-01-02 00:00:00 │ click │ │ 50 │ 0 │ +│ 2020-01-02 00:00:00 │ imp │ │ 50 │ 0 │ +│ 2020-01-03 00:00:00 │ click │ firefox │ 16 │ 6.84 │ +│ 2020-01-03 00:00:00 │ click │ │ 2 │ 0 │ +│ 2020-01-03 00:00:00 │ click │ safary │ 16 │ 9.82 │ +│ 2020-01-03 00:00:00 │ click │ chrome │ 16 │ 5.63 │ +│ 2020-01-03 00:00:00 │ imp │ │ 2 │ 0 │ +│ 2020-01-03 00:00:00 │ imp │ firefox │ 16 │ 15.14 │ +│ 2020-01-03 00:00:00 │ imp │ safary │ 16 │ 6.14 │ +│ 2020-01-03 00:00:00 │ imp │ chrome │ 16 │ 7.89 │ +│ 2020-01-04 00:00:00 │ click │ safary │ 1 │ 0.1 │ +│ 2020-01-04 00:00:00 │ click │ 
firefox │ 1 │ 0.1 │ +│ 2020-01-04 00:00:00 │ imp │ firefox │ 1 │ 0.1 │ +│ 2020-01-04 00:00:00 │ imp │ chrome │ 1 │ 0.1 │ +└─────────────────────┴────────────┴─────────┴────────────┴───────┘ + +-- !!! During `MODIFY ORDER BY` PRIMARY KEY was implicitly introduced. + +SHOW CREATE TABLE events_by_day FORMAT TSVRaw + +CREATE TABLE test.events_by_day +( + `ts` DateTime, + `event_type` String, + `browser` String, + `events_cnt` UInt64, + `cost` Float64 +) +ENGINE = SummingMergeTree +PRIMARY KEY (event_type, ts) +ORDER BY (event_type, ts, browser) +SETTINGS index_granularity = 8192 + +-- !!! The columns' definition is unchanged but it does not matter, we are not quering +-- MATERIALIZED VIEW, we are quering TO (storage) table. +-- SELECT section is updated. + +SHOW CREATE TABLE mv FORMAT TSVRaw; + +CREATE MATERIALIZED VIEW test.mv TO test.events_by_day +( + `ts` DateTime, + `event_type` String, + `events_cnt` UInt64 +) AS +SELECT + toStartOfDay(ts) AS ts, + event_type, + browser, + count() AS events_cnt, + sum(cost) AS cost +FROM test.events +GROUP BY + ts, + event_type, + browser +``` + +**Example without TO table** + +The application is very limited because you can only change the `SELECT` section without adding new columns. ```sql CREATE TABLE src_table (`a` UInt32) ENGINE = MergeTree ORDER BY a; @@ -25,6 +180,7 @@ SELECT * FROM mv; └───┘ ``` ```sql +set allow_experimental_alter_materialized_view_structure=1; ALTER TABLE mv MODIFY QUERY SELECT a * 2 as a FROM src_table; INSERT INTO src_table (a) VALUES (3), (4); SELECT * FROM mv; diff --git a/docs/en/sql-reference/statements/create/dictionary.md b/docs/en/sql-reference/statements/create/dictionary.md index e789dd9257f..29c72d62f24 100644 --- a/docs/en/sql-reference/statements/create/dictionary.md +++ b/docs/en/sql-reference/statements/create/dictionary.md @@ -5,7 +5,7 @@ sidebar_label: DICTIONARY title: "CREATE DICTIONARY" --- -Creates a new [dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) with given [structure](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md), [source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md), [layout](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md) and [lifetime](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md). +Creates a new [dictionary](../../../sql-reference/dictionaries/index.md) with given [structure](../../../sql-reference/dictionaries/index.md#dictionary-key-and-fields), [source](../../../sql-reference/dictionaries/index.md#dictionary-sources), [layout](../../../sql-reference/dictionaries/index.md#storig-dictionaries-in-memory) and [lifetime](../../../sql-reference/dictionaries/index.md#dictionary-updates). ## Syntax @@ -29,7 +29,7 @@ The dictionary structure consists of attributes. Dictionary attributes are speci `ON CLUSTER` clause allows creating dictionary on a cluster, see [Distributed DDL](../../../sql-reference/distributed-ddl.md). -Depending on dictionary [layout](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md) one or more attributes can be specified as dictionary keys. +Depending on dictionary [layout](../../../sql-reference/dictionaries/index.md#storig-dictionaries-in-memory) one or more attributes can be specified as dictionary keys. 
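As a hedged sketch of a dictionary whose key spans more than one attribute (the source table and all names below are invented; a real definition would reference an existing table):

```sql
CREATE DICTIONARY taxi_zone_dict
(
    borough String,
    zone String,
    service_zone String
)
PRIMARY KEY borough, zone               -- two attributes form the key
SOURCE(CLICKHOUSE(TABLE 'taxi_zones'))  -- assumes a local table `taxi_zones`
LAYOUT(COMPLEX_KEY_HASHED())            -- composite keys require a complex_key* layout
LIFETIME(MIN 300 MAX 600);
```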
## SOURCE @@ -125,9 +125,9 @@ LAYOUT(HASHED()) ### Create a dictionary from another database -Please see the details in [Dictionary sources](/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md/#dbms). +Please see the details in [Dictionary sources](/docs/en/sql-reference/dictionaries/index.md#dictionary-sources/#dbms). **See Also** -- For more information, see the [Dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) section. -- [system.dictionaries](../../../operations/system-tables/dictionaries.md) — This table contains information about [Dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). +- For more information, see the [Dictionaries](../../../sql-reference/dictionaries/index.md) section. +- [system.dictionaries](../../../operations/system-tables/dictionaries.md) — This table contains information about [Dictionaries](../../../sql-reference/dictionaries/index.md). diff --git a/docs/en/sql-reference/statements/create/quota.md b/docs/en/sql-reference/statements/create/quota.md index 3952743b480..7c31f93fff7 100644 --- a/docs/en/sql-reference/statements/create/quota.md +++ b/docs/en/sql-reference/statements/create/quota.md @@ -5,7 +5,7 @@ sidebar_label: QUOTA title: "CREATE QUOTA" --- -Creates a [quota](../../../operations/access-rights.md#quotas-management) that can be assigned to a user or a role. +Creates a [quota](../../../guides/sre/user-management/index.md#quotas-management) that can be assigned to a user or a role. Syntax: diff --git a/docs/en/sql-reference/statements/create/role.md b/docs/en/sql-reference/statements/create/role.md index 68fdd51e957..9b14e220e1f 100644 --- a/docs/en/sql-reference/statements/create/role.md +++ b/docs/en/sql-reference/statements/create/role.md @@ -5,7 +5,7 @@ sidebar_label: ROLE title: "CREATE ROLE" --- -Creates new [roles](../../../operations/access-rights.md#role-management). Role is a set of [privileges](../../../sql-reference/statements/grant.md#grant-privileges). A [user](../../../sql-reference/statements/create/user.md) assigned a role gets all the privileges of this role. +Creates new [roles](../../../guides/sre/user-management/index.md#role-management). Role is a set of [privileges](../../../sql-reference/statements/grant.md#grant-privileges). A [user](../../../sql-reference/statements/create/user.md) assigned a role gets all the privileges of this role. Syntax: @@ -22,7 +22,7 @@ User can have default roles which apply at user login. To set default roles, use To revoke a role, use the [REVOKE](../../../sql-reference/statements/revoke.md) statement. -To delete role, use the [DROP ROLE](../../../sql-reference/statements/drop#drop-role-statement) statement. The deleted role is being automatically revoked from all the users and roles to which it was assigned. +To delete role, use the [DROP ROLE](../../../sql-reference/statements/drop.md#drop-role-statement) statement. The deleted role is being automatically revoked from all the users and roles to which it was assigned. ## Examples diff --git a/docs/en/sql-reference/statements/create/row-policy.md b/docs/en/sql-reference/statements/create/row-policy.md index 31ce9221eea..56a57534234 100644 --- a/docs/en/sql-reference/statements/create/row-policy.md +++ b/docs/en/sql-reference/statements/create/row-policy.md @@ -5,9 +5,9 @@ sidebar_label: ROW POLICY title: "CREATE ROW POLICY" --- -Creates a [row policy](../../../operations/access-rights.md#row-policy-management), i.e. 
a filter used to determine which rows a user can read from a table. +Creates a [row policy](../../../guides/sre/user-management/index.md#row-policy-management), i.e. a filter used to determine which rows a user can read from a table. -:::warning +:::warning Row policies makes sense only for users with readonly access. If user can modify table or copy partitions between tables, it defeats the restrictions of row policies. ::: @@ -31,7 +31,7 @@ In the section `TO` you can provide a list of users and roles this policy should Keyword `ALL` means all the ClickHouse users including current user. Keyword `ALL EXCEPT` allow to exclude some users from the all users list, for example, `CREATE ROW POLICY ... TO ALL EXCEPT accountant, john@localhost` -:::note +:::note If there are no row policies defined for a table then any user can `SELECT` all the row from the table. Defining one or more row policies for the table makes the access to the table depending on the row policies no matter if those row policies are defined for the current user or not. For example, the following policy `CREATE ROW POLICY pol1 ON mydb.table1 USING b=1 TO mira, peter` diff --git a/docs/en/sql-reference/statements/create/settings-profile.md b/docs/en/sql-reference/statements/create/settings-profile.md index c2424ff6046..8e221a4d82f 100644 --- a/docs/en/sql-reference/statements/create/settings-profile.md +++ b/docs/en/sql-reference/statements/create/settings-profile.md @@ -5,7 +5,7 @@ sidebar_label: SETTINGS PROFILE title: "CREATE SETTINGS PROFILE" --- -Creates [settings profiles](../../../operations/access-rights.md#settings-profiles-management) that can be assigned to a user or a role. +Creates [settings profiles](../../../guides/sre/user-management/index.md#settings-profiles-management) that can be assigned to a user or a role. Syntax: @@ -27,7 +27,7 @@ CREATE USER robin IDENTIFIED BY 'password'; Create the `max_memory_usage_profile` settings profile with value and constraints for the `max_memory_usage` setting and assign it to user `robin`: ``` sql -CREATE -SETTINGS PROFILE max_memory_usage_profile SETTINGS max_memory_usage = 100000001 MIN 90000000 MAX 110000000 +CREATE +SETTINGS PROFILE max_memory_usage_profile SETTINGS max_memory_usage = 100000001 MIN 90000000 MAX 110000000 TO robin ``` diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index de39d960476..b29bf31e26c 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -313,7 +313,9 @@ Defines storage time for values. Can be specified only for MergeTree-family tabl ## Column Compression Codecs -By default, ClickHouse applies the `lz4` compression method. For `MergeTree`-engine family you can change the default compression method in the [compression](../../../operations/server-configuration-parameters/settings.md#server-settings-compression) section of a server configuration. +By default, ClickHouse applies `lz4` compression in the self-managed version, and `zstd` in ClickHouse Cloud. + +For `MergeTree`-engine family you can change the default compression method in the [compression](../../../operations/server-configuration-parameters/settings.md#server-settings-compression) section of a server configuration. You can also define the compression method for each individual column in the `CREATE TABLE` query. 
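For instance, a minimal sketch of per-column codecs (table and column names are invented; the codec choices are only illustrative):

```sql
CREATE TABLE codec_example
(
    ts   DateTime CODEC(Delta, ZSTD),  -- delta-encode timestamps, then compress with ZSTD
    id   UInt64   CODEC(DoubleDelta),  -- suits slowly growing counters
    body String   CODEC(ZSTD(3))       -- higher ZSTD level for text
)
ENGINE = MergeTree
ORDER BY (id, ts);
```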
@@ -379,8 +381,8 @@ High compression levels are useful for asymmetric scenarios, like compress once, `DEFLATE_QPL` — [Deflate compression algorithm](https://github.com/intel/qpl) implemented by Intel® Query Processing Library. Some limitations apply: - DEFLATE_QPL is experimental and can only be used after setting configuration parameter `allow_experimental_codecs=1`. -- DEFLATE_QPL only works if ClickHouse was compiled with support for AVX2 or AVX512 instructions -- DEFLATE_QPL works best if the system has a Intel® IAA (In-Memory Analytics Accelerator) offloading device +- DEFLATE_QPL only works if ClickHouse was compiled with support for AVX2 or AVX512 instructions. Refer to [Build Clickhouse with DEFLATE_QPL](/docs/en/development/building_and_benchmarking_deflate_qpl.md/#Build-Clickhouse-with-DEFLATE_QPL) for more details. +- DEFLATE_QPL works best if the system has a Intel® IAA (In-Memory Analytics Accelerator) offloading device. Refer to [Accelerator Configuration](https://intel.github.io/qpl/documentation/get_started_docs/installation.html#accelerator-configuration) and [Benchmark with DEFLATE_QPL](/docs/en/development/building_and_benchmarking_deflate_qpl.md/#Run-Benchmark-with-DEFLATE_QPL) for more details. - DEFLATE_QPL-compressed data can only be transferred between ClickHouse nodes compiled with support for AVX2/AVX512 ### Specialized Codecs diff --git a/docs/en/sql-reference/statements/create/user.md b/docs/en/sql-reference/statements/create/user.md index a756b3d4a0d..a38523ee37b 100644 --- a/docs/en/sql-reference/statements/create/user.md +++ b/docs/en/sql-reference/statements/create/user.md @@ -5,7 +5,7 @@ sidebar_label: USER title: "CREATE USER" --- -Creates [user accounts](../../../operations/access-rights.md#user-account-management). +Creates [user accounts](../../../guides/sre/user-management/index.md#user-account-management). Syntax: @@ -36,7 +36,61 @@ There are multiple ways of user identification: - `IDENTIFIED WITH kerberos` or `IDENTIFIED WITH kerberos REALM 'realm'` - `IDENTIFIED WITH ssl_certificate CN 'mysite.com:user'` -For identification with sha256_hash using `SALT` - hash must be calculated from concatination of 'password' and 'salt'. +## Examples + +1. The following username is `name1` and does not require a password - which obviously doesn't provide much security: + + ```sql + CREATE USER name1 NOT IDENTIFIED + ``` + +2. To specify a plaintext password: + + ```sql + CREATE USER name2 IDENTIFIED WITH plaintext_password BY 'my_password' + ``` + + :::warning + The password is stored in a SQL text file in `/var/lib/clickhouse/access`, so it's not a good idea to use `plaintext_password`. Try `sha256_password` instead, as demonstrated next... + ::: + +3. The best option is to use a password that is hashed using SHA-256. ClickHouse will hash the password for you when you specify `IDENTIFIED WITH sha256_password`. For example: + + ```sql + CREATE USER name3 IDENTIFIED WITH sha256_password BY 'my_password' + ``` + + Notice ClickHouse generates and runs the following command for you: + + ```response + CREATE USER name3 + IDENTIFIED WITH sha256_hash + BY '8B3404953FCAA509540617F082DB13B3E0734F90FF6365C19300CC6A6EA818D6' + SALT 'D6489D8B5692D82FF944EA6415785A8A8A1AF33825456AFC554487725A74A609' + ``` + + The `name3` user can now login using `my_password`, but the password is stored as the hashed value above. 
The following SQL file was created in `/var/lib/clickhouse/access` and gets executed at server startup: + + ```bash + /var/lib/clickhouse/access $ cat 3843f510-6ebd-a52d-72ac-e021686d8a93.sql + ATTACH USER name3 IDENTIFIED WITH sha256_hash BY '0C268556C1680BEF0640AAC1E7187566704208398DA31F03D18C74F5C5BE5053' SALT '4FB16307F5E10048196966DD7E6876AE53DE6A1D1F625488482C75F14A5097C7'; + ``` + + :::note + If you have already created a hash value and corresponding salt value for a username, then you can use `IDENTIFIED WITH sha256_hash BY 'hash'` or `IDENTIFIED WITH sha256_hash BY 'hash' SALT 'salt'`. For identification with `sha256_hash` using `SALT`, the hash must be calculated from the concatenation of 'password' and 'salt'. + ::: + +4. The `double_sha1_password` is not typically needed, but comes in handy when working with clients that require it (like the MySQL interface): + + ```sql + CREATE USER name4 IDENTIFIED WITH double_sha1_password BY 'my_password' + ``` + + ClickHouse generates and runs the following query: + + ```response + CREATE USER name4 IDENTIFIED WITH double_sha1_hash BY 'CCD3A959D6A004B9C3807B728BC2E55B67E10518' + ``` ## User Host diff --git a/docs/en/sql-reference/statements/delete.md b/docs/en/sql-reference/statements/delete.md index e1987e50af4..7d7b8855d51 100644 --- a/docs/en/sql-reference/statements/delete.md +++ b/docs/en/sql-reference/statements/delete.md @@ -30,12 +30,6 @@ SET allow_experimental_lightweight_delete = true; ::: -An [alternative way to delete rows](./alter/delete.md) in ClickHouse is `ALTER TABLE ... DELETE`, which might be more efficient if you do bulk deletes only occasionally and don’t need the operation to be applied instantly. In most use cases the new lightweight `DELETE FROM` behavior will be considerably faster. - -:::warning -Even though deletes are becoming more lightweight in ClickHouse, they should still not be used as aggressively as on an OLTP system. Lightweight deletes are currently efficient for wide parts, but for compact parts, they can be a heavyweight operation, and it may be better to use `ALTER TABLE` for some scenarios. -::: - :::note `DELETE FROM` requires the `ALTER DELETE` privilege: ```sql @@ -51,7 +45,7 @@ The idea behind Lightweight Delete is that when a `DELETE FROM table ...` query The mask is implemented as a hidden `_row_exists` system column that stores True for all visible rows and False for deleted ones. This column is only present in a part if some rows in this part were deleted. In other words, the column is not persisted when it has all values equal to True. ## SELECT query -When the column is present `SELECT ... FROM table WHERE condition` query internally is extended by an additional predicate on `_row_exists` and becomes similar to +When the column is present `SELECT ... FROM table WHERE condition` query internally is extended by an additional predicate on `_row_exists` and becomes similar to ```sql SELECT ... FROM table PREWHERE _row_exists WHERE condition ``` diff --git a/docs/en/sql-reference/statements/describe-table.md b/docs/en/sql-reference/statements/describe-table.md index 4864743abbc..b9190107127 100644 --- a/docs/en/sql-reference/statements/describe-table.md +++ b/docs/en/sql-reference/statements/describe-table.md @@ -24,9 +24,9 @@ The `DESCRIBE` statement returns a row for each table column with the following - `ttl_expression` — A [TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) expression. - `is_subcolumn` — A flag that equals `1` for internal subcolumns.
It is included into the result only if subcolumn description is enabled by the [describe_include_subcolumns](../../operations/settings/settings.md#describe_include_subcolumns) setting. -All columns in [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) data structures are described separately. The name of each column is prefixed with a parent column name and a dot. +All columns in [Nested](../../sql-reference/data-types/nested-data-structures/index.md) data structures are described separately. The name of each column is prefixed with a parent column name and a dot. -To show internal subcolumns of other data types, use the [describe_include_subcolumns](../../operations/settings/settings.md#describe_include_subcolumns) setting. +To show internal subcolumns of other data types, use the [describe_include_subcolumns](../../operations/settings/settings.md#describe_include_subcolumns) setting. **Example** diff --git a/docs/en/sql-reference/statements/detach.md b/docs/en/sql-reference/statements/detach.md index aa87b1ef613..5f1513d3f44 100644 --- a/docs/en/sql-reference/statements/detach.md +++ b/docs/en/sql-reference/statements/detach.md @@ -22,7 +22,7 @@ System log tables can be also attached back (e.g. `query_log`, `text_log`, etc). Note that you can not detach permanently the table which is already detached (temporary). But you can attach it back and then detach permanently again. -Also you can not [DROP](../../sql-reference/statements/drop#drop-table) the detached table, or [CREATE TABLE](../../sql-reference/statements/create/table.md) with the same name as detached permanently, or replace it with the other table with [RENAME TABLE](../../sql-reference/statements/rename.md) query. +Also you can not [DROP](../../sql-reference/statements/drop.md#drop-table) the detached table, or [CREATE TABLE](../../sql-reference/statements/create/table.md) with the same name as detached permanently, or replace it with the other table with [RENAME TABLE](../../sql-reference/statements/rename.md) query. The `SYNC` modifier executes the action without delay. diff --git a/docs/en/sql-reference/statements/index.md b/docs/en/sql-reference/statements/index.md index b286d8c932d..100b8889aaa 100644 --- a/docs/en/sql-reference/statements/index.md +++ b/docs/en/sql-reference/statements/index.md @@ -1,10 +1,10 @@ --- slug: /en/sql-reference/statements/ sidebar_position: 1 -sidebar_label: Statements +sidebar_label: List of statements --- -# ClickHouse SQL Statements +# ClickHouse SQL Statements Statements represent various kinds of action you can perform using SQL queries. Each kind of statement has it’s own syntax and usage details that are described separately: diff --git a/docs/en/sql-reference/statements/insert-into.md b/docs/en/sql-reference/statements/insert-into.md index f2d590d196b..354ab95c598 100644 --- a/docs/en/sql-reference/statements/insert-into.md +++ b/docs/en/sql-reference/statements/insert-into.md @@ -4,7 +4,7 @@ sidebar_position: 33 sidebar_label: INSERT INTO --- -# INSERT INTO Statement +# INSERT INTO Statement Inserts data into a table. @@ -89,7 +89,7 @@ INSERT INTO t FORMAT TabSeparated 22 Qwerty ``` -You can insert data separately from the query by using the command-line client or the HTTP interface. For more information, see the section “[Interfaces](../../interfaces)â€. +You can insert data separately from the query by using the [command-line client](/docs/en/integrations/sql-clients/clickhouse-client-local) or the [HTTP interface](/docs/en/interfaces/http/). 
:::note If you want to specify `SETTINGS` for `INSERT` query then you have to do it _before_ `FORMAT` clause since everything after `FORMAT format_name` is treated as data. For example: @@ -129,7 +129,7 @@ To insert a default value instead of `NULL` into a column with not nullable data INSERT INTO [db.]table [(c1, c2, c3)] FROM INFILE file_name [COMPRESSION type] FORMAT format_name ``` -Use the syntax above to insert data from a file, or files, stored on the **client** side. `file_name` and `type` are string literals. Input file [format](../../interfaces/formats.md) must be set in the `FORMAT` clause. +Use the syntax above to insert data from a file, or files, stored on the **client** side. `file_name` and `type` are string literals. Input file [format](../../interfaces/formats.md) must be set in the `FORMAT` clause. Compressed files are supported. The compression type is detected by the extension of the file name. Or it can be explicitly specified in a `COMPRESSION` clause. Supported types are: `'none'`, `'gzip'`, `'deflate'`, `'br'`, `'xz'`, `'zstd'`, `'lz4'`, `'bz2'`. @@ -191,7 +191,7 @@ INSERT INTO [TABLE] FUNCTION table_func ... ``` sql CREATE TABLE simple_table (id UInt32, text String) ENGINE=MergeTree() ORDER BY id; -INSERT INTO TABLE FUNCTION remote('localhost', default.simple_table) +INSERT INTO TABLE FUNCTION remote('localhost', default.simple_table) VALUES (100, 'inserted via remote()'); SELECT * FROM simple_table; ``` diff --git a/docs/en/sql-reference/statements/select/array-join.md b/docs/en/sql-reference/statements/select/array-join.md index a1b5e0cdb36..3d88a0f9b7a 100644 --- a/docs/en/sql-reference/statements/select/array-join.md +++ b/docs/en/sql-reference/statements/select/array-join.md @@ -146,7 +146,7 @@ ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS ma └───────┴─────────┴───┴─────┴────────┘ ``` -The example below uses the [arrayEnumerate](../../../sql-reference/functions/array-functions#array_functions-arrayenumerate) function: +The example below uses the [arrayEnumerate](../../../sql-reference/functions/array-functions.md#array_functions-arrayenumerate) function: ``` sql SELECT s, arr, a, num, arrayEnumerate(arr) @@ -166,8 +166,8 @@ ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num; Multiple arrays with different sizes can be joined by using: `SETTINGS enable_unaligned_array_join = 1`. 
Example: ```sql -SELECT s, arr, a, b -FROM arrays_test ARRAY JOIN arr as a, [['a','b'],['c']] as b +SELECT s, arr, a, b +FROM arrays_test ARRAY JOIN arr as a, [['a','b'],['c']] as b SETTINGS enable_unaligned_array_join = 1; ``` @@ -185,7 +185,7 @@ SETTINGS enable_unaligned_array_join = 1; ## ARRAY JOIN with Nested Data Structure -`ARRAY JOIN` also works with [nested data structures](../../../sql-reference/data-types/nested-data-structures/nested.md): +`ARRAY JOIN` also works with [nested data structures](../../../sql-reference/data-types/nested-data-structures/index.md): ``` sql CREATE TABLE nested_test @@ -278,7 +278,7 @@ ARRAY JOIN nest AS n; └───────┴─────┴─────┴─────────┴────────────┘ ``` -Example of using the [arrayEnumerate](../../../sql-reference/functions/array-functions#array_functions-arrayenumerate) function: +Example of using the [arrayEnumerate](../../../sql-reference/functions/array-functions.md#array_functions-arrayenumerate) function: ``` sql SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y`, num diff --git a/docs/en/sql-reference/statements/select/group-by.md b/docs/en/sql-reference/statements/select/group-by.md index 2a4b06660c7..1018b24f50b 100644 --- a/docs/en/sql-reference/statements/select/group-by.md +++ b/docs/en/sql-reference/statements/select/group-by.md @@ -8,12 +8,12 @@ sidebar_label: GROUP BY `GROUP BY` clause switches the `SELECT` query into an aggregation mode, which works as follows: - `GROUP BY` clause contains a list of expressions (or a single expression, which is considered to be the list of length one). This list acts as a “grouping keyâ€, while each individual expression will be referred to as a “key expressionâ€. -- All the expressions in the [SELECT](../../../sql-reference/statements/select/index.md), [HAVING](../../../sql-reference/statements/select/having), and [ORDER BY](../../../sql-reference/statements/select/order-by.md) clauses **must** be calculated based on key expressions **or** on [aggregate functions](../../../sql-reference/aggregate-functions/index.md) over non-key expressions (including plain columns). In other words, each column selected from the table must be used either in a key expression or inside an aggregate function, but not both. +- All the expressions in the [SELECT](../../../sql-reference/statements/select/index.md), [HAVING](../../../sql-reference/statements/select/having.md), and [ORDER BY](../../../sql-reference/statements/select/order-by.md) clauses **must** be calculated based on key expressions **or** on [aggregate functions](../../../sql-reference/aggregate-functions/index.md) over non-key expressions (including plain columns). In other words, each column selected from the table must be used either in a key expression or inside an aggregate function, but not both. - Result of aggregating `SELECT` query will contain as many rows as there were unique values of “grouping key†in source table. Usually, this significantly reduces the row count, often by orders of magnitude, but not necessarily: row count stays the same if all “grouping key†values were distinct. When you want to group data in the table by column numbers instead of column names, enable the setting [enable_positional_arguments](../../../operations/settings/settings.md#enable-positional-arguments). -:::note +:::note There’s an additional way to run aggregation over a table. If a query contains table columns only inside aggregate functions, the `GROUP BY clause` can be omitted, and aggregation by an empty set of keys is assumed. 
Such queries always return exactly one row. ::: @@ -57,8 +57,8 @@ The subtotals are calculated in the reverse order: at first subtotals are calcul In the subtotals rows the values of already "grouped" key expressions are set to `0` or empty line. -:::note -Mind that [HAVING](../../../sql-reference/statements/select/having) clause can affect the subtotals results. +:::note +Mind that [HAVING](../../../sql-reference/statements/select/having.md) clause can affect the subtotals results. ::: **Example** @@ -125,8 +125,8 @@ SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH ROLLUP; In the subtotals rows the values of all "grouped" key expressions are set to `0` or empty line. -:::note -Mind that [HAVING](../../../sql-reference/statements/select/having) clause can affect the subtotals results. +:::note +Mind that [HAVING](../../../sql-reference/statements/select/having.md) clause can affect the subtotals results. ::: **Example** @@ -226,11 +226,11 @@ This extra row is only produced in `JSON*`, `TabSeparated*`, and `Pretty*` forma - In `Template` format, the row is output according to specified template. - In the other formats it is not available. -:::note -totals is output in the results of `SELECT` queries, and is not output in `INSERT INTO ... SELECT`. +:::note +totals is output in the results of `SELECT` queries, and is not output in `INSERT INTO ... SELECT`. ::: -`WITH TOTALS` can be run in different ways when [HAVING](../../../sql-reference/statements/select/having) is present. The behavior depends on the `totals_mode` setting. +`WITH TOTALS` can be run in different ways when [HAVING](../../../sql-reference/statements/select/having.md) is present. The behavior depends on the `totals_mode` setting. ### Configuring Totals Processing diff --git a/docs/en/sql-reference/statements/select/index.md b/docs/en/sql-reference/statements/select/index.md index 5a8893f6f28..f65e40dede5 100644 --- a/docs/en/sql-reference/statements/select/index.md +++ b/docs/en/sql-reference/statements/select/index.md @@ -4,7 +4,7 @@ sidebar_position: 32 sidebar_label: SELECT --- -# SELECT Query +# SELECT Query `SELECT` queries perform data retrieval. By default, the requested data is returned to the client, while in conjunction with [INSERT INTO](../../../sql-reference/statements/insert-into.md) it can be forwarded to a different table. 
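A brief sketch of that forwarding (all table and column names are invented):

```sql
CREATE TABLE hits_daily (day Date, hits UInt64) ENGINE = MergeTree ORDER BY day;

-- The SELECT result is not returned to the client; it is written into hits_daily.
INSERT INTO hits_daily
SELECT toDate(event_time) AS day, count() AS hits
FROM hits
GROUP BY day;
```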
@@ -44,7 +44,7 @@ Specifics of each optional clause are covered in separate sections, which are li - [WHERE clause](../../../sql-reference/statements/select/where.md) - [GROUP BY clause](../../../sql-reference/statements/select/group-by.md) - [LIMIT BY clause](../../../sql-reference/statements/select/limit-by.md) -- [HAVING clause](../../../sql-reference/statements/select/having) +- [HAVING clause](../../../sql-reference/statements/select/having.md) - [LIMIT clause](../../../sql-reference/statements/select/limit.md) - [OFFSET clause](../../../sql-reference/statements/select/offset.md) - [UNION clause](../../../sql-reference/statements/select/union.md) diff --git a/docs/en/sql-reference/statements/select/join.md b/docs/en/sql-reference/statements/select/join.md index 62d3e9fd69a..49bd2672874 100644 --- a/docs/en/sql-reference/statements/select/join.md +++ b/docs/en/sql-reference/statements/select/join.md @@ -1,6 +1,6 @@ --- slug: /en/sql-reference/statements/select/join -sidebar_label: JOIN +sidebar_label: Joining Tables --- # JOIN Clause @@ -282,7 +282,7 @@ Each time a query is run with the same `JOIN`, the subquery is run again because In some cases, it is more efficient to use [IN](../../../sql-reference/operators/in.md) instead of `JOIN`. -If you need a `JOIN` for joining with dimension tables (these are relatively small tables that contain dimension properties, such as names for advertising campaigns), a `JOIN` might not be very convenient due to the fact that the right table is re-accessed for every query. For such cases, there is a “dictionaries†feature that you should use instead of `JOIN`. For more information, see the [Dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) section. +If you need a `JOIN` for joining with dimension tables (these are relatively small tables that contain dimension properties, such as names for advertising campaigns), a `JOIN` might not be very convenient due to the fact that the right table is re-accessed for every query. For such cases, there is a “dictionaries†feature that you should use instead of `JOIN`. For more information, see the [Dictionaries](../../../sql-reference/dictionaries/index.md) section. ### Memory Limitations diff --git a/docs/en/sql-reference/statements/show.md b/docs/en/sql-reference/statements/show.md index 18b019dd017..544c556d4b3 100644 --- a/docs/en/sql-reference/statements/show.md +++ b/docs/en/sql-reference/statements/show.md @@ -21,13 +21,13 @@ Note that if you use this statement to get `CREATE` query of system tables, you Prints a list of all databases. ```sql -SHOW DATABASES [LIKE | ILIKE | NOT LIKE ''] [LIMIT ] [INTO OUTFILE filename] [FORMAT format] +SHOW DATABASES [[NOT] LIKE | ILIKE ''] [LIMIT ] [INTO OUTFILE filename] [FORMAT format] ``` This statement is identical to the query: ```sql -SELECT name FROM system.databases [WHERE name LIKE | ILIKE | NOT LIKE ''] [LIMIT ] [INTO OUTFILE filename] [FORMAT format] +SELECT name FROM system.databases [WHERE name [NOT] LIKE | ILIKE ''] [LIMIT ] [INTO OUTFILE filename] [FORMAT format] ``` ### Examples @@ -117,7 +117,7 @@ $ watch -n1 "clickhouse-client --query='SHOW PROCESSLIST'" Displays a list of tables. ```sql -SHOW [TEMPORARY] TABLES [{FROM | IN} ] [LIKE | ILIKE | NOT LIKE ''] [LIMIT ] [INTO OUTFILE ] [FORMAT ] +SHOW [FULL] [TEMPORARY] TABLES [{FROM | IN} ] [[NOT] LIKE | ILIKE ''] [LIMIT ] [INTO OUTFILE ] [FORMAT ] ``` If the `FROM` clause is not specified, the query returns the list of tables from the current database. 
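For example, a quick sketch combining the optional clauses shown above (output depends on what the server contains):

```sql
SHOW TABLES FROM system NOT LIKE '%log%' LIMIT 5;
```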
@@ -125,7 +125,7 @@ If the `FROM` clause is not specified, the query returns the list of tables from This statement is identical to the query: ```sql -SELECT name FROM system.tables [WHERE name LIKE | ILIKE | NOT LIKE ''] [LIMIT ] [INTO OUTFILE ] [FORMAT ] +SELECT name FROM system.tables [WHERE name [NOT] LIKE | ILIKE ''] [LIMIT ] [INTO OUTFILE ] [FORMAT ] ``` ### Examples @@ -198,7 +198,7 @@ Result: ## SHOW DICTIONARIES -Displays a list of [Dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). +Displays a list of [Dictionaries](../../sql-reference/dictionaries/index.md). ``` sql SHOW DICTIONARIES [FROM ] [LIKE ''] [LIMIT ] [INTO OUTFILE ] [FORMAT ] @@ -293,7 +293,7 @@ SHOW CREATE [SETTINGS] PROFILE name1 [, name2 ...] ## SHOW USERS -Returns a list of [user account](../../operations/access-rights.md#user-account-management) names. To view user accounts parameters, see the system table [system.users](../../operations/system-tables/users.md#system_tables-users). +Returns a list of [user account](../../guides/sre/user-management/index.md#user-account-management) names. To view user accounts parameters, see the system table [system.users](../../operations/system-tables/users.md#system_tables-users). ### Syntax @@ -303,7 +303,7 @@ SHOW USERS ## SHOW ROLES -Returns a list of [roles](../../operations/access-rights.md#role-management). To view another parameters, see system tables [system.roles](../../operations/system-tables/roles.md#system_tables-roles) and [system.role_grants](../../operations/system-tables/role-grants.md#system_tables-role_grants). +Returns a list of [roles](../../guides/sre/user-management/index.md#role-management). To view another parameters, see system tables [system.roles](../../operations/system-tables/roles.md#system_tables-roles) and [system.role_grants](../../operations/system-tables/role-grants.md#system_tables-role_grants). ### Syntax @@ -312,7 +312,7 @@ SHOW [CURRENT|ENABLED] ROLES ``` ## SHOW PROFILES -Returns a list of [setting profiles](../../operations/access-rights.md#settings-profiles-management). To view user accounts parameters, see the system table [settings_profiles](../../operations/system-tables/settings_profiles.md#system_tables-settings_profiles). +Returns a list of [setting profiles](../../guides/sre/user-management/index.md#settings-profiles-management). To view user accounts parameters, see the system table [settings_profiles](../../operations/system-tables/settings_profiles.md#system_tables-settings_profiles). ### Syntax @@ -322,7 +322,7 @@ SHOW [SETTINGS] PROFILES ## SHOW POLICIES -Returns a list of [row policies](../../operations/access-rights.md#row-policy-management) for the specified table. To view user accounts parameters, see the system table [system.row_policies](../../operations/system-tables/row_policies.md#system_tables-row_policies). +Returns a list of [row policies](../../guides/sre/user-management/index.md#row-policy-management) for the specified table. To view user accounts parameters, see the system table [system.row_policies](../../operations/system-tables/row_policies.md#system_tables-row_policies). ### Syntax @@ -332,7 +332,7 @@ SHOW [ROW] POLICIES [ON [db.]table] ## SHOW QUOTAS -Returns a list of [quotas](../../operations/access-rights.md#quotas-management). To view quotas parameters, see the system table [system.quotas](../../operations/system-tables/quotas.md#system_tables-quotas). +Returns a list of [quotas](../../guides/sre/user-management/index.md#quotas-management). 
To view quotas parameters, see the system table [system.quotas](../../operations/system-tables/quotas.md#system_tables-quotas). ### Syntax @@ -351,7 +351,7 @@ SHOW [CURRENT] QUOTA ``` ## SHOW ACCESS -Shows all [users](../../operations/access-rights.md#user-account-management), [roles](../../operations/access-rights.md#role-management), [profiles](../../operations/access-rights.md#settings-profiles-management), etc. and all their [grants](../../sql-reference/statements/grant.md#grant-privileges). +Shows all [users](../../guides/sre/user-management/index.md#user-account-management), [roles](../../guides/sre/user-management/index.md#role-management), [profiles](../../guides/sre/user-management/index.md#settings-profiles-management), etc. and all their [grants](../../sql-reference/statements/grant.md#grant-privileges). ### Syntax @@ -370,7 +370,7 @@ Returns a list of clusters. All available clusters are listed in the [system.clu ``` sql SHOW CLUSTER '' -SHOW CLUSTERS [LIKE|NOT LIKE ''] [LIMIT ] +SHOW CLUSTERS [[NOT] LIKE|ILIKE ''] [LIMIT ] ``` ### Examples @@ -521,4 +521,4 @@ Outputs the content of the [system.table_engines](../../operations/system-tables **See Also** -- [system.table_engines](../../operations/system-tables/table_engines.md) table \ No newline at end of file +- [system.table_engines](../../operations/system-tables/table_engines.md) table diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index f9f55acfcec..d069ae8413a 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -8,7 +8,7 @@ sidebar_label: SYSTEM ## RELOAD EMBEDDED DICTIONARIES -Reload all [Internal dictionaries](../../sql-reference/dictionaries/internal-dicts.md). +Reload all [Internal dictionaries](../../sql-reference/dictionaries/index.md). By default, internal dictionaries are disabled. Always returns `Ok.` regardless of the result of the internal dictionary update. @@ -280,13 +280,13 @@ SYSTEM START REPLICATION QUEUES [[db.]replicated_merge_tree_family_table_name] ### SYNC REPLICA -Wait until a `ReplicatedMergeTree` table will be synced with other replicas in a cluster. Will run until `receive_timeout` if fetches currently disabled for the table. +Wait until a `ReplicatedMergeTree` table will be synced with other replicas in a cluster, but no more than `receive_timeout` seconds. ``` sql -SYSTEM SYNC REPLICA [ON CLUSTER cluster_name] [db.]replicated_merge_tree_family_table_name +SYSTEM SYNC REPLICA [ON CLUSTER cluster_name] [db.]replicated_merge_tree_family_table_name [STRICT] ``` -After running this statement the `[db.]replicated_merge_tree_family_table_name` fetches commands from the common replicated log into its own replication queue, and then the query waits till the replica processes all of the fetched commands. +After running this statement the `[db.]replicated_merge_tree_family_table_name` fetches commands from the common replicated log into its own replication queue, and then the query waits till the replica processes all of the fetched commands. If a `STRICT` modifier was specified then the query waits for the replication queue to become empty. The `STRICT` version may never succeed if new entries constantly appear in the replication queue. ### RESTART REPLICA @@ -369,7 +369,7 @@ SYSTEM DROP FILESYSTEM CACHE It's too heavy and has potential for misuse. ::: -Will do sync syscall. +Will do sync syscall. 
```sql SYSTEM SYNC FILE CACHE diff --git a/docs/en/sql-reference/table-functions/dictionary.md b/docs/en/sql-reference/table-functions/dictionary.md index 8a8cba8ff24..ab511843d63 100644 --- a/docs/en/sql-reference/table-functions/dictionary.md +++ b/docs/en/sql-reference/table-functions/dictionary.md @@ -5,7 +5,7 @@ sidebar_label: dictionary function title: dictionary --- -Displays the [dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) data as a ClickHouse table. Works the same way as [Dictionary](../../engines/table-engines/special/dictionary.md) engine. +Displays the [dictionary](../../sql-reference/dictionaries/index.md) data as a ClickHouse table. Works the same way as [Dictionary](../../engines/table-engines/special/dictionary.md) engine. **Syntax** diff --git a/docs/en/sql-reference/table-functions/executable.md b/docs/en/sql-reference/table-functions/executable.md index 635188763cf..22c74eb8cfa 100644 --- a/docs/en/sql-reference/table-functions/executable.md +++ b/docs/en/sql-reference/table-functions/executable.md @@ -85,7 +85,7 @@ The response looks like: ## Passing Query Results to a Script -Be sure to check out the example in the `Executable` table engine on [how to pass query results to a script](../../engines/table-engines/special/executable#passing-query-results-to-a-script). Here is how you execute the same script in that example using the `executable` table function: +Be sure to check out the example in the `Executable` table engine on [how to pass query results to a script](../../engines/table-engines/special/executable.md#passing-query-results-to-a-script). Here is how you execute the same script in that example using the `executable` table function: ```sql SELECT * FROM executable( diff --git a/docs/en/sql-reference/table-functions/iceberg.md b/docs/en/sql-reference/table-functions/iceberg.md index fda4d274005..7c2648d3dcf 100644 --- a/docs/en/sql-reference/table-functions/iceberg.md +++ b/docs/en/sql-reference/table-functions/iceberg.md @@ -32,7 +32,13 @@ A table with the specified structure for reading data in the specified Iceberg t SELECT * FROM iceberg('http://test.s3.amazonaws.com/clickhouse-bucket/test_table', 'test', 'test') ``` -Using named collections: +:::warning +ClickHouse currently supports reading v1 (v2 support is coming soon!) of the Iceberg format via the `iceberg` table function and `Iceberg` table engine. 
+::: + +## Defining a named collection + +Here is an example of configuring a named collection for storing the URL and credentials: ```xml diff --git a/docs/en/sql-reference/table-functions/mongodb.md b/docs/en/sql-reference/table-functions/mongodb.md index dd063ae1796..706ab68fee4 100644 --- a/docs/en/sql-reference/table-functions/mongodb.md +++ b/docs/en/sql-reference/table-functions/mongodb.md @@ -70,5 +70,5 @@ SELECT * FROM mongodb( **See Also** -- [The `MongoDB` table engine](../../engines/table-engines/integrations/mongodb.md) -- [Using MongoDB as a dictionary source](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources/#mongodb) +- [The `MongoDB` table engine](/docs/en/engines/table-engines/integrations/mongodb.md) +- [Using MongoDB as a dictionary source](/docs/en/sql-reference/dictionaries/index.md#mongodb) diff --git a/docs/en/sql-reference/table-functions/mysql.md b/docs/en/sql-reference/table-functions/mysql.md index b995319c645..64ddcd86f7f 100644 --- a/docs/en/sql-reference/table-functions/mysql.md +++ b/docs/en/sql-reference/table-functions/mysql.md @@ -56,7 +56,7 @@ SELECT name FROM mysql(`mysql1:3306|mysql2:3306|mysql3:3306`, 'mysql_database', A table object with the same columns as the original MySQL table. -:::note +:::note In the `INSERT` query to distinguish table function `mysql(...)` from table name with column names list, you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. ::: @@ -110,4 +110,4 @@ SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123'); **See Also** - [The ‘MySQL’ table engine](../../engines/table-engines/integrations/mysql.md) -- [Using MySQL as a dictionary source](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-mysql) +- [Using MySQL as a dictionary source](../../sql-reference/dictionaries/index.md#dictionary-sources#dicts-external_dicts_dict_sources-mysql) diff --git a/docs/en/sql-reference/table-functions/odbc.md b/docs/en/sql-reference/table-functions/odbc.md index 7e13424bc8a..397a9ba6c89 100644 --- a/docs/en/sql-reference/table-functions/odbc.md +++ b/docs/en/sql-reference/table-functions/odbc.md @@ -101,5 +101,5 @@ SELECT * FROM odbc('DSN=mysqlconn', 'test', 'test') ## See Also -- [ODBC dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-odbc) +- [ODBC dictionaries](../../sql-reference/dictionaries/index.md#dictionary-sources#dicts-external_dicts_dict_sources-odbc) - [ODBC table engine](../../engines/table-engines/integrations/odbc.md). diff --git a/docs/en/sql-reference/table-functions/postgresql.md b/docs/en/sql-reference/table-functions/postgresql.md index 87fc6ecb234..6cd13acaa77 100644 --- a/docs/en/sql-reference/table-functions/postgresql.md +++ b/docs/en/sql-reference/table-functions/postgresql.md @@ -27,7 +27,7 @@ postgresql('host:port', 'database', 'table', 'user', 'password'[, `schema`]) A table object with the same columns as the original PostgreSQL table. -:::note +:::note In the `INSERT` query to distinguish table function `postgresql(...)` from table name with column names list you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. ::: @@ -43,7 +43,7 @@ All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` samp PostgreSQL Array types converts into ClickHouse arrays. 
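As a quick illustration of that mapping (the connection parameters, table and column names below are assumptions, not taken from the docs), an `integer[]` column read through the `postgresql` table function arrives as a ClickHouse `Array`:

```sql
-- Assumes a PostgreSQL table created as: CREATE TABLE points (id integer, coords integer[]);
SELECT id, coords
FROM postgresql('localhost:5432', 'test', 'points', 'postgres_user', 'password');

-- DESCRIBE the same call to see the inferred ClickHouse types;
-- coords is reported as an Array of the mapped element type (Nullable unless declared NOT NULL).
DESCRIBE postgresql('localhost:5432', 'test', 'points', 'postgres_user', 'password');
```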
-:::note +:::note Be careful, in PostgreSQL an array data type column like Integer[] may contain arrays of different dimensions in different rows, but in ClickHouse it is only allowed to have multidimensional arrays of the same dimension in all rows. ::: @@ -130,7 +130,7 @@ CREATE TABLE pg_table_schema_with_dots (a UInt32) **See Also** - [The PostgreSQL table engine](../../engines/table-engines/integrations/postgresql.md) -- [Using PostgreSQL as a dictionary source](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) +- [Using PostgreSQL as a dictionary source](../../sql-reference/dictionaries/index.md#dictionary-sources#dicts-external_dicts_dict_sources-postgresql) ## Related content - Blog: [ClickHouse and PostgreSQL - a match made in data heaven - part 1](https://clickhouse.com/blog/migrating-data-between-clickhouse-postgres) diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index d7199717798..99b7832394d 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -43,35 +43,41 @@ A table with the specified structure for reading or writing data in the specifie **Examples** -Selecting the first two rows from the table from S3 file `https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/data.csv`: +Selecting the first 5 rows from the table from S3 file `https://datasets-documentation.s3.eu-west-3.amazonaws.com/aapl_stock.csv`: ``` sql SELECT * -FROM s3('https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/data.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') -LIMIT 2; +FROM s3( + 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/aapl_stock.csv', + 'CSVWithNames' +) +LIMIT 5; ``` -``` text -┌─column1─┬─column2─┬─column3─┠-│ 1 │ 2 │ 3 │ -│ 3 │ 2 │ 1 │ -└─────────┴─────────┴─────────┘ +```response +┌───────Date─┬────Open─┬────High─┬─────Low─┬───Close─┬───Volume─┬─OpenInt─┠+│ 1984-09-07 │ 0.42388 │ 0.42902 │ 0.41874 │ 0.42388 │ 23220030 │ 0 │ +│ 1984-09-10 │ 0.42388 │ 0.42516 │ 0.41366 │ 0.42134 │ 18022532 │ 0 │ +│ 1984-09-11 │ 0.42516 │ 0.43668 │ 0.42516 │ 0.42902 │ 42498199 │ 0 │ +│ 1984-09-12 │ 0.42902 │ 0.43157 │ 0.41618 │ 0.41618 │ 37125801 │ 0 │ +│ 1984-09-13 │ 0.43927 │ 0.44052 │ 0.43927 │ 0.43927 │ 57822062 │ 0 │ +└────────────┴─────────┴─────────┴─────────┴─────────┴──────────┴─────────┘ ``` -The similar but from file with `gzip` compression: +:::note +ClickHouse uses filename extensions to determine the format of the data. For example, we could have run the previous command without the `CSVWithNames`: ``` sql SELECT * -FROM s3('https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/data.csv.gz', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32', 'gzip') -LIMIT 2; +FROM s3( + 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/aapl_stock.csv' +) +LIMIT 5; ``` -``` text -┌─column1─┬─column2─┬─column3─┠-│ 1 │ 2 │ 3 │ -│ 3 │ 2 │ 1 │ -└─────────┴─────────┴─────────┘ -``` +ClickHouse also can determine the compression of the file. For example, if the file was zipped up with a `.csv.gz` extension, ClickHouse would decompress the file automatically. +::: + ## Usage @@ -179,6 +185,7 @@ INSERT INTO TABLE FUNCTION ``` As a result, the data is written into three files in different buckets: `my_bucket_1/file.csv`, `my_bucket_10/file.csv`, and `my_bucket_20/file.csv`. 
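For reference, a partitioned write of that shape could be sketched as follows — the bucket URL, column names and values are illustrative assumptions; the parts that matter are the `{_partition_id}` placeholder in the path and the `PARTITION BY` clause:

```sql
INSERT INTO TABLE FUNCTION
    s3('http://bucket.amazonaws.com/my_bucket_{_partition_id}/file.csv', 'CSV', 'a UInt32, b UInt32, c UInt32')
    PARTITION BY a
VALUES (1, 2, 3), (10, 11, 12), (20, 21, 22);
```

Each distinct value of the partitioning expression (`1`, `10`, `20` here) is substituted into `{_partition_id}`, which is what produces the three separate objects listed above.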
+ **See Also** - [S3 engine](../../engines/table-engines/integrations/s3.md) diff --git a/docs/ru/engines/database-engines/materialized-mysql.md b/docs/ru/engines/database-engines/materialized-mysql.md index c214e08dce1..df56b7a0bd6 100644 --- a/docs/ru/engines/database-engines/materialized-mysql.md +++ b/docs/ru/engines/database-engines/materialized-mysql.md @@ -97,7 +97,7 @@ CREATE DATABASE mysql ENGINE = MaterializedMySQL('localhost:3306', 'db', 'user', ### DDL-запроÑÑ‹ {#ddl-queries} -DDL-запроÑÑ‹ в MySQL конвертируютÑÑ Ð² ÑоответÑтвующие DDL-запроÑÑ‹ в ClickHouse ([ALTER](../../sql-reference/statements/alter/index.md), [CREATE](../../sql-reference/statements/create/index.md), [DROP](../../sql-reference/statements/drop), [RENAME](../../sql-reference/statements/rename.md)). ЕÑли ClickHouse не может конвертировать какой-либо DDL-запроÑ, он его игнорирует. +DDL-запроÑÑ‹ в MySQL конвертируютÑÑ Ð² ÑоответÑтвующие DDL-запроÑÑ‹ в ClickHouse ([ALTER](../../sql-reference/statements/alter/index.md), [CREATE](../../sql-reference/statements/create/index.md), [DROP](../../sql-reference/statements/drop.md), [RENAME](../../sql-reference/statements/rename.md)). ЕÑли ClickHouse не может конвертировать какой-либо DDL-запроÑ, он его игнорирует. ### Ð ÐµÐ¿Ð»Ð¸ÐºÐ°Ñ†Ð¸Ñ Ð´Ð°Ð½Ð½Ñ‹Ñ… {#data-replication} diff --git a/docs/ru/engines/table-engines/mergetree-family/mergetree.md b/docs/ru/engines/table-engines/mergetree-family/mergetree.md index 24e0f8dbbb8..ef17a370dc6 100644 --- a/docs/ru/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/ru/engines/table-engines/mergetree-family/mergetree.md @@ -89,7 +89,7 @@ ORDER BY expr - `min_merge_bytes_to_use_direct_io` — минимальный объём данных при ÑлиÑнии, необходимый Ð´Ð»Ñ Ð¿Ñ€Ñмого (небуферизованного) чтениÑ/запиÑи (direct I/O) на диÑк. При ÑлиÑнии чаÑтей данных ClickHouse вычиÑлÑет общий объём Ñ…Ñ€Ð°Ð½ÐµÐ½Ð¸Ñ Ð²Ñех данных, подлежащих ÑлиÑнию. ЕÑли общий объём Ñ…Ñ€Ð°Ð½ÐµÐ½Ð¸Ñ Ð²Ñех данных Ð´Ð»Ñ Ñ‡Ñ‚ÐµÐ½Ð¸Ñ Ð¿Ñ€ÐµÐ²Ñ‹ÑˆÐ°ÐµÑ‚ `min_bytes_to_use_direct_io` байт, тогда ClickHouse иÑпользует флаг `O_DIRECT` при чтении данных Ñ Ð´Ð¸Ñка. ЕÑли `min_merge_bytes_to_use_direct_io = 0`, тогда прÑмой ввод-вывод отключен. Значение по умолчанию: `10 * 1024 * 1024 * 1024` байтов. - `merge_with_ttl_timeout` — минимальное Ð²Ñ€ÐµÐ¼Ñ Ð² Ñекундах перед повторным ÑлиÑнием Ð´Ð»Ñ ÑƒÐ´Ð°Ð»ÐµÐ½Ð¸Ñ Ð´Ð°Ð½Ð½Ñ‹Ñ… Ñ Ð¸Ñтекшим TTL. По умолчанию: `14400` Ñекунд (4 чаÑа). - `merge_with_recompression_ttl_timeout` — минимальное Ð²Ñ€ÐµÐ¼Ñ Ð² Ñекундах перед повторным ÑлиÑнием Ð´Ð»Ñ Ð¿Ð¾Ð²Ñ‚Ð¾Ñ€Ð½Ð¾Ð³Ð¾ ÑÐ¶Ð°Ñ‚Ð¸Ñ Ð´Ð°Ð½Ð½Ñ‹Ñ… Ñ Ð¸Ñтекшим TTL. По умолчанию: `14400` Ñекунд (4 чаÑа). - - `try_fetch_recompressed_part_timeout` — Ð²Ñ€ÐµÐ¼Ñ Ð¾Ð¶Ð¸Ð´Ð°Ð½Ð¸Ñ (в Ñекундах) перед началом ÑлиÑÐ½Ð¸Ñ Ñ Ð¿Ð¾Ð²Ñ‚Ð¾Ñ€Ð½Ñ‹Ð¼ Ñжатием. Ð’ течение Ñтого времени ClickHouse пытаетÑÑ Ð¸Ð·Ð²Ð»ÐµÑ‡ÑŒ Ñжатую чаÑÑ‚ÑŒ из реплики, ÐºÐ¾Ñ‚Ð¾Ñ€Ð°Ñ Ð½Ð°Ð·Ð½Ð°Ñ‡Ð¸Ð»Ð° Ñто ÑлиÑние. Значение по умолчанию: `7200` Ñекунд (2 чаÑа). + - `try_fetch_recompressed_part_timeout` — Ð²Ñ€ÐµÐ¼Ñ Ð¾Ð¶Ð¸Ð´Ð°Ð½Ð¸Ñ (в Ñекундах) перед началом ÑлиÑÐ½Ð¸Ñ Ñ Ð¿Ð¾Ð²Ñ‚Ð¾Ñ€Ð½Ñ‹Ð¼ Ñжатием. Ð’ течение Ñтого времени ClickHouse пытаетÑÑ Ð¸Ð·Ð²Ð»ÐµÑ‡ÑŒ Ñжатую чаÑÑ‚ÑŒ из реплики, ÐºÐ¾Ñ‚Ð¾Ñ€Ð°Ñ Ð½Ð°Ð·Ð½Ð°Ñ‡Ð¸Ð»Ð° Ñто ÑлиÑние. Значение по умолчанию: `7200` Ñекунд (2 чаÑа). - `write_final_mark` — включает или отключает запиÑÑŒ поÑледней заÑечки индекÑа в конце куÑка данных, указывающей за поÑледний байт. По умолчанию — 1. Ðе отключайте её. - `merge_max_block_size` — макÑимальное количеÑтво Ñтрок в блоке Ð´Ð»Ñ Ð¾Ð¿ÐµÑ€Ð°Ñ†Ð¸Ð¹ ÑлиÑниÑ. 
Значение по умолчанию: 8192. - `storage_policy` — политика Ñ…Ñ€Ð°Ð½ÐµÐ½Ð¸Ñ Ð´Ð°Ð½Ð½Ñ‹Ñ…. Смотрите [Хранение данных таблицы на неÑкольких блочных уÑтройÑтвах](#table_engine-mergetree-multiple-volumes). @@ -337,7 +337,7 @@ SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 Поддерживаемые типы данных: `Int*`, `UInt*`, `Float*`, `Enum`, `Date`, `DateTime`, `String`, `FixedString`. - Фильтром могут пользоватьÑÑ Ñ„ÑƒÐ½ÐºÑ†Ð¸Ð¸: [equals](../../../sql-reference/functions/comparison-functions.md), [notEquals](../../../sql-reference/functions/comparison-functions.md), [in](../../../sql-reference/functions/in-functions), [notIn](../../../sql-reference/functions/in-functions), [has](../../../sql-reference/functions/array-functions#hasarr-elem), [hasAny](../../../sql-reference/functions/array-functions#hasany), [hasAll](../../../sql-reference/functions/array-functions#hasall). + Фильтром могут пользоватьÑÑ Ñ„ÑƒÐ½ÐºÑ†Ð¸Ð¸: [equals](../../../sql-reference/functions/comparison-functions.md), [notEquals](../../../sql-reference/functions/comparison-functions.md), [in](../../../sql-reference/functions/in-functions.md), [notIn](../../../sql-reference/functions/in-functions.md), [has](../../../sql-reference/functions/array-functions.md#hasarr-elem), [hasAny](../../../sql-reference/functions/array-functions.md#hasany), [hasAll](../../../sql-reference/functions/array-functions.md#hasall). **Примеры** @@ -361,14 +361,14 @@ INDEX b (u64 * length(str), i32 + f64 * 100, date, str) TYPE set(100) GRANULARIT | [startsWith](../../../sql-reference/functions/string-functions.md#startswith) | ✔ | ✔ | ✔ | ✔ | ✗ | | [endsWith](../../../sql-reference/functions/string-functions.md#endswith) | ✗ | ✗ | ✔ | ✔ | ✗ | | [multiSearchAny](../../../sql-reference/functions/string-search-functions.md#function-multisearchany) | ✗ | ✗ | ✔ | ✗ | ✗ | -| [in](../../../sql-reference/functions/in-functions#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | -| [notIn](../../../sql-reference/functions/in-functions#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [in](../../../sql-reference/functions/in-functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [notIn](../../../sql-reference/functions/in-functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | | [less (\<)](../../../sql-reference/functions/comparison-functions.md#function-less) | ✔ | ✔ | ✗ | ✗ | ✗ | | [greater (\>)](../../../sql-reference/functions/comparison-functions.md#function-greater) | ✔ | ✔ | ✗ | ✗ | ✗ | | [lessOrEquals (\<=)](../../../sql-reference/functions/comparison-functions.md#function-lessorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | | [greaterOrEquals (\>=)](../../../sql-reference/functions/comparison-functions.md#function-greaterorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | -| [empty](../../../sql-reference/functions/array-functions#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ | -| [notEmpty](../../../sql-reference/functions/array-functions#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [empty](../../../sql-reference/functions/array-functions.md#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [notEmpty](../../../sql-reference/functions/array-functions.md#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ | | hasToken | ✗ | ✗ | ✗ | ✔ | ✗ | Функции Ñ Ð¿Ð¾ÑтоÑнным агрументом, который меньше, чем размер ngram не могут иÑпользовать Ð¸Ð½Ð´ÐµÐºÑ `ngrambf_v1` Ð´Ð»Ñ Ð¾Ð¿Ñ‚Ð¸Ð¼Ð¸Ð·Ð°Ñ†Ð¸Ð¸ запроÑа. 
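To make the support table above concrete, here is a minimal sketch (table name, column names and index parameters are assumptions) of a token bloom-filter skip index together with a query that can use it through `hasToken`:

```sql
CREATE TABLE logs
(
    ts DateTime,
    message String,
    -- tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)
    INDEX message_tokens message TYPE tokenbf_v1(32768, 3, 0) GRANULARITY 4
)
ENGINE = MergeTree
ORDER BY ts;

-- hasToken has a ✔ for the token bloom-filter index in the table above,
-- so this filter can skip granules that cannot contain the token.
SELECT count() FROM logs WHERE hasToken(message, 'error');
```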
@@ -396,7 +396,7 @@ INDEX b (u64 * length(str), i32 + f64 * 100, date, str) TYPE set(100) GRANULARIT Проекции не поддерживаютÑÑ Ð´Ð»Ñ Ð·Ð°Ð¿Ñ€Ð¾Ñов `SELECT` Ñ Ð¼Ð¾Ð´Ð¸Ñ„Ð¸ÐºÐ°Ñ‚Ð¾Ñ€Ð¾Ð¼ [FINAL](../../../sql-reference/statements/select/from.md#select-from-final). ### Ð—Ð°Ð¿Ñ€Ð¾Ñ Ð¿Ñ€Ð¾ÐµÐºÑ†Ð¸Ð¸ {#projection-query} -Ð—Ð°Ð¿Ñ€Ð¾Ñ Ð¿Ñ€Ð¾ÐµÐºÑ†Ð¸Ð¸ — Ñто то, что определÑет проекцию. Такой Ð·Ð°Ð¿Ñ€Ð¾Ñ Ð½ÐµÑвно выбирает данные из родительÑкой таблицы. +Ð—Ð°Ð¿Ñ€Ð¾Ñ Ð¿Ñ€Ð¾ÐµÐºÑ†Ð¸Ð¸ — Ñто то, что определÑет проекцию. Такой Ð·Ð°Ð¿Ñ€Ð¾Ñ Ð½ÐµÑвно выбирает данные из родительÑкой таблицы. **СинтакÑиÑ** ```sql @@ -406,9 +406,9 @@ SELECT [GROUP BY] [ORDER BY] Проекции можно изменить или удалить Ñ Ð¿Ð¾Ð¼Ð¾Ñ‰ÑŒÑŽ запроÑа [ALTER](../../../sql-reference/statements/alter/projection.md). ### Хранение проекции {#projection-storage} -Проекции хранÑÑ‚ÑÑ Ð² каталоге куÑка данных. Это похоже на хранение индекÑов, но иÑпользуетÑÑ Ð¿Ð¾Ð´ÐºÐ°Ñ‚Ð°Ð»Ð¾Ð³, в котором хранитÑÑ Ð°Ð½Ð¾Ð½Ð¸Ð¼Ð½Ñ‹Ð¹ куÑок таблицы `MergeTree`. Таблица ÑоздаетÑÑ Ð·Ð°Ð¿Ñ€Ð¾Ñом Ð¾Ð¿Ñ€ÐµÐ´ÐµÐ»ÐµÐ½Ð¸Ñ Ð¿Ñ€Ð¾ÐµÐºÑ†Ð¸Ð¸. -ЕÑли приÑутÑтвует ÑÐµÐºÑ†Ð¸Ñ `GROUP BY`, то иÑпользуетÑÑ Ð´Ð²Ð¸Ð¶Ð¾Ðº [AggregatingMergeTree](aggregatingmergetree.md), а вÑе агрегатные функции преобразуютÑÑ Ð² `AggregateFunction`. -ЕÑли приÑутÑтвует ÑÐµÐºÑ†Ð¸Ñ `ORDER BY`, таблица `MergeTree` иÑпользует ее в качеÑтве Ð²Ñ‹Ñ€Ð°Ð¶ÐµÐ½Ð¸Ñ Ð´Ð»Ñ Ð¿ÐµÑ€Ð²Ð¸Ñ‡Ð½Ð¾Ð³Ð¾ ключа. +Проекции хранÑÑ‚ÑÑ Ð² каталоге куÑка данных. Это похоже на хранение индекÑов, но иÑпользуетÑÑ Ð¿Ð¾Ð´ÐºÐ°Ñ‚Ð°Ð»Ð¾Ð³, в котором хранитÑÑ Ð°Ð½Ð¾Ð½Ð¸Ð¼Ð½Ñ‹Ð¹ куÑок таблицы `MergeTree`. Таблица ÑоздаетÑÑ Ð·Ð°Ð¿Ñ€Ð¾Ñом Ð¾Ð¿Ñ€ÐµÐ´ÐµÐ»ÐµÐ½Ð¸Ñ Ð¿Ñ€Ð¾ÐµÐºÑ†Ð¸Ð¸. +ЕÑли приÑутÑтвует ÑÐµÐºÑ†Ð¸Ñ `GROUP BY`, то иÑпользуетÑÑ Ð´Ð²Ð¸Ð¶Ð¾Ðº [AggregatingMergeTree](aggregatingmergetree.md), а вÑе агрегатные функции преобразуютÑÑ Ð² `AggregateFunction`. +ЕÑли приÑутÑтвует ÑÐµÐºÑ†Ð¸Ñ `ORDER BY`, таблица `MergeTree` иÑпользует ее в качеÑтве Ð²Ñ‹Ñ€Ð°Ð¶ÐµÐ½Ð¸Ñ Ð´Ð»Ñ Ð¿ÐµÑ€Ð²Ð¸Ñ‡Ð½Ð¾Ð³Ð¾ ключа. Во Ð²Ñ€ÐµÐ¼Ñ Ð¿Ñ€Ð¾Ñ†ÐµÑÑа ÑлиÑÐ½Ð¸Ñ ÐºÑƒÑок данных проекции объединÑетÑÑ Ñ Ð¿Ð¾Ð¼Ð¾Ñ‰ÑŒÑŽ процедуры ÑлиÑÐ½Ð¸Ñ Ñ…Ñ€Ð°Ð½Ð¸Ð»Ð¸Ñ‰Ð°. ÐšÐ¾Ð½Ñ‚Ñ€Ð¾Ð»ÑŒÐ½Ð°Ñ Ñумма куÑка данных родительÑкой таблицы включает куÑок данных проекции. Другие процедуры аналогичны индекÑам пропуÑка данных. ### Ðнализ запроÑов {#projection-query-analysis} @@ -499,7 +499,7 @@ TTL expr За каждым `TTL` выражением может Ñледовать тип дейÑтвиÑ, которое выполнÑетÑÑ Ð¿Ð¾Ñле доÑÑ‚Ð¸Ð¶ÐµÐ½Ð¸Ñ Ð²Ñ€ÐµÐ¼ÐµÐ½Ð¸, ÑоответÑтвующего результату `TTL` выражениÑ: - `DELETE` - удалить данные (дейÑтвие по умолчанию); -- `RECOMPRESS codec_name` - повторно Ñжать данные Ñ Ð¿Ð¾Ð¼Ð¾Ñ‰ÑŒÑŽ кодека `codec_name`; +- `RECOMPRESS codec_name` - повторно Ñжать данные Ñ Ð¿Ð¾Ð¼Ð¾Ñ‰ÑŒÑŽ кодека `codec_name`; - `TO DISK 'aaa'` - перемеÑтить данные на диÑк `aaa`; - `TO VOLUME 'bbb'` - перемеÑтить данные на том `bbb`; - `GROUP BY` - агрегировать данные. @@ -679,7 +679,7 @@ TTL d + INTERVAL 1 MONTH GROUP BY k1, k2 SET x = max(x), y = min(y); - `policy_name_N` — название политики. ÐÐ°Ð·Ð²Ð°Ð½Ð¸Ñ Ð¿Ð¾Ð»Ð¸Ñ‚Ð¸Ðº должны быть уникальны. - `volume_name_N` — название тома. ÐÐ°Ð·Ð²Ð°Ð½Ð¸Ñ Ñ‚Ð¾Ð¼Ð¾Ð² должны быть уникальны. - `disk` — диÑк, находÑщийÑÑ Ð²Ð½ÑƒÑ‚Ñ€Ð¸ тома. -- `max_data_part_size_bytes` — макÑимальный размер куÑка данных, который может находитьÑÑ Ð½Ð° любом из диÑков Ñтого тома. ЕÑли в результате ÑлиÑÐ½Ð¸Ñ Ñ€Ð°Ð·Ð¼ÐµÑ€ куÑка ожидаетÑÑ Ð±Ð¾Ð»ÑŒÑˆÐµ, чем max_data_part_size_bytes, то Ñтот куÑок будет запиÑан в Ñледующий том. 
Ð’ оÑновном Ñта Ñ„ÑƒÐ½ÐºÑ†Ð¸Ñ Ð¿Ð¾Ð·Ð²Ð¾Ð»Ñет хранить новые / мелкие куÑки на горÑчем (SSD) томе и перемещать их на холодный (HDD) том, когда они доÑтигают большого размера. Ðе иÑпользуйте Ñтот параметр, еÑли политика имеет только один том. +- `max_data_part_size_bytes` — макÑимальный размер куÑка данных, который может находитьÑÑ Ð½Ð° любом из диÑков Ñтого тома. ЕÑли в результате ÑлиÑÐ½Ð¸Ñ Ñ€Ð°Ð·Ð¼ÐµÑ€ куÑка ожидаетÑÑ Ð±Ð¾Ð»ÑŒÑˆÐµ, чем max_data_part_size_bytes, то Ñтот куÑок будет запиÑан в Ñледующий том. Ð’ оÑновном Ñта Ñ„ÑƒÐ½ÐºÑ†Ð¸Ñ Ð¿Ð¾Ð·Ð²Ð¾Ð»Ñет хранить новые / мелкие куÑки на горÑчем (SSD) томе и перемещать их на холодный (HDD) том, когда они доÑтигают большого размера. Ðе иÑпользуйте Ñтот параметр, еÑли политика имеет только один том. - `move_factor` — Ð´Ð¾Ð»Ñ Ð´Ð¾Ñтупного Ñвободного меÑта на томе, еÑли меÑта ÑтановитÑÑ Ð¼ÐµÐ½ÑŒÑˆÐµ, то данные начнут перемещение на Ñледующий том, еÑли он еÑÑ‚ÑŒ (по умолчанию 0.1). Ð”Ð»Ñ Ð¿ÐµÑ€ÐµÐ¼ÐµÑ‰ÐµÐ½Ð¸Ñ ÐºÑƒÑки ÑортируютÑÑ Ð¿Ð¾ размеру от большего к меньшему (по убыванию) и выбираютÑÑ ÐºÑƒÑки, Ñовокупный размер которых доÑтаточен Ð´Ð»Ñ ÑÐ¾Ð±Ð»ÑŽÐ´ÐµÐ½Ð¸Ñ ÑƒÑÐ»Ð¾Ð²Ð¸Ñ `move_factor`, еÑли Ñовокупный размер вÑех партов недоÑтаточен, будут перемещены вÑе парты. - `prefer_not_to_merge` — Отключает ÑлиÑние куÑков данных, хранÑщихÑÑ Ð½Ð° данном томе. ЕÑли Ð´Ð°Ð½Ð½Ð°Ñ Ð½Ð°Ñтройка включена, то ÑлиÑние данных, хранÑщихÑÑ Ð½Ð° данном томе, не допуÑкаетÑÑ. Это позволÑет контролировать работу ClickHouse Ñ Ð¼ÐµÐ´Ð»ÐµÐ½Ð½Ñ‹Ð¼Ð¸ диÑками. diff --git a/docs/ru/engines/table-engines/special/buffer.md b/docs/ru/engines/table-engines/special/buffer.md index 4987dafc11f..574d9273088 100644 --- a/docs/ru/engines/table-engines/special/buffer.md +++ b/docs/ru/engines/table-engines/special/buffer.md @@ -66,4 +66,4 @@ CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10 Таблицы типа Buffer иÑпользуютÑÑ Ð² тех ÑлучаÑÑ…, когда от большого количеÑтва Ñерверов поÑтупает Ñлишком много INSERT-ов в единицу времени, и нет возможноÑти заранее ÑамоÑтоÑтельно буферизовать данные перед вÑтавкой, в результате чего, INSERT-Ñ‹ не уÑпевают выполнÑÑ‚ÑŒÑÑ. -Заметим, что даже Ð´Ð»Ñ Ñ‚Ð°Ð±Ð»Ð¸Ñ† типа Buffer не имеет ÑмыÑла вÑтавлÑÑ‚ÑŒ данные по одной Ñтроке, так как таким образом будет доÑтигнута ÑкороÑÑ‚ÑŒ вÑего лишь в неÑколько Ñ‚Ñ‹ÑÑч Ñтрок в Ñекунду, тогда как при вÑтавке более крупными блоками, доÑтижимо более миллиона Ñтрок в Ñекунду (Ñмотрите раздел [«ПроизводительноÑть»](../../../introduction/performance/). +Заметим, что даже Ð´Ð»Ñ Ñ‚Ð°Ð±Ð»Ð¸Ñ† типа Buffer не имеет ÑмыÑла вÑтавлÑÑ‚ÑŒ данные по одной Ñтроке, так как таким образом будет доÑтигнута ÑкороÑÑ‚ÑŒ вÑего лишь в неÑколько Ñ‚Ñ‹ÑÑч Ñтрок в Ñекунду, тогда как при вÑтавке более крупными блоками, доÑтижимо более миллиона Ñтрок в Ñекунду (Ñмотрите раздел [«ПроизводительноÑть»](../../../introduction/performance.md). diff --git a/docs/ru/faq/operations/multi-region-replication.md b/docs/ru/faq/operations/multi-region-replication.md index bfe3231c247..eb53a69e7f6 100644 --- a/docs/ru/faq/operations/multi-region-replication.md +++ b/docs/ru/faq/operations/multi-region-replication.md @@ -10,4 +10,4 @@ The short answer is "yes". However, we recommend keeping latency between all reg Configuration-wise there's no difference compared to single-region replication, simply use hosts that are located in different locations for replicas. -For more information, see [full article on data replication](../../engines/table-engines/mergetree-family/replication/). 
+For more information, see [full article on data replication](../../engines/table-engines/mergetree-family/replication.md). diff --git a/docs/ru/getting-started/tutorial.md b/docs/ru/getting-started/tutorial.md index 803da2952fd..60a7463f70f 100644 --- a/docs/ru/getting-started/tutorial.md +++ b/docs/ru/getting-started/tutorial.md @@ -477,7 +477,7 @@ clickhouse-client --query "INSERT INTO tutorial.hits_v1 FORMAT TSV" --max_insert clickhouse-client --query "INSERT INTO tutorial.visits_v1 FORMAT TSV" --max_insert_block_size=100000 < visits_v1.tsv ``` -ClickHouse has a lot of [settings to tune](../operations/settings/) and one way to specify them in console client is via arguments, as we can see with `--max_insert_block_size`. The easiest way to figure out what settings are available, what do they mean and what the defaults are is to query the `system.settings` table: +ClickHouse has a lot of [settings to tune](../operations/settings/index.md) and one way to specify them in console client is via arguments, as we can see with `--max_insert_block_size`. The easiest way to figure out what settings are available, what do they mean and what the defaults are is to query the `system.settings` table: ``` sql SELECT name, value, changed, description diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 59c77d082cf..bef5c223281 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -974,7 +974,7 @@ Array предÑтавлены как длина в формате varint (unsig Ñтолбцы из входных данных будут ÑопоÑтавлены Ñо Ñтолбцами таблицы по их именам, Ñтолбцы Ñ Ð½ÐµÐ¸Ð·Ð²ÐµÑтными именами будут пропущены, еÑли включен параметр [input_format_skip_unknown_fields](../operations/settings/settings.md#input_format_skip_unknown_fields). Ð’ противном Ñлучае Ð¿ÐµÑ€Ð²Ð°Ñ Ñтрока будет пропущена. ::: - + ## RowBinaryWithNamesAndTypes {#rowbinarywithnamesandtypes} То же Ñамое что [RowBinary](#rowbinary), но добавлÑетÑÑ Ð·Ð°Ð³Ð¾Ð»Ð¾Ð²Ð¾Ðº: @@ -1326,7 +1326,7 @@ ClickHouse поддерживает наÑтраиваемую точноÑÑ‚ÑŒ Ðеподдерживаемые типы данных Parquet: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. -Типы данных Ñтолбцов в ClickHouse могут отличатьÑÑ Ð¾Ñ‚ типов данных ÑоответÑтвующих полей файла в формате Parquet. При вÑтавке данных ClickHouse интерпретирует типы данных в ÑоответÑтвии Ñ Ñ‚Ð°Ð±Ð»Ð¸Ñ†ÐµÐ¹ выше, а затем [приводит](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) данные к тому типу, который уÑтановлен Ð´Ð»Ñ Ñтолбца таблицы. +Типы данных Ñтолбцов в ClickHouse могут отличатьÑÑ Ð¾Ñ‚ типов данных ÑоответÑтвующих полей файла в формате Parquet. При вÑтавке данных ClickHouse интерпретирует типы данных в ÑоответÑтвии Ñ Ñ‚Ð°Ð±Ð»Ð¸Ñ†ÐµÐ¹ выше, а затем [приводит](../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) данные к тому типу, который уÑтановлен Ð´Ð»Ñ Ñтолбца таблицы. ### Ð’Ñтавка и выборка данных {#inserting-and-selecting-data} @@ -1386,7 +1386,7 @@ ClickHouse поддерживает наÑтраиваемую точноÑÑ‚ÑŒ Ðеподдерживаемые типы данных Arrow: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. -Типы данных Ñтолбцов в ClickHouse могут отличатьÑÑ Ð¾Ñ‚ типов данных ÑоответÑтвующих полей файла в формате Arrow. При вÑтавке данных ClickHouse интерпретирует типы данных в ÑоответÑтвии Ñ Ñ‚Ð°Ð±Ð»Ð¸Ñ†ÐµÐ¹ выше, а затем [приводит](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) данные к тому типу, который уÑтановлен Ð´Ð»Ñ Ñтолбца таблицы. 
+Типы данных Ñтолбцов в ClickHouse могут отличатьÑÑ Ð¾Ñ‚ типов данных ÑоответÑтвующих полей файла в формате Arrow. При вÑтавке данных ClickHouse интерпретирует типы данных в ÑоответÑтвии Ñ Ñ‚Ð°Ð±Ð»Ð¸Ñ†ÐµÐ¹ выше, а затем [приводит](../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) данные к тому типу, который уÑтановлен Ð´Ð»Ñ Ñтолбца таблицы. ### Ð’Ñтавка данных {#inserting-data-arrow} @@ -1444,7 +1444,7 @@ ClickHouse поддерживает наÑтраиваемую точноÑÑ‚ÑŒ Ðеподдерживаемые типы данных ORC: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. -Типы данных Ñтолбцов в таблицах ClickHouse могут отличатьÑÑ Ð¾Ñ‚ типов данных Ð´Ð»Ñ ÑоответÑтвующих полей ORC. При вÑтавке данных ClickHouse интерпретирует типы данных ORC ÑоглаÑно таблице ÑоответÑтвиÑ, а затем [приводит](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) данные к типу, уÑтановленному Ð´Ð»Ñ Ñтолбца таблицы ClickHouse. +Типы данных Ñтолбцов в таблицах ClickHouse могут отличатьÑÑ Ð¾Ñ‚ типов данных Ð´Ð»Ñ ÑоответÑтвующих полей ORC. При вÑтавке данных ClickHouse интерпретирует типы данных ORC ÑоглаÑно таблице ÑоответÑтвиÑ, а затем [приводит](../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) данные к типу, уÑтановленному Ð´Ð»Ñ Ñтолбца таблицы ClickHouse. ### Ð’Ñтавка данных {#inserting-data-2} diff --git a/docs/ru/interfaces/http.md b/docs/ru/interfaces/http.md index 62e97e3f61d..b8c5ee77f0c 100644 --- a/docs/ru/interfaces/http.md +++ b/docs/ru/interfaces/http.md @@ -243,7 +243,7 @@ $ echo 'SELECT 1' | curl -H 'X-ClickHouse-User: user' -H 'X-ClickHouse-Key: pass ЕÑли пользователь не задан,то иÑпользуетÑÑ `default`. ЕÑли пароль не задан, то иÑпользуетÑÑ Ð¿ÑƒÑтой пароль. Также в параметрах URL вы можете указать любые наÑтройки, которые будут иÑпользованы Ð´Ð»Ñ Ð¾Ð±Ñ€Ð°Ð±Ð¾Ñ‚ÐºÐ¸ одного запроÑа, или целые профили наÑтроек. Пример:http://localhost:8123/?profile=web&max_rows_to_read=1000000000&query=SELECT+1 -Подробнее Ñмотрите в разделе [ÐаÑтройки](../operations/settings/). +Подробнее Ñмотрите в разделе [ÐаÑтройки](../operations/settings/index.md). ``` bash $ echo 'SELECT number FROM system.numbers LIMIT 10' | curl 'http://localhost:8123/?' --data-binary @- diff --git a/docs/ru/interfaces/third-party/gui.md b/docs/ru/interfaces/third-party/gui.md index e7190362dc4..34d2f0e371a 100644 --- a/docs/ru/interfaces/third-party/gui.md +++ b/docs/ru/interfaces/third-party/gui.md @@ -187,7 +187,7 @@ sidebar_label: "Визуальные интерфейÑÑ‹ от Ñторонни DataLens [доÑтупен беÑплатно](https://cloud.yandex.ru/docs/datalens/pricing), в том чиÑле и Ð´Ð»Ñ ÐºÐ¾Ð¼Ð¼ÐµÑ€Ñ‡ÐµÑкого иÑпользованиÑ. -- [ЗнакомÑтво Ñ DataLens]((https://youtu.be/57ngi_6BINE). +- [ЗнакомÑтво Ñ DataLens](https://youtu.be/57ngi_6BINE). - [Чат ÑообщеÑтва DataLens](https://t.me/YandexDataLens) - [Ð”Ð¾ÐºÑƒÐ¼ÐµÐ½Ñ‚Ð°Ñ†Ð¸Ñ DataLens](https://cloud.yandex.ru/docs/datalens/). - [Сценарий по визуализации данных из ClickHouse](https://cloud.yandex.ru/docs/solutions/datalens/data-from-ch-visualization). 
diff --git a/docs/ru/operations/optimizing-performance/sampling-query-profiler.md b/docs/ru/operations/optimizing-performance/sampling-query-profiler.md index c77f6a1f290..3d5ec993fdf 100644 --- a/docs/ru/operations/optimizing-performance/sampling-query-profiler.md +++ b/docs/ru/operations/optimizing-performance/sampling-query-profiler.md @@ -30,7 +30,7 @@ To analyze the `trace_log` system table: - Use the `addressToLine`, `addressToSymbol` and `demangle` [introspection functions](../../sql-reference/functions/introspection.md) to get function names and their positions in ClickHouse code. To get a profile for some query, you need to aggregate data from the `trace_log` table. You can aggregate data by individual functions or by the whole stack traces. -If you need to visualize `trace_log` info, try [flamegraph](../../interfaces/third-party/gui/#clickhouse-flamegraph) and [speedscope](https://github.com/laplab/clickhouse-speedscope). +If you need to visualize `trace_log` info, try [flamegraph](../../interfaces/third-party/gui.md#clickhouse-flamegraph) and [speedscope](https://github.com/laplab/clickhouse-speedscope). ## Example {#example} diff --git a/docs/ru/operations/server-configuration-parameters/settings.md b/docs/ru/operations/server-configuration-parameters/settings.md index e29b9def9d4..4b1d8ce717f 100644 --- a/docs/ru/operations/server-configuration-parameters/settings.md +++ b/docs/ru/operations/server-configuration-parameters/settings.md @@ -47,7 +47,7 @@ ClickHouse перезагружает вÑтроенные Ñловари Ñ Ð· - `min_part_size` - Минимальный размер чаÑти таблицы. - `min_part_size_ratio` - Отношение размера минимальной чаÑти таблицы к полному размеру таблицы. - `method` - Метод ÑжатиÑ. Возможные значениÑ: `lz4`, `lz4hc`, `zstd`,`deflate_qpl`. -- `level` – Уровень ÑжатиÑ. См. [Кодеки](../../sql-reference/statements/create/table/#create-query-common-purpose-codecs). +- `level` – Уровень ÑжатиÑ. См. [Кодеки](../../sql-reference/statements/create/table.md#create-query-common-purpose-codecs). Можно Ñконфигурировать неÑколько разделов ``. @@ -152,7 +152,7 @@ ClickHouse проверÑет уÑÐ»Ð¾Ð²Ð¸Ñ Ð´Ð»Ñ `min_part_size` и `min_part ## custom_settings_prefixes {#custom_settings_prefixes} -СпиÑок префикÑов Ð´Ð»Ñ [пользовательÑких наÑтроек](../../operations/settings/#custom_settings). ПрефикÑÑ‹ должны перечиÑлÑÑ‚ÑŒÑÑ Ñ‡ÐµÑ€ÐµÐ· запÑтую. +СпиÑок префикÑов Ð´Ð»Ñ [пользовательÑких наÑтроек](../../operations/settings/index.md#custom_settings). ПрефикÑÑ‹ должны перечиÑлÑÑ‚ÑŒÑÑ Ñ‡ÐµÑ€ÐµÐ· запÑтую. **Пример** @@ -162,7 +162,7 @@ ClickHouse проверÑет уÑÐ»Ð¾Ð²Ð¸Ñ Ð´Ð»Ñ `min_part_size` и `min_part **См. также** -- [ПользовательÑкие наÑтройки](../../operations/settings#custom_settings) +- [ПользовательÑкие наÑтройки](../../operations/settings/index.md#custom_settings) ## core_dump {#server_configuration_parameters-core_dump} diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 4025966ac21..d77f7ba6a46 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -4084,3 +4084,32 @@ ALTER TABLE test FREEZE SETTINGS alter_partition_verbose_result = 1; Задает Ñимвол, который интерпретируетÑÑ ÐºÐ°Ðº ÑÑƒÑ„Ñ„Ð¸ÐºÑ Ð¿Ð¾Ñле результирующего набора данных формата [CustomSeparated](../../interfaces/formats.md#format-customseparated). Значение по умолчанию: `''`. 
+ +## stop_reading_on_first_cancel {#stop_reading_on_first_cancel} +ЕÑли уÑтановлено значение `true` и пользователь хочет прервать Ð·Ð°Ð¿Ñ€Ð¾Ñ (например, Ñ Ð¿Ð¾Ð¼Ð¾Ñ‰ÑŒÑŽ `Ctrl+C` на клиенте), то Ð·Ð°Ð¿Ñ€Ð¾Ñ Ð¿Ñ€Ð¾Ð´Ð¾Ð»Ð¶Ð°ÐµÑ‚ выполнение только Ð´Ð»Ñ Ð´Ð°Ð½Ð½Ñ‹Ñ…, которые уже были Ñчитаны из таблицы. ПоÑле Ñтого он вернет чаÑтичный результат запроÑа Ð´Ð»Ñ Ñ‚Ð¾Ð¹ чаÑти таблицы, ÐºÐ¾Ñ‚Ð¾Ñ€Ð°Ñ Ð±Ñ‹Ð»Ð° прочитана. Чтобы полноÑтью оÑтановить выполнение запроÑа без чаÑтичного результата, пользователь должен отправить 2 запроÑа отмены. + +**Пример Ñ Ð²Ñ‹ÐºÐ»ÑŽÑ‡ÐµÐ½Ð½Ð¾Ð¹ наÑтройкой при нажатии Ctrl+C** +```sql +SELECT sum(number) FROM numbers(10000000000) + +Cancelling query. +Ok. +Query was cancelled. + +0 rows in set. Elapsed: 1.334 sec. Processed 52.65 million rows, 421.23 MB (39.48 million rows/s., 315.85 MB/s.) +``` + +**Пример Ñ Ð²ÐºÐ»ÑŽÑ‡ÐµÐ½Ð½Ð¾Ð¹ наÑтройкой при нажатии Ctrl+C** +```sql +SELECT sum(number) FROM numbers(10000000000) SETTINGS stop_reading_on_first_cancel=true + +┌──────sum(number)─┠+│ 1355411451286266 │ +└──────────────────┘ + +1 row in set. Elapsed: 1.331 sec. Processed 52.13 million rows, 417.05 MB (39.17 million rows/s., 313.33 MB/s.) +``` + +Возможные значениÑ:: `true`, `false` + +Значение по умолчанию: `false` diff --git a/docs/ru/operations/system-tables/information_schema.md b/docs/ru/operations/system-tables/information_schema.md index 6a9b8134dad..691fec19039 100644 --- a/docs/ru/operations/system-tables/information_schema.md +++ b/docs/ru/operations/system-tables/information_schema.md @@ -178,7 +178,7 @@ table_type: BASE TABLE - `view_definition` ([String](../../sql-reference/data-types/string.md)) — `SELECT` Ð·Ð°Ð¿Ñ€Ð¾Ñ Ð´Ð»Ñ Ð¿Ñ€ÐµÐ´ÑтавлениÑ. - `check_option` ([String](../../sql-reference/data-types/string.md)) — `NONE`, нет проверки. - `is_updatable` ([Enum8](../../sql-reference/data-types/enum.md)) — `NO`, предÑтавление не обновлÑетÑÑ. -- `is_insertable_into` ([Enum8](../../sql-reference/data-types/enum.md)) — показывает ÑвлÑетÑÑ Ð»Ð¸ предÑтавление [материализованным](../../sql-reference/statements/create/view/#materialized). Возможные значениÑ: +- `is_insertable_into` ([Enum8](../../sql-reference/data-types/enum.md)) — показывает ÑвлÑетÑÑ Ð»Ð¸ предÑтавление [материализованным](../../sql-reference/statements/create/view.md#materialized). Возможные значениÑ: - `NO` — Ñоздано обычное предÑтавление. - `YES` — Ñоздано материализованное предÑтавление. - `is_trigger_updatable` ([Enum8](../../sql-reference/data-types/enum.md)) — `NO`, триггер не обновлÑетÑÑ. diff --git a/docs/ru/operations/system-tables/replicated_fetches.md b/docs/ru/operations/system-tables/replicated_fetches.md index 0b91a02cf14..c13f058aae1 100644 --- a/docs/ru/operations/system-tables/replicated_fetches.md +++ b/docs/ru/operations/system-tables/replicated_fetches.md @@ -68,4 +68,4 @@ thread_id: 54 **Смотрите также** -- [Управление таблицами ReplicatedMergeTree](../../sql-reference/statements/system/#query-language-system-replicated) +- [Управление таблицами ReplicatedMergeTree](../../sql-reference/statements/system.md#query-language-system-replicated) diff --git a/docs/ru/operations/utilities/clickhouse-benchmark.md b/docs/ru/operations/utilities/clickhouse-benchmark.md index d3185f4fcb0..73de78d1c15 100644 --- a/docs/ru/operations/utilities/clickhouse-benchmark.md +++ b/docs/ru/operations/utilities/clickhouse-benchmark.md @@ -60,7 +60,7 @@ clickhouse-benchmark [keys] < queries_file; - `--stage=WORD` — ÑÑ‚Ð°Ð´Ð¸Ñ Ð¾Ð±Ñ€Ð°Ð±Ð¾Ñ‚ÐºÐ¸ запроÑа на Ñервере. 
ClickHouse оÑтанавливает обработку запроÑа и возвращает ответ `clickhouse-benchmark` на заданной Ñтадии. Возможные значениÑ: `complete`, `fetch_columns`, `with_mergeable_state`. Значение по умолчанию: `complete`. - `--help` — показывает Ñправку. -ЕÑли нужно применить [наÑтройки](../../operations/settings/) Ð´Ð»Ñ Ð·Ð°Ð¿Ñ€Ð¾Ñов, их можно передать как ключ `--= SETTING_VALUE`. Ðапример, `--max_memory_usage=1048576`. +ЕÑли нужно применить [наÑтройки](../../operations/settings/index.md) Ð´Ð»Ñ Ð·Ð°Ð¿Ñ€Ð¾Ñов, их можно передать как ключ `--= SETTING_VALUE`. Ðапример, `--max_memory_usage=1048576`. ## Вывод {#clickhouse-benchmark-output} diff --git a/docs/ru/sql-reference/data-types/datetime.md b/docs/ru/sql-reference/data-types/datetime.md index b513c51397e..e8d4a3ee9fd 100644 --- a/docs/ru/sql-reference/data-types/datetime.md +++ b/docs/ru/sql-reference/data-types/datetime.md @@ -27,9 +27,9 @@ DateTime([timezone]) КонÑольный клиент ClickHouse по умолчанию иÑпользует чаÑовой поÑÑ Ñервера, еÑли Ð´Ð»Ñ Ð·Ð½Ð°Ñ‡ÐµÐ½Ð¸Ñ `DateTime` чаÑовой поÑÑ Ð½Ðµ был задан в Ñвном виде при инициализации типа данных. Чтобы иÑпользовать чаÑовой поÑÑ ÐºÐ»Ð¸ÐµÐ½Ñ‚Ð°, запуÑтите [clickhouse-client](../../interfaces/cli.md) Ñ Ð¿Ð°Ñ€Ð°Ð¼ÐµÑ‚Ñ€Ð¾Ð¼ `--use_client_time_zone`. -ClickHouse отображает Ð·Ð½Ð°Ñ‡ÐµÐ½Ð¸Ñ Ð² завиÑимоÑти от Ð·Ð½Ð°Ñ‡ÐµÐ½Ð¸Ñ Ð¿Ð°Ñ€Ð°Ð¼ÐµÑ‚Ñ€Ð° [date\_time\_output\_format](../../operations/settings/#settings-date_time_output_format). ТекÑтовый формат по умолчанию `YYYY-MM-DD hh:mm:ss`. Кроме того, вы можете поменÑÑ‚ÑŒ отображение Ñ Ð¿Ð¾Ð¼Ð¾Ñ‰ÑŒÑŽ функции [formatDateTime](../../sql-reference/functions/date-time-functions.md#formatdatetime). +ClickHouse отображает Ð·Ð½Ð°Ñ‡ÐµÐ½Ð¸Ñ Ð² завиÑимоÑти от Ð·Ð½Ð°Ñ‡ÐµÐ½Ð¸Ñ Ð¿Ð°Ñ€Ð°Ð¼ÐµÑ‚Ñ€Ð° [date\_time\_output\_format](../../operations/settings/index.md#settings-date_time_output_format). ТекÑтовый формат по умолчанию `YYYY-MM-DD hh:mm:ss`. Кроме того, вы можете поменÑÑ‚ÑŒ отображение Ñ Ð¿Ð¾Ð¼Ð¾Ñ‰ÑŒÑŽ функции [formatDateTime](../../sql-reference/functions/date-time-functions.md#formatdatetime). -При вÑтавке данных в ClickHouse, можно иÑпользовать различные форматы даты и времени в завиÑимоÑти от Ð·Ð½Ð°Ñ‡ÐµÐ½Ð¸Ñ Ð½Ð°Ñтройки [date_time_input_format](../../operations/settings/#settings-date_time_input_format). +При вÑтавке данных в ClickHouse, можно иÑпользовать различные форматы даты и времени в завиÑимоÑти от Ð·Ð½Ð°Ñ‡ÐµÐ½Ð¸Ñ Ð½Ð°Ñтройки [date_time_input_format](../../operations/settings/index.md#settings-date_time_input_format). 
## Примеры {#primery} @@ -119,8 +119,8 @@ FROM dt - [Функции Ð¿Ñ€ÐµÐ¾Ð±Ñ€Ð°Ð·Ð¾Ð²Ð°Ð½Ð¸Ñ Ñ‚Ð¸Ð¿Ð¾Ð²](../../sql-reference/functions/type-conversion-functions.md) - [Функции Ð´Ð»Ñ Ñ€Ð°Ð±Ð¾Ñ‚Ñ‹ Ñ Ð´Ð°Ñ‚Ð¾Ð¹ и временем](../../sql-reference/functions/date-time-functions.md) - [Функции Ð´Ð»Ñ Ñ€Ð°Ð±Ð¾Ñ‚Ñ‹ Ñ Ð¼Ð°ÑÑивами](../../sql-reference/functions/array-functions.md) -- [ÐаÑтройка `date_time_input_format`](../../operations/settings/#settings-date_time_input_format) -- [ÐаÑтройка `date_time_output_format`](../../operations/settings/) +- [ÐаÑтройка `date_time_input_format`](../../operations/settings/index.md#settings-date_time_input_format) +- [ÐаÑтройка `date_time_output_format`](../../operations/settings/index.md) - [Конфигурационный параметр Ñервера `timezone`](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) - [Операторы Ð´Ð»Ñ Ñ€Ð°Ð±Ð¾Ñ‚Ñ‹ Ñ Ð´Ð°Ñ‚Ð¾Ð¹ и временем](../../sql-reference/operators/index.md#operators-datetime) - [Тип данных `Date`](date.md) diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index 8fbcaf9568b..a7e8a478edb 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -268,7 +268,7 @@ SELECT toUnixTimestamp('2017-11-05 08:07:47', 'Asia/Tokyo') AS unix_timestamp; ``` :::note -Тип возвращаемого Ð·Ð½Ð°Ñ‡ÐµÐ½Ð¸Ñ Ð¾Ð¿Ð¸Ñанными далее функциÑми `toStartOf*`, `toLastDayOfMonth`, `toMonday`, `timeSlot` определÑетÑÑ ÐºÐ¾Ð½Ñ„Ð¸Ð³ÑƒÑ€Ð°Ñ†Ð¸Ð¾Ð½Ð½Ñ‹Ð¼ параметром [enable_extended_results_for_datetime_functions](../../operations/settings/settings#enable-extended-results-for-datetime-functions) имеющим по умолчанию значение `0`. +Тип возвращаемого Ð·Ð½Ð°Ñ‡ÐµÐ½Ð¸Ñ Ð¾Ð¿Ð¸Ñанными далее функциÑми `toStartOf*`, `toLastDayOfMonth`, `toMonday`, `timeSlot` определÑетÑÑ ÐºÐ¾Ð½Ñ„Ð¸Ð³ÑƒÑ€Ð°Ñ†Ð¸Ð¾Ð½Ð½Ñ‹Ð¼ параметром [enable_extended_results_for_datetime_functions](../../operations/settings/settings.md#enable-extended-results-for-datetime-functions) имеющим по умолчанию значение `0`. Поведение Ð´Ð»Ñ * `enable_extended_results_for_datetime_functions = 0`: Функции `toStartOf*`, `toLastDayOfMonth`, `toMonday` возвращают `Date` или `DateTime`. Функции `toStartOfDay`, `toStartOfHour`, `toStartOfFifteenMinutes`, `toStartOfTenMinutes`, `toStartOfFiveMinutes`, `toStartOfMinute`, `timeSlot` возвращают `DateTime`. Ð¥Ð¾Ñ‚Ñ Ñти функции могут принимать Ð·Ð½Ð°Ñ‡ÐµÐ½Ð¸Ñ Ñ‚Ð¸Ð¿Ð° `Date32` или `DateTime64` в качеÑтве аргумента, при обработке аргумента вне нормального диапазона значений (`1970` - `2148` Ð´Ð»Ñ `Date` и `1970-01-01 00:00:00`-`2106-02-07 08:28:15` Ð´Ð»Ñ `DateTime`) будет получен некорректный результат. diff --git a/docs/ru/sql-reference/functions/other-functions.md b/docs/ru/sql-reference/functions/other-functions.md index f457b54ae28..de54f1b3607 100644 --- a/docs/ru/sql-reference/functions/other-functions.md +++ b/docs/ru/sql-reference/functions/other-functions.md @@ -2136,7 +2136,7 @@ countDigits(x) :::note "Примечание" Ð”Ð»Ñ `Decimal` значений учитываетÑÑ Ð¸Ñ… маÑштаб: вычиÑлÑетÑÑ Ñ€ÐµÐ·ÑƒÐ»ÑŒÑ‚Ð°Ñ‚ по базовому целочиÑленному типу, полученному как `(value * scale)`. Ðапример: `countDigits(42) = 2`, `countDigits(42.000) = 5`, `countDigits(0.04200) = 4`. То еÑÑ‚ÑŒ вы можете проверить деÑÑтичное переполнение Ð´Ð»Ñ `Decimal64` Ñ Ð¿Ð¾Ð¼Ð¾Ñ‰ÑŒÑŽ `countDecimal(x) > 18`. Это медленный вариант [isDecimalOverflow](#is-decimal-overflow). 
::: - + **Пример** ЗапроÑ: @@ -2297,7 +2297,7 @@ enabledRoles() ## defaultRoles {#default-roles} -Возвращает имена ролей, которые задаютÑÑ Ð¿Ð¾ умолчанию Ð´Ð»Ñ Ñ‚ÐµÐºÑƒÑ‰ÐµÐ³Ð¾ Ð¿Ð¾Ð»ÑŒÐ·Ð¾Ð²Ð°Ñ‚ÐµÐ»Ñ Ð¿Ñ€Ð¸ входе в ÑиÑтему. Изначально Ñто вÑе роли, которые разрешено иÑпользовать текущему пользователю (Ñм. [GRANT](../../sql-reference/statements/grant/#grant-select)). СпиÑок ролей по умолчанию может быть изменен Ñ Ð¿Ð¾Ð¼Ð¾Ñ‰ÑŒÑŽ Ð²Ñ‹Ñ€Ð°Ð¶ÐµÐ½Ð¸Ñ [SET DEFAULT ROLE](../../sql-reference/statements/set-role.md#set-default-role-statement). +Возвращает имена ролей, которые задаютÑÑ Ð¿Ð¾ умолчанию Ð´Ð»Ñ Ñ‚ÐµÐºÑƒÑ‰ÐµÐ³Ð¾ Ð¿Ð¾Ð»ÑŒÐ·Ð¾Ð²Ð°Ñ‚ÐµÐ»Ñ Ð¿Ñ€Ð¸ входе в ÑиÑтему. Изначально Ñто вÑе роли, которые разрешено иÑпользовать текущему пользователю (Ñм. [GRANT](../../sql-reference/statements/grant.md#grant-select)). СпиÑок ролей по умолчанию может быть изменен Ñ Ð¿Ð¾Ð¼Ð¾Ñ‰ÑŒÑŽ Ð²Ñ‹Ñ€Ð°Ð¶ÐµÐ½Ð¸Ñ [SET DEFAULT ROLE](../../sql-reference/statements/set-role.md#set-default-role-statement). **СинтакÑиÑ** diff --git a/docs/ru/sql-reference/statements/select/array-join.md b/docs/ru/sql-reference/statements/select/array-join.md index 9d2dbf54a2b..6c7fcbba7cc 100644 --- a/docs/ru/sql-reference/statements/select/array-join.md +++ b/docs/ru/sql-reference/statements/select/array-join.md @@ -146,7 +146,7 @@ ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS ma └───────┴─────────┴───┴─────┴────────┘ ``` -Ð’ приведенном ниже примере иÑпользуетÑÑ Ñ„ÑƒÐ½ÐºÑ†Ð¸Ñ [arrayEnumerate](../../../sql-reference/functions/array-functions#array_functions-arrayenumerate): +Ð’ приведенном ниже примере иÑпользуетÑÑ Ñ„ÑƒÐ½ÐºÑ†Ð¸Ñ [arrayEnumerate](../../../sql-reference/functions/array-functions.md#array_functions-arrayenumerate): ``` sql SELECT s, arr, a, num, arrayEnumerate(arr) @@ -259,7 +259,7 @@ ARRAY JOIN nest AS n; └───────┴─────┴─────┴─────────┴────────────┘ ``` -Пример иÑÐ¿Ð¾Ð»ÑŒÐ·Ð¾Ð²Ð°Ð½Ð¸Ñ Ñ„ÑƒÐ½ÐºÑ†Ð¸Ð¸ [arrayEnumerate](../../../sql-reference/functions/array-functions#array_functions-arrayenumerate): +Пример иÑÐ¿Ð¾Ð»ÑŒÐ·Ð¾Ð²Ð°Ð½Ð¸Ñ Ñ„ÑƒÐ½ÐºÑ†Ð¸Ð¸ [arrayEnumerate](../../../sql-reference/functions/array-functions.md#array_functions-arrayenumerate): ``` sql SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y`, num diff --git a/docs/ru/sql-reference/statements/system.md b/docs/ru/sql-reference/statements/system.md index a7dec7abe27..847f60ca35c 100644 --- a/docs/ru/sql-reference/statements/system.md +++ b/docs/ru/sql-reference/statements/system.md @@ -269,13 +269,13 @@ SYSTEM START REPLICATION QUEUES [[db.]replicated_merge_tree_family_table_name] ### SYNC REPLICA {#query_language-system-sync-replica} -Ждет когда таблица ÑемейÑтва `ReplicatedMergeTree` будет Ñинхронизирована Ñ Ð´Ñ€ÑƒÐ³Ð¸Ð¼Ð¸ репликами в клаÑтере, будет работать до доÑÑ‚Ð¸Ð¶ÐµÐ½Ð¸Ñ `receive_timeout`, еÑли ÑÐ¸Ð½Ñ…Ñ€Ð¾Ð½Ð¸Ð·Ð°Ñ†Ð¸Ñ Ð´Ð»Ñ Ñ‚Ð°Ð±Ð»Ð¸Ñ†Ñ‹ отключена в наÑтоÑщий момент времени: +Ждет когда таблица ÑемейÑтва `ReplicatedMergeTree` будет Ñинхронизирована Ñ Ð´Ñ€ÑƒÐ³Ð¸Ð¼Ð¸ репликами в клаÑтере, но не более `receive_timeout` Ñекунд: ``` sql -SYSTEM SYNC REPLICA [db.]replicated_merge_tree_family_table_name +SYSTEM SYNC REPLICA [db.]replicated_merge_tree_family_table_name [STRICT] ``` -ПоÑле Ð²Ñ‹Ð¿Ð¾Ð»Ð½ÐµÐ½Ð¸Ñ Ñтого запроÑа таблица `[db.]replicated_merge_tree_family_table_name` Ñинхронизирует команды из общего реплицированного лога в Ñвою ÑобÑтвенную очередь репликации. Затем Ð·Ð°Ð¿Ñ€Ð¾Ñ Ð¶Ð´ÐµÑ‚, пока реплика не обработает вÑе Ñинхронизированные команды. 
+ПоÑле Ð²Ñ‹Ð¿Ð¾Ð»Ð½ÐµÐ½Ð¸Ñ Ñтого запроÑа таблица `[db.]replicated_merge_tree_family_table_name` загружает команды из общего реплицированного лога в Ñвою ÑобÑтвенную очередь репликации. Затем Ð·Ð°Ð¿Ñ€Ð¾Ñ Ð¶Ð´ÐµÑ‚, пока реплика не обработает вÑе загруженные команды. ЕÑли указан модификатор `STRICT`, то Ð·Ð°Ð¿Ñ€Ð¾Ñ Ð¶Ð´Ñ‘Ñ‚ когда очередь репликации Ñтанет пуÑтой. Строгий вариант запроÑа может никогда не завершитьÑÑ ÑƒÑпешно, еÑли в очереди репликации поÑтоÑнно поÑвлÑÑŽÑ‚ÑÑ Ð½Ð¾Ð²Ñ‹Ðµ запиÑи. ### RESTART REPLICA {#query_language-system-restart-replica} diff --git a/docs/zh/development/continuous-integration.md b/docs/zh/development/continuous-integration.md index a52d77a7a33..56e3e1dfd50 100644 --- a/docs/zh/development/continuous-integration.md +++ b/docs/zh/development/continuous-integration.md @@ -34,7 +34,7 @@ git push ## æè¿°ä¿¡æ¯æ£€æŸ¥ {#description-check} 检查pull请求的æ述是å¦ç¬¦åˆ[PULL_REQUEST_TEMPLATE.md](https://github.com/ClickHouse/ClickHouse/blob/master/.github/PULL_REQUEST_TEMPLATE.md)模æ¿. -您必须为您的更改指定一个更改日志类别(例如,Bugä¿®å¤), 并且为[CHANGELOG.md](../whats-new/changelog/)编写一æ¡ç”¨æˆ·å¯è¯»çš„消æ¯ç”¨æ¥æ述更改. +您必须为您的更改指定一个更改日志类别(例如,Bugä¿®å¤), 并且为[CHANGELOG.md](../whats-new/changelog/index.md)编写一æ¡ç”¨æˆ·å¯è¯»çš„消æ¯ç”¨æ¥æ述更改. ## 推é€åˆ°DockerHub {#push-to-dockerhub} 生æˆç”¨äºŽæž„建和测试的docker映åƒ, 然åŽå°†å®ƒä»¬æŽ¨é€åˆ°DockerHub. diff --git a/docs/zh/engines/database-engines/index.md b/docs/zh/engines/database-engines/index.md index 0b24590686e..2839f819671 100644 --- a/docs/zh/engines/database-engines/index.md +++ b/docs/zh/engines/database-engines/index.md @@ -16,7 +16,7 @@ sidebar_position: 27 - [MaterializeMySQL](../../engines/database-engines/materialized-mysql.md) -- [Lazy](../../engines/database-engines/lazy) +- [Lazy](../../engines/database-engines/lazy.md) - [Atomic](../../engines/database-engines/atomic.md) diff --git a/docs/zh/engines/database-engines/materialize-mysql.md b/docs/zh/engines/database-engines/materialize-mysql.md index 10049017c71..5d1394f9456 100644 --- a/docs/zh/engines/database-engines/materialize-mysql.md +++ b/docs/zh/engines/database-engines/materialize-mysql.md @@ -38,8 +38,8 @@ ENGINE = MaterializeMySQL('host:port', ['database' | database], 'user', 'passwor - `max_wait_time_when_mysql_unavailable` — 当MySQLä¸å¯ç”¨æ—¶é‡è¯•é—´éš”(毫秒)。负值ç¦æ­¢é‡è¯•ã€‚默认值: `1000`. - `allows_query_when_mysql_lost` — 当mysql丢失时,å…许查询物化表。默认值: `0` (`false`). 
``` -CREATE DATABASE mysql ENGINE = MaterializeMySQL('localhost:3306', 'db', 'user', '***') - SETTINGS +CREATE DATABASE mysql ENGINE = MaterializeMySQL('localhost:3306', 'db', 'user', '***') + SETTINGS allows_query_when_mysql_lost=true, max_wait_time_when_mysql_unavailable=10000; ``` @@ -97,7 +97,7 @@ CREATE DATABASE mysql ENGINE = MaterializeMySQL('localhost:3306', 'db', 'user', ### DDL查询 {#ddl-queries} -MySQL DDL查询转æ¢ä¸ºç›¸åº”çš„ClickHouse DDL查询([ALTER](../../sql-reference/statements/alter/index.md), [CREATE](../../sql-reference/statements/create/index.md), [DROP](../../sql-reference/statements/drop), [RENAME](../../sql-reference/statements/rename.md))。如果ClickHouse无法解æžæŸä¸ªDDL查询,则该查询将被忽略。 +MySQL DDL查询转æ¢ä¸ºç›¸åº”çš„ClickHouse DDL查询([ALTER](../../sql-reference/statements/alter/index.md), [CREATE](../../sql-reference/statements/create.md), [DROP](../../sql-reference/statements/drop.md), [RENAME](../../sql-reference/statements/rename.md))。如果ClickHouse无法解æžæŸä¸ªDDL查询,则该查询将被忽略。 ### Data Replication {#data-replication} @@ -148,9 +148,9 @@ mysql> SELECT * FROM test; ``` ```text -+---+------+------+ ++---+------+------+ | a | b | c | -+---+------+------+ ++---+------+------+ | 2 | 222 | Wow! | +---+------+------+ ``` @@ -177,9 +177,9 @@ SELECT * FROM mysql.test; ``` ``` text -┌─a─┬──b─┠-│ 1 │ 11 │ -│ 2 │ 22 │ +┌─a─┬──b─┠+│ 1 │ 11 │ +│ 2 │ 22 │ └───┴────┘ ``` @@ -190,7 +190,7 @@ SELECT * FROM mysql.test; ``` ``` text -┌─a─┬───b─┬─c────┠-│ 2 │ 222 │ Wow! │ +┌─a─┬───b─┬─c────┠+│ 2 │ 222 │ Wow! │ └───┴─────┴──────┘ ``` diff --git a/docs/zh/engines/database-engines/materialized-mysql.md b/docs/zh/engines/database-engines/materialized-mysql.md index c34d3a6f20d..5c735556c48 100644 --- a/docs/zh/engines/database-engines/materialized-mysql.md +++ b/docs/zh/engines/database-engines/materialized-mysql.md @@ -109,7 +109,7 @@ MySQL中的Time 类型,会被ClickHouse转æ¢æˆå¾®ç§’æ¥å­˜å‚¨ ### DDL Queries {#ddl-queries} -MySQL DDL 语å¥ä¼šè¢«è½¬æ¢æˆå¯¹åº”çš„ClickHouse DDL 语å¥ï¼Œæ¯”如: ([ALTER](../../sql-reference/statements/alter/index.md), [CREATE](../../sql-reference/statements/create/index.md), [DROP](../../sql-reference/statements/drop), [RENAME](../../sql-reference/statements/rename.md)). 如果ClickHouse 无法解æžæŸäº›è¯­å¥DDL æ“作,则会跳过。 +MySQL DDL 语å¥ä¼šè¢«è½¬æ¢æˆå¯¹åº”çš„ClickHouse DDL 语å¥ï¼Œæ¯”如: ([ALTER](../../sql-reference/statements/alter/index.md), [CREATE](../../sql-reference/statements/create.md), [DROP](../../sql-reference/statements/drop.md), [RENAME](../../sql-reference/statements/rename.md)). 如果ClickHouse 无法解æžæŸäº›è¯­å¥DDL æ“作,则会跳过。 ### æ•°æ®å¤åˆ¶ {#data-replication} @@ -152,17 +152,17 @@ ClickHouseåªæœ‰ä¸€ä¸ªç‰©ç†æŽ’åºï¼Œç”± `order by` æ¡ä»¶å†³å®šã€‚è¦åˆ›å»ºä¸€ 这些是你å¯ä»¥å¯¹MaterializedMySQL表é‡å†™çš„模å¼è½¬æ¢æ“作: * 修改列类型。必须与原始类型兼容,å¦åˆ™å¤åˆ¶å°†å¤±è´¥ã€‚例如,å¯ä»¥å°†`UInt32`列修改为`UInt64`,ä¸èƒ½å°† `String` 列修改为 `Array(String)`。 - * 修改 [column TTL](../table-engines/mergetree-family/mergetree/#mergetree-column-ttl). + * 修改 [column TTL](../table-engines/mergetree-family/mergetree.md#mergetree-column-ttl). * 修改 [column compression codec](../../sql-reference/statements/create/table.mdx#codecs). * 增加 [ALIAS columns](../../sql-reference/statements/create/table.mdx#alias). - * 增加 [skipping indexes](../table-engines/mergetree-family/mergetree/#table_engine-mergetree-data_skipping-indexes) - * 增加 [projections](../table-engines/mergetree-family/mergetree/#projections). 
+ * 增加 [skipping indexes](../table-engines/mergetree-family/mergetree.md#table_engine-mergetree-data_skipping-indexes) + * 增加 [projections](../table-engines/mergetree-family/mergetree.md#projections). 请注æ„,当使用 `SELECT ... FINAL ` (MaterializedMySQL默认是这样åšçš„) 时,预测优化是被ç¦ç”¨çš„,所以这里是å—é™çš„, `INDEX ... TYPE hypothesis `[在v21.12çš„åšå®¢æ–‡ç« ä¸­æè¿°]](https://clickhouse.com/blog/en/2021/clickhouse-v21.12-released/)å¯èƒ½åœ¨è¿™ç§æƒ…况下更有用。 - * 修改 [PARTITION BY](../table-engines/mergetree-family/custom-partitioning-key/) - * 修改 [ORDER BY](../table-engines/mergetree-family/mergetree/#mergetree-query-clauses) - * 修改 [PRIMARY KEY](../table-engines/mergetree-family/mergetree/#mergetree-query-clauses) - * 增加 [SAMPLE BY](../table-engines/mergetree-family/mergetree/#mergetree-query-clauses) - * 增加 [table TTL](../table-engines/mergetree-family/mergetree/#mergetree-query-clauses) + * 修改 [PARTITION BY](../table-engines/mergetree-family/custom-partitioning-key.md) + * 修改 [ORDER BY](../table-engines/mergetree-family/mergetree.md#mergetree-query-clauses) + * 修改 [PRIMARY KEY](../table-engines/mergetree-family/mergetree.md#mergetree-query-clauses) + * 增加 [SAMPLE BY](../table-engines/mergetree-family/mergetree.md#mergetree-query-clauses) + * 增加 [table TTL](../table-engines/mergetree-family/mergetree.md#mergetree-query-clauses) ```sql CREATE DATABASE db_name ENGINE = MaterializedMySQL(...) diff --git a/docs/zh/engines/table-engines/mergetree-family/mergetree.md b/docs/zh/engines/table-engines/mergetree-family/mergetree.md index 1fcf64fcd25..54524388650 100644 --- a/docs/zh/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/zh/engines/table-engines/mergetree-family/mergetree.md @@ -192,7 +192,7 @@ ClickHouse 会为æ¯ä¸ªæ•°æ®ç‰‡æ®µåˆ›å»ºä¸€ä¸ªç´¢å¼•æ–‡ä»¶æ¥å­˜å‚¨è¿™äº›æ ‡è®° ClickHouse ä¸è¦æ±‚主键唯一,所以您å¯ä»¥æ’入多æ¡å…·æœ‰ç›¸åŒä¸»é”®çš„行。 -您å¯ä»¥åœ¨`PRIMARY KEY`与`ORDER BY`æ¡ä»¶ä¸­ä½¿ç”¨`å¯ä¸ºç©ºçš„`类型的表达å¼ï¼Œä½†å¼ºçƒˆå»ºè®®ä¸è¦è¿™ä¹ˆåšã€‚为了å¯ç”¨è¿™é¡¹åŠŸèƒ½ï¼Œè¯·æ‰“å¼€[allow_nullable_key](../../../operations/settings/#allow-nullable-key),[NULLS_LAST](../../../sql-reference/statements/select/order-by.md/#sorting-of-special-values)规则也适用于`ORDER BY`æ¡ä»¶ä¸­æœ‰NULL值的情况下。 +您å¯ä»¥åœ¨`PRIMARY KEY`与`ORDER BY`æ¡ä»¶ä¸­ä½¿ç”¨`å¯ä¸ºç©ºçš„`类型的表达å¼ï¼Œä½†å¼ºçƒˆå»ºè®®ä¸è¦è¿™ä¹ˆåšã€‚为了å¯ç”¨è¿™é¡¹åŠŸèƒ½ï¼Œè¯·æ‰“å¼€[allow_nullable_key](../../../operations/settings/index.md#allow-nullable-key),[NULLS_LAST](../../../sql-reference/statements/select/order-by.md#sorting-of-special-values)规则也适用于`ORDER BY`æ¡ä»¶ä¸­æœ‰NULL值的情况下。 ### 主键的选择 {#zhu-jian-de-xuan-ze} @@ -330,7 +330,7 @@ SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 支æŒçš„æ•°æ®ç±»åž‹ï¼š`Int*`, `UInt*`, `Float*`, `Enum`, `Date`, `DateTime`, `String`, `FixedString`, `Array`, `LowCardinality`, `Nullable`。 - 以下函数会用到这个索引: [equals](../../../sql-reference/functions/comparison-functions.md), [notEquals](../../../sql-reference/functions/comparison-functions.md), [in](../../../sql-reference/functions/in-functions), [notIn](../../../sql-reference/functions/in-functions), [has](../../../sql-reference/functions/array-functions) + 以下函数会用到这个索引: [equals](../../../sql-reference/functions/comparison-functions.md), [notEquals](../../../sql-reference/functions/comparison-functions.md), [in](../../../sql-reference/functions/in-functions.md), [notIn](../../../sql-reference/functions/in-functions.md), [has](../../../sql-reference/functions/array-functions.md) ``` sql INDEX sample_index (u64 * length(s)) TYPE minmax GRANULARITY 4 @@ -353,14 +353,14 @@ WHERE 
å­å¥ä¸­çš„æ¡ä»¶å¯ä»¥åŒ…å«å¯¹æŸåˆ—æ•°æ®è¿›è¡Œè¿ç®—的函数表达 | [startsWith](../../../sql-reference/functions/string-functions.md#startswith) | ✔ | ✔ | ✔ | ✔ | ✗ | | [endsWith](../../../sql-reference/functions/string-functions.md#endswith) | ✗ | ✗ | ✔ | ✔ | ✗ | | [multiSearchAny](../../../sql-reference/functions/string-search-functions.md#function-multisearchany) | ✗ | ✗ | ✔ | ✗ | ✗ | -| [in](../../../sql-reference/functions/in-functions#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | -| [notIn](../../../sql-reference/functions/in-functions#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [in](../../../sql-reference/functions/in-functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [notIn](../../../sql-reference/functions/in-functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | | [less (\<)](../../../sql-reference/functions/comparison-functions.md#function-less) | ✔ | ✔ | ✗ | ✗ | ✗ | | [greater (\>)](../../../sql-reference/functions/comparison-functions.md#function-greater) | ✔ | ✔ | ✗ | ✗ | ✗ | | [lessOrEquals (\<=)](../../../sql-reference/functions/comparison-functions.md#function-lessorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | | [greaterOrEquals (\>=)](../../../sql-reference/functions/comparison-functions.md#function-greaterorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | -| [empty](../../../sql-reference/functions/array-functions#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ | -| [notEmpty](../../../sql-reference/functions/array-functions#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [empty](../../../sql-reference/functions/array-functions.md#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [notEmpty](../../../sql-reference/functions/array-functions.md#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ | | hasToken | ✗ | ✗ | ✗ | ✔ | ✗ | 常é‡å‚æ•°å°äºŽ ngram 大å°çš„函数ä¸èƒ½ä½¿ç”¨ `ngrambf_v1` 进行查询优化。 diff --git a/docs/zh/faq/general.md b/docs/zh/faq/general.md index 5a95b9aad07..530be7f08d8 100644 --- a/docs/zh/faq/general.md +++ b/docs/zh/faq/general.md @@ -1,5 +1,5 @@ --- -slug: /zh/faq/general +slug: /zh/faq/general/overview --- # 常è§é—®é¢˜ {#chang-jian-wen-ti} diff --git a/docs/zh/faq/general/columnar-database.md b/docs/zh/faq/general/columnar-database.md index 57541aec69b..004da067900 100644 --- a/docs/zh/faq/general/columnar-database.md +++ b/docs/zh/faq/general/columnar-database.md @@ -7,20 +7,20 @@ sidebar_position: 101 # 什么是列存储数æ®åº“? 
{#what-is-a-columnar-database} -列存储数æ®åº“独立存储æ¯ä¸ªåˆ—çš„æ•°æ®ã€‚è¿™åªå…许从ç£ç›˜è¯»å–任何给定查询中使用的列的数æ®ã€‚其代价是,影å“整行的æ“作会按比例å˜å¾—更昂贵。列存储数æ®åº“çš„åŒä¹‰è¯æ˜¯é¢å‘列的数æ®åº“管ç†ç³»ç»Ÿã€‚ClickHouse就是这样一个典型的例å­ã€‚ +列存储数æ®åº“独立存储æ¯ä¸ªåˆ—çš„æ•°æ®ã€‚è¿™åªå…许从ç£ç›˜è¯»å–任何给定查询中使用的列的数æ®ã€‚其代价是,影å“整行的æ“作会按比例å˜å¾—更昂贵。列存储数æ®åº“çš„åŒä¹‰è¯æ˜¯é¢å‘列的数æ®åº“管ç†ç³»ç»Ÿã€‚ClickHouse 就是这样一个典型的例å­ã€‚ 列存储数æ®åº“的主è¦ä¼˜ç‚¹æ˜¯: - 查询åªä½¿ç”¨è®¸å¤šåˆ—其中的少数列。 -— èšåˆå¯¹å¤§é‡æ•°æ®çš„查询。 -— 按列压缩。 +- èšåˆå¯¹å¤§é‡æ•°æ®çš„查询。 +- 按列压缩。 下é¢æ˜¯æž„建报表时传统的é¢å‘行系统和柱状数æ®åº“之间的区别: **传统行存储** -!(传统行存储)(https://clickhouse.com/docs/en/images/row-oriented.gif) +![传统行存储](https://clickhouse.com/docs/assets/images/row-oriented-3e6fd5aa48e3075202d242b4799da8fa.gif) **列存储** -!(列存储)(https://clickhouse.com/docs/en/images/column-oriented.gif) +![列存储](https://clickhouse.com/docs/assets/images/column-oriented-d082e49b7743d4ded32c7952bfdb028f.gif) -列存储数æ®åº“是分æžåº”用程åºçš„首选,因为它å…许在一个表中有许多列以防万一,但ä¸ä¼šåœ¨è¯»å–查询执行时为未使用的列付出代价。é¢å‘列的数æ®åº“是为大数æ®å¤„ç†è€Œè®¾è®¡çš„,因为和数æ®ä»“库一样,它们通常使用分布å¼çš„低æˆæœ¬ç¡¬ä»¶é›†ç¾¤æ¥æ高åžåé‡ã€‚ClickHouse结åˆäº†[分布å¼](../../engines/table-engines/special/distributed.md)å’Œ[å¤åˆ¶å¼](../../engines/table-engines/mergetree-family/replication.md)两类表。 \ No newline at end of file +列存储数æ®åº“是分æžåº”用程åºçš„首选,因为它å…许在一个表中有许多列以防万一,但ä¸ä¼šåœ¨è¯»å–查询执行时为未使用的列付出代价。é¢å‘列的数æ®åº“是为大数æ®å¤„ç†è€Œè®¾è®¡çš„,因为和数æ®ä»“库一样,它们通常使用分布å¼çš„低æˆæœ¬ç¡¬ä»¶é›†ç¾¤æ¥æ高åžåé‡ã€‚ClickHouse 结åˆäº†[分布å¼](../../engines/table-engines/special/distributed.md)å’Œ[å¤åˆ¶å¼](../../engines/table-engines/mergetree-family/replication.md)两类表。 diff --git a/docs/zh/faq/general/index.md b/docs/zh/faq/general/index.md index 8b0b42cede2..9693e7ffc82 100644 --- a/docs/zh/faq/general/index.md +++ b/docs/zh/faq/general/index.md @@ -21,8 +21,7 @@ sidebar_label: General - [我如何为 ClickHouse贡献代ç ?](../../faq/general/how-do-i-contribute-code-to-clickhouse.md) - !!! info "没找到您需è¦çš„内容?" - 请查阅 [其他 F.A.Q. 类别](../../faq/) 或者从左侧导航æ æµè§ˆå…¶ä»–文档 - + 请查阅 [其他 F.A.Q. 类别](../../faq/index.md) 或者从左侧导航æ æµè§ˆå…¶ä»–文档 + {## [原始文档](https://clickhouse.com/docs/en/faq/general/) ##} diff --git a/docs/zh/faq/general/why-clickhouse-is-so-fast.md b/docs/zh/faq/general/why-clickhouse-is-so-fast.md index a30b56adb9a..1962b8b90c2 100644 --- a/docs/zh/faq/general/why-clickhouse-is-so-fast.md +++ b/docs/zh/faq/general/why-clickhouse-is-so-fast.md @@ -9,7 +9,7 @@ sidebar_position: 8 It was designed to be fast. Query execution performance has always been a top priority during the development process, but other important characteristics like user-friendliness, scalability, and security were also considered so ClickHouse could become a real production system. -ClickHouse was initially built as a prototype to do just a single task well: to filter and aggregate data as fast as possible. That’s what needs to be done to build a typical analytical report and that’s what a typical [GROUP BY](../../sql-reference/statements/select/group-by/) query does. ClickHouse team has made several high-level decisions that combined made achieving this task possible: +ClickHouse was initially built as a prototype to do just a single task well: to filter and aggregate data as fast as possible. That’s what needs to be done to build a typical analytical report and that’s what a typical [GROUP BY](../../sql-reference/statements/select/group-by.md) query does. 
ClickHouse team has made several high-level decisions that combined made achieving this task possible: Column-oriented storage : Source data often contain hundreds or even thousands of columns, while a report can use just a few of them. The system needs to avoid reading unnecessary columns, or most expensive disk read operations would be wasted. diff --git a/docs/zh/faq/integration/index.md b/docs/zh/faq/integration/index.md index 6678956a0b3..3a3f97761f3 100644 --- a/docs/zh/faq/integration/index.md +++ b/docs/zh/faq/integration/index.md @@ -1,5 +1,5 @@ --- -slug: /zh/faq/integration/ +slug: /zh/faq/integration/ title: 关于集æˆClickHouse和其他系统的问题 toc_hidden_folder: true sidebar_position: 4 @@ -17,6 +17,6 @@ sidebar_label: Integration !!! info "没看到你è¦æ‰¾çš„东西å—?" - 查看[其他faq类别](../../faq/)或æµè§ˆå·¦è¾¹æ ä¸­çš„主è¦æ–‡æ¡£æ–‡ç« ã€‚ + 查看[其他faq类别](../../faq/index.md)或æµè§ˆå·¦è¾¹æ ä¸­çš„主è¦æ–‡æ¡£æ–‡ç« ã€‚ {## [原文](https://clickhouse.com/docs/en/faq/integration/) ##} \ No newline at end of file diff --git a/docs/zh/faq/integration/json-import.md b/docs/zh/faq/integration/json-import.md index 861abacc1e1..2d5c687316d 100644 --- a/docs/zh/faq/integration/json-import.md +++ b/docs/zh/faq/integration/json-import.md @@ -7,29 +7,29 @@ sidebar_position: 11 # How to Import JSON Into ClickHouse? {#how-to-import-json-into-clickhouse} -ClickHouse supports a wide range of [data formats for input and output](../../interfaces/formats/). There are multiple JSON variations among them, but the most commonly used for data ingestion is [JSONEachRow](../../interfaces/formats/#jsoneachrow). It expects one JSON object per row, each object separated by a newline. +ClickHouse supports a wide range of [data formats for input and output](../../interfaces/formats.md). There are multiple JSON variations among them, but the most commonly used for data ingestion is [JSONEachRow](../../interfaces/formats.md#jsoneachrow). It expects one JSON object per row, each object separated by a newline. ## Examples {#examples} -Using [HTTP interface](../../interfaces/http/): +Using [HTTP interface](../../interfaces/http.md): ``` bash $ echo '{"foo":"bar"}' | curl 'http://localhost:8123/?query=INSERT%20INTO%20test%20FORMAT%20JSONEachRow' --data-binary @- ``` -Using [CLI interface](../../interfaces/cli/): +Using [CLI interface](../../interfaces/cli.md): ``` bash $ echo '{"foo":"bar"}' | clickhouse-client --query="INSERT INTO test FORMAT JSONEachRow" ``` -Instead of inserting data manually, you might consider to use one of [client libraries](../../interfaces/) instead. +Instead of inserting data manually, you might consider to use one of [client libraries](../../interfaces/index.md) instead. ## Useful Settings {#useful-settings} - `input_format_skip_unknown_fields` allows to insert JSON even if there were additional fields not present in table schema (by discarding them). -- `input_format_import_nested_json` allows to insert nested JSON objects into columns of [Nested](../../sql-reference/data-types/nested-data-structures/nested/) type. +- `input_format_import_nested_json` allows to insert nested JSON objects into columns of [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) type. -:::note +:::note Settings are specified as `GET` parameters for the HTTP interface or as additional command-line arguments prefixed with `--` for the `CLI` interface. 
::: \ No newline at end of file diff --git a/docs/zh/faq/operations/delete-old-data.md b/docs/zh/faq/operations/delete-old-data.md index b2229058cad..24181116bab 100644 --- a/docs/zh/faq/operations/delete-old-data.md +++ b/docs/zh/faq/operations/delete-old-data.md @@ -19,7 +19,7 @@ The key advantage of this approach is that it does not need any external system TTL can also be used to move data not only to [/dev/null](https://en.wikipedia.org/wiki/Null_device), but also between different storage systems, like from SSD to HDD. ::: -More details on [configuring TTL](../../engines/table-engines/mergetree-family/mergetree/#table_engine-mergetree-ttl). +More details on [configuring TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). ## ALTER DELETE {#alter-delete} @@ -41,4 +41,4 @@ More details on [manipulating partitions](../../sql-reference/statements/alter/p It’s rather radical to drop all data from a table, but in some cases it might be exactly what you need. -More details on [table truncation](../../sql-reference/statements/truncate/). +More details on [table truncation](../../sql-reference/statements/truncate.md). \ No newline at end of file diff --git a/docs/zh/faq/operations/index.md b/docs/zh/faq/operations/index.md index 071cc872e4e..153eda6199a 100644 --- a/docs/zh/faq/operations/index.md +++ b/docs/zh/faq/operations/index.md @@ -1,5 +1,5 @@ --- -slug: /zh/faq/operations/ +slug: /zh/faq/operations/ title: 关于æ“作ClickHouseæœåŠ¡å™¨å’Œé›†ç¾¤çš„问题 toc_hidden_folder: true sidebar_position: 3 @@ -13,9 +13,9 @@ sidebar_label: Operations - [如果想在生产环境部署,需è¦ç”¨å“ªä¸ªç‰ˆæœ¬çš„ ClickHouse å‘¢?](../../faq/operations/production.md) - [是å¦å¯èƒ½ä»Ž ClickHouse æ•°æ®è¡¨ä¸­åˆ é™¤æ‰€æœ‰æ—§çš„æ•°æ®è®°å½•?](../../faq/operations/delete-old-data.md) - [ClickHouse支æŒå¤šåŒºåŸŸå¤åˆ¶å—?](../../faq/operations/multi-region-replication.md) - + !!! info "没看到你è¦æ‰¾çš„东西å—?" - 查看[其他faq类别](../../faq/)或æµè§ˆå·¦è¾¹æ ä¸­çš„主è¦æ–‡æ¡£æ–‡ç« ã€‚ + 查看[其他faq类别](../../faq/index.md)或æµè§ˆå·¦è¾¹æ ä¸­çš„主è¦æ–‡æ¡£æ–‡ç« ã€‚ {## [原文](https://clickhouse.com/docs/en/faq/production/) ##} diff --git a/docs/zh/faq/operations/production.md b/docs/zh/faq/operations/production.md index cc5cf6b9614..90db050e8d3 100644 --- a/docs/zh/faq/operations/production.md +++ b/docs/zh/faq/operations/production.md @@ -67,6 +67,6 @@ For production use, there are two key options: `stable` and `lts`. Here is some Many teams who initially thought that `lts` is the way to go, often switch to `stable` anyway because of some recent feature that’s important for their product. -:::warning -One more thing to keep in mind when upgrading ClickHouse: we’re always keeping eye on compatibility across releases, but sometimes it’s not reasonable to keep and some minor details might change. So make sure you check the [changelog](../../whats-new/changelog/) before upgrading to see if there are any notes about backward-incompatible changes. +:::warning +One more thing to keep in mind when upgrading ClickHouse: we’re always keeping eye on compatibility across releases, but sometimes it’s not reasonable to keep and some minor details might change. So make sure you check the [changelog](../../whats-new/changelog/index.md) before upgrading to see if there are any notes about backward-incompatible changes. 
::: \ No newline at end of file diff --git a/docs/zh/faq/use-cases/index.md b/docs/zh/faq/use-cases/index.md index 75ef26368a3..ff0d873b37f 100644 --- a/docs/zh/faq/use-cases/index.md +++ b/docs/zh/faq/use-cases/index.md @@ -14,6 +14,6 @@ sidebar_label: 使用案例 - [我能把 ClickHouse 当åšKey-value 键值存储æ¥ä½¿ç”¨å—?](../../faq/use-cases/key-value.md) !!! info "没找到您所需è¦çš„内容?" - 请查看[其他常è§é—®é¢˜ç±»åˆ«](../../faq/)或æµè§ˆå·¦ä¾§è¾¹æ ä¸­çš„主è¦æ–‡æ¡£æ–‡ç« ã€‚ + 请查看[其他常è§é—®é¢˜ç±»åˆ«](../../faq/index.md)或æµè§ˆå·¦ä¾§è¾¹æ ä¸­çš„主è¦æ–‡æ¡£æ–‡ç« ã€‚ {## [原始文档](https://clickhouse.com/docs/en/faq/use-cases/) ##} diff --git a/docs/zh/getting-started/example-datasets/recipes.mdx b/docs/zh/getting-started/example-datasets/recipes.mdx index b7ed92962c5..b7f8fe8eafd 100644 --- a/docs/zh/getting-started/example-datasets/recipes.mdx +++ b/docs/zh/getting-started/example-datasets/recipes.mdx @@ -1,5 +1,5 @@ ---- -slug: /zh/getting-started/example-datasets/recipes +--- +slug: /zh/getting-started/example-datasets/recipes sidebar_label: 食谱数æ®é›† title: "食谱数æ®é›†" --- @@ -8,8 +8,8 @@ RecipeNLG æ•°æ®é›†å¯åœ¨ [此处](https://recipenlg.cs.put.poznan.pl/dataset) ## 下载并解压数æ®é›† -1. 进入下载页é¢[https://recipenlg.cs.put.poznan.pl/dataset](https://recipenlg.cs.put.poznan.pl/dataset)。 -2. 接å—æ¡æ¬¾å’Œæ¡ä»¶å¹¶ä¸‹è½½ zip 文件。 +1. 进入下载页é¢[https://recipenlg.cs.put.poznan.pl/dataset](https://recipenlg.cs.put.poznan.pl/dataset)。 +2. 接å—æ¡æ¬¾å’Œæ¡ä»¶å¹¶ä¸‹è½½ zip 文件。 3. 使用 `unzip` 解压 zip 文件,得到 `full_dataset.csv` 文件。 ## 创建表 @@ -49,13 +49,13 @@ clickhouse-client --query " 这是一个展示如何解æžè‡ªå®šä¹‰ CSV,这其中涉åŠäº†è®¸å¤šè°ƒæ•´ã€‚ -说明: -- æ•°æ®é›†ä¸º CSV æ ¼å¼ï¼Œä½†åœ¨æ’入时需è¦ä¸€äº›é¢„处ç†ï¼›ä½¿ç”¨è¡¨å‡½æ•° [input](../../sql-reference/table-functions/input.md) 进行预处ç†ï¼› -- CSV 文件的结构在表函数 `input` çš„å‚数中指定; -- 字段 `num`(行å·ï¼‰æ˜¯ä¸éœ€è¦çš„ - å¯ä»¥å¿½ç•¥å¹¶ä»Žæ–‡ä»¶ä¸­è¿›è¡Œè§£æžï¼› -- 使用 `FORMAT CSVWithNames`,因为标题ä¸åŒ…å«ç¬¬ä¸€ä¸ªå­—段的å称,因此 CSV 中的标题将被忽略(通过命令行å‚æ•° `--input_format_with_names_use_header 0`); -- 文件仅使用åŒå¼•å·å°† CSV 字符串括起æ¥ï¼›ä¸€äº›å­—符串没有用åŒå¼•å·æ‹¬èµ·æ¥ï¼Œå•å¼•å·ä¹Ÿä¸èƒ½è¢«è§£æžä¸ºæ‹¬èµ·æ¥çš„字符串 - 所以添加`--format_csv_allow_single_quote 0`å‚数接å—文件中的å•å¼•å·ï¼› -- 由于æŸäº› CSV çš„å­—ç¬¦ä¸²çš„å¼€å¤´åŒ…å« `\M/` 因此无法被解æžï¼› CSV 中唯一å¯èƒ½ä»¥åæ–œæ å¼€å¤´çš„值是 `\N`,这个值被解æžä¸º SQL NULL。通过添加`--input_format_allow_errors_num 10`å‚数,å…许在导入过程中跳过 10 个格å¼é”™è¯¯ï¼› +说明: +- æ•°æ®é›†ä¸º CSV æ ¼å¼ï¼Œä½†åœ¨æ’入时需è¦ä¸€äº›é¢„处ç†ï¼›ä½¿ç”¨è¡¨å‡½æ•° [input](../../sql-reference/table-functions/input.md) 进行预处ç†ï¼› +- CSV 文件的结构在表函数 `input` çš„å‚数中指定; +- 字段 `num`(行å·ï¼‰æ˜¯ä¸éœ€è¦çš„ - å¯ä»¥å¿½ç•¥å¹¶ä»Žæ–‡ä»¶ä¸­è¿›è¡Œè§£æžï¼› +- 使用 `FORMAT CSVWithNames`,因为标题ä¸åŒ…å«ç¬¬ä¸€ä¸ªå­—段的å称,因此 CSV 中的标题将被忽略(通过命令行å‚æ•° `--input_format_with_names_use_header 0`); +- 文件仅使用åŒå¼•å·å°† CSV 字符串括起æ¥ï¼›ä¸€äº›å­—符串没有用åŒå¼•å·æ‹¬èµ·æ¥ï¼Œå•å¼•å·ä¹Ÿä¸èƒ½è¢«è§£æžä¸ºæ‹¬èµ·æ¥çš„字符串 - 所以添加`--format_csv_allow_single_quote 0`å‚数接å—文件中的å•å¼•å·ï¼› +- 由于æŸäº› CSV çš„å­—ç¬¦ä¸²çš„å¼€å¤´åŒ…å« `\M/` 因此无法被解æžï¼› CSV 中唯一å¯èƒ½ä»¥åæ–œæ å¼€å¤´çš„值是 `\N`,这个值被解æžä¸º SQL NULL。通过添加`--input_format_allow_errors_num 10`å‚数,å…许在导入过程中跳过 10 个格å¼é”™è¯¯ï¼› - 在数æ®é›†ä¸­çš„ Ingredientsã€directions å’Œ NER 字段为数组;但这些数组并没有以一般形å¼è¡¨ç¤ºï¼šè¿™äº›å­—段作为 JSON åºåˆ—化为字符串,然åŽæ”¾å…¥ CSV 中 - 在导入是将它们解æžä¸ºå­—符串,然åŽä½¿ç”¨ [JSONExtract](../../sql-reference/functions/json-functions.md ) 函数将其转æ¢ä¸ºæ•°ç»„。 ## 验è¯æ’å…¥çš„æ•°æ® @@ -80,7 +80,7 @@ SELECT count() FROM recipes; ### 按é…方数é‡æŽ’列的顶级组件: -在此示例中,我们学习如何使用 [arrayJoin](../../sql-reference/functions/array-join/) 函数将数组扩展为行的集åˆã€‚ +在此示例中,我们学习如何使用 [arrayJoin](../../sql-reference/functions/array-join.md) 
函数将数组扩展为行的集åˆã€‚ 请求: @@ -185,7 +185,7 @@ LIMIT 10 10 rows in set. Elapsed: 0.215 sec. Processed 2.23 million rows, 1.48 GB (10.35 million rows/s., 6.86 GB/s.) ``` -在此示例中,我们使用 [has](../../sql-reference/functions/array-functions/#hasarr-elem) 函数æ¥æŒ‰è¿‡æ»¤æ•°ç»„类型元素并按 directions çš„æ•°é‡è¿›è¡ŒæŽ’åºã€‚ +在此示例中,我们使用 [has](../../sql-reference/functions/array-functions.md#hasarr-elem) 函数æ¥æŒ‰è¿‡æ»¤æ•°ç»„类型元素并按 directions çš„æ•°é‡è¿›è¡ŒæŽ’åºã€‚ 有一个婚礼蛋糕需è¦æ•´ä¸ª126个步骤æ¥åˆ¶ä½œï¼æ˜¾ç¤º directions: diff --git a/docs/zh/guides/improving-query-performance/skipping-indexes.md b/docs/zh/guides/improving-query-performance/skipping-indexes.md index f5889898c2c..f9f43e46927 100644 --- a/docs/zh/guides/improving-query-performance/skipping-indexes.md +++ b/docs/zh/guides/improving-query-performance/skipping-indexes.md @@ -89,7 +89,7 @@ SELECT * FROM skip_table WHERE my_value IN (125, 700) 下图是更直观的展示,这就是如何读å–和选择my_value为125çš„4096行,以åŠå¦‚何跳过以下行而ä¸ä»Žç£ç›˜è¯»å–: -![Simple Skip](../../../en/guides/improving-query-performance/images/simple_skip.svg) +![Simple Skip](../../../en/guides/best-practices/images/simple_skip.svg) 通过在执行查询时å¯ç”¨è·Ÿè¸ªï¼Œç”¨æˆ·å¯ä»¥çœ‹åˆ°å…³äºŽè·³æ•°ç´¢å¼•ä½¿ç”¨æƒ…况的详细信æ¯ã€‚在clickhouse-client中设置send_logs_level: @@ -126,7 +126,7 @@ Bloom filter是一ç§æ•°æ®ç»“构,它å…许对集åˆæˆå‘˜è¿›è¡Œé«˜æ•ˆçš„是 * 基本的**bloom_filter**接å—一个å¯é€‰å‚数,该å‚数表示在0到1之间å…许的“å‡é˜³æ€§â€çŽ‡(如果未指定,则使用.025)。 * 更专业的**tokenbf_v1**。需è¦ä¸‰ä¸ªå‚数,用æ¥ä¼˜åŒ–布隆过滤器:(1)过滤器的大å°å­—节(大过滤器有更少的å‡é˜³æ€§ï¼Œæœ‰æ›´é«˜çš„存储æˆæœ¬),(2)哈希函数的个数(更多的散列函数å¯ä»¥å‡å°‘å‡é˜³æ€§)。(3)布隆过滤器哈希函数的ç§å­ã€‚有关这些å‚数如何影å“布隆过滤器功能的更多细节,请å‚阅 [这里](https://hur.st/bloomfilter/) 。此索引仅适用于Stringã€FixedStringå’ŒMap类型的数æ®ã€‚输入表达å¼è¢«åˆ†å‰²ä¸ºç”±éžå­—æ¯æ•°å­—字符分隔的字符åºåˆ—。例如,列值`This is a candidate for a "full text" search`将被分割为`This` `is` `a` `candidate` `for` `full` `text` `search`。它用于LIKEã€EQUALSã€inã€hasToken()和类似的长字符串中å•è¯å’Œå…¶ä»–值的æœç´¢ã€‚例如,一ç§å¯èƒ½çš„用途是在éžç»“构的应用程åºæ—¥å¿—行列中æœç´¢å°‘é‡çš„ç±»å或行å·ã€‚ - + * 更专业的**ngrambf_v1**。该索引的功能与tokenbf_v1相åŒã€‚在Bloom filter设置之å‰éœ€è¦ä¸€ä¸ªé¢å¤–çš„å‚数,å³è¦ç´¢å¼•çš„ngram的大å°ã€‚一个ngram是长度为n的任何字符串,比如如果n是4,`A short string`会被分割为`A sh`` sho`, `shor`, `hort`, `ort s`, `or st`, `r str`, ` stri`, `trin`, `ring`。这个索引对于文本æœç´¢ä¹Ÿå¾ˆæœ‰ç”¨ï¼Œç‰¹åˆ«æ˜¯æ²¡æœ‰å•è¯é—´æ–­çš„语言,比如中文。 ### 跳数索引函数 @@ -150,7 +150,7 @@ Bloom filter是一ç§æ•°æ®ç»“构,它å…许对集åˆæˆå‘˜è¿›è¡Œé«˜æ•ˆçš„是 考虑以下数æ®åˆ†å¸ƒï¼š -![Bad Skip!](../../../en/guides/improving-query-performance/images/bad_skip_1.svg) +![Bad Skip!](../../../en/guides/best-practices/images/bad_skip_1.svg) å‡è®¾ä¸»é”®/顺åºæ˜¯æ—¶é—´æˆ³ï¼Œå¹¶ä¸”在visitor_id上有一个索引。考虑下é¢çš„查询: diff --git a/docs/zh/guides/improving-query-performance/sparse-primary-indexes.md b/docs/zh/guides/improving-query-performance/sparse-primary-indexes.md index 18b23a79f86..51167521018 100644 --- a/docs/zh/guides/improving-query-performance/sparse-primary-indexes.md +++ b/docs/zh/guides/improving-query-performance/sparse-primary-indexes.md @@ -1,5 +1,5 @@ --- -slug: /zh/guides/improving-query-performance/sparse-primary-indexes +slug: /zh/guides/best-practices sidebar_label: 主键稀ç–索引 sidebar_position: 20 --- @@ -19,21 +19,21 @@ sidebar_position: 20 :::note 这篇文章主è¦å…³æ³¨ç¨€ç–索引。 -如果想了解二级跳数索引,请查看[教程](./skipping-indexes.md). +如果想了解二级跳数索引,请查看[教程](./skipping-indexes.md). 
::: -## æ•°æ®é›† +## æ•°æ®é›† 在本文中,我们将使用一个匿åçš„webæµé‡æ•°æ®é›†ã€‚ -- 我们将使用样本数æ®é›†ä¸­çš„887万行(事件)çš„å­é›†ã€‚ +- 我们将使用样本数æ®é›†ä¸­çš„887万行(事件)çš„å­é›†ã€‚ - 未压缩的数æ®å¤§å°ä¸º887万个事件和大约700mb。当存储在ClickHouse时,压缩为200mb。 - 在我们的å­é›†ä¸­ï¼Œæ¯è¡ŒåŒ…å«ä¸‰åˆ—,表示在特定时间(EventTime列)å•å‡»URL (URL列)的互è”网用户(UserID列)。 通过这三个列,我们已ç»å¯ä»¥åˆ¶å®šä¸€äº›å…¸åž‹çš„web分æžæŸ¥è¯¢ï¼Œå¦‚: - + - æŸä¸ªç”¨æˆ·ç‚¹å‡»æ¬¡æ•°æœ€å¤šçš„å‰10个url是什么? - 点击æŸä¸ªURL次数最多的å‰10å用户是è°ï¼Ÿ - 用户点击特定URL的最频ç¹æ—¶é—´(比如一周中的几天)是什么? @@ -44,7 +44,7 @@ sidebar_position: 20 ## 全表扫æ -为了了解在没有主键的情况下如何对数æ®é›†æ‰§è¡ŒæŸ¥è¯¢ï¼Œæˆ‘们通过执行以下SQL DDL语å¥(使用MergeTree表引擎)创建了一个表: +为了了解在没有主键的情况下如何对数æ®é›†æ‰§è¡ŒæŸ¥è¯¢ï¼Œæˆ‘们通过执行以下SQL DDL语å¥(使用MergeTree表引擎)创建了一个表: ```sql CREATE TABLE hits_NoPrimaryKey @@ -70,11 +70,11 @@ FROM url('https://datasets.clickhouse.com/hits/tsv/hits_v1.tsv.xz') WHERE URL != ''; ``` 结果: -```response +```response Ok. 0 rows in set. Elapsed: 145.993 sec. Processed 8.87 million rows, 18.40 GB (60.78 thousand rows/s., 126.06 MB/s.) -``` +``` ClickHouse客户端输出了执行结果,æ’入了887万行数æ®ã€‚ @@ -102,7 +102,7 @@ ORDER BY Count DESC LIMIT 10; ``` 结果: -```response +```response ┌─URL────────────────────────────┬─Count─┠│ http://auto.ru/chatay-barana.. │ 170 │ │ http://auto.ru/chatay-id=371...│ 52 │ @@ -117,10 +117,10 @@ LIMIT 10; └────────────────────────────────┴───────┘ 10 rows in set. Elapsed: 0.022 sec. -// highlight-next-line -Processed 8.87 million rows, +// highlight-next-line +Processed 8.87 million rows, 70.45 MB (398.53 million rows/s., 3.17 GB/s.) -``` +``` ClickHouse客户端输出表明,ClickHouse执行了一个完整的表扫æï¼æˆ‘们的表的887万行中的æ¯ä¸€è¡Œéƒ½è¢«åŠ è½½åˆ°ClickHouse中,这ä¸æ˜¯å¯æ‰©å±•çš„。 @@ -131,7 +131,7 @@ ClickHouse客户端输出表明,ClickHouse执行了一个完整的表扫æï¼ ## 包å«ä¸»é”®çš„表 -创建一个包å«è”åˆä¸»é”®UserIDå’ŒURL列的表: +创建一个包å«è”åˆä¸»é”®UserIDå’ŒURL列的表: ```sql CREATE TABLE hits_UserID_URL @@ -141,7 +141,7 @@ CREATE TABLE hits_UserID_URL `EventTime` DateTime ) ENGINE = MergeTree -// highlight-next-line +// highlight-next-line PRIMARY KEY (UserID, URL) ORDER BY (UserID, URL, EventTime) SETTINGS index_granularity = 8192, index_granularity_bytes = 0; @@ -149,10 +149,10 @@ SETTINGS index_granularity = 8192, index_granularity_bytes = 0; [//]: # (
)
- + DDL详情 -

+

为了简化本文后面的讨论,并使图和结果可重现,使用DDL语句有如下说明:

    @@ -164,7 +164,7 @@ SETTINGS index_granularity = 8192, index_granularity_bytes = 0;
  • index_granularity: 显å¼è®¾ç½®ä¸ºå…¶é»˜è®¤å€¼8192。这æ„味ç€å¯¹äºŽæ¯ä¸€ç»„8192行,主索引将有一个索引æ¡ç›®ï¼Œä¾‹å¦‚,如果表包å«16384行,那么索引将有两个索引æ¡ç›®ã€‚

  • -
  • index_granularity_bytes: 设置为0表示ç¦æ­¢è‡ªé€‚应索引粒度。自适应索引粒度æ„味ç€ClickHouse自动为一组n行创建一个索引æ¡ç›® +
  • index_granularity_bytes: 设置为0表示ç¦æ­¢è‡ªé€‚应索引粒度。自适应索引粒度æ„味ç€ClickHouse自动为一组n行创建一个索引æ¡ç›®
    • 如果nå°äºŽ8192,但n行的åˆå¹¶è¡Œæ•°æ®å¤§å°å¤§äºŽæˆ–等于10MB (index_granularity_bytes的默认值)或
    • n达到8192
    • @@ -190,7 +190,7 @@ FROM url('https://datasets.clickhouse.com/hits/tsv/hits_v1.tsv.xz') WHERE URL != ''; ``` 结果: -```response +```response 0 rows in set. Elapsed: 149.432 sec. Processed 8.87 million rows, 18.40 GB (59.38 thousand rows/s., 123.16 MB/s.) ``` @@ -219,7 +219,7 @@ FROM system.parts WHERE (table = 'hits_UserID_URL') AND (active = 1) FORMAT Vertical; ``` - + 结果: ```response @@ -237,7 +237,7 @@ bytes_on_disk: 207.07 MiB ``` 客户端输出表明: - + - 表数æ®ä»¥wide format存储在一个特定目录,æ¯ä¸ªåˆ—有一个数æ®æ–‡ä»¶å’Œmark文件。 - 表有887万行数æ®ã€‚ - 未压缩的数æ®æœ‰733.28 MB。 @@ -278,8 +278,8 @@ bytes_on_disk: 207.07 MiB ## æ•°æ®æŒ‰ç…§ä¸»é”®æŽ’åºå­˜å‚¨åœ¨ç£ç›˜ä¸Š -上é¢åˆ›å»ºçš„表有: -- è”åˆä¸»é”® (UserID, URL) +上é¢åˆ›å»ºçš„表有: +- è”åˆä¸»é”® (UserID, URL) - è”åˆæŽ’åºé”® (UserID, URL, EventTime)。 :::note @@ -293,7 +293,7 @@ bytes_on_disk: 207.07 MiB ::: -æ’入的行按照主键列(以åŠæŽ’åºé”®çš„附加EventTime列)çš„å­—å…¸åº(从å°åˆ°å¤§)存储在ç£ç›˜ä¸Šã€‚ +æ’入的行按照主键列(以åŠæŽ’åºé”®çš„附加EventTime列)çš„å­—å…¸åº(从å°åˆ°å¤§)存储在ç£ç›˜ä¸Šã€‚ :::note ClickHouseå…许æ’入具有相åŒä¸»é”®åˆ—的多行数æ®ã€‚在这ç§æƒ…况下(å‚è§ä¸‹å›¾ä¸­çš„第1行和第2è¡Œ),最终的顺åºæ˜¯ç”±æŒ‡å®šçš„排åºé”®å†³å®šçš„,这里是EventTime列的值。 @@ -307,7 +307,7 @@ ClickHouseå…许æ’入具有相åŒä¸»é”®åˆ—的多行数æ®ã€‚在这ç§æƒ…况下( - 然åŽæ˜¯URL, - 最åŽæ˜¯EventTime: - + UserID.bin,URL.bin,和EventTime.bin是UserID,URL,和EventTime列的数æ®æ–‡ä»¶ã€‚
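As a rough cross-check of the granule arithmetic above (8.87 million rows split into 1083 granules of about 8192 rows each), the mark count per part can be read straight from `system.parts`. This is only a sketch: the table name comes from this guide, and `rows`, `marks` and `active` are standard columns of that system table.

```sql
-- Sketch: each mark should cover roughly index_granularity (8192) rows,
-- apart from the smaller final granule.
SELECT
    table,
    rows,
    marks,
    round(rows / marks) AS avg_rows_per_mark
FROM system.parts
WHERE (table = 'hits_UserID_URL') AND active
FORMAT Vertical;
```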
      @@ -331,13 +331,19 @@ UserID.bin,URL.bin,和EventTime.bin是UserID 下图显示了如何将表中的887万行(列值)组织æˆ1083个颗粒,这是表的DDL语å¥åŒ…å«è®¾ç½®index_granularity(设置为默认值8192)的结果。 - + 第一个(æ ¹æ®ç£ç›˜ä¸Šçš„物ç†é¡ºåº)8192è¡Œ(它们的列值)在逻辑上属于颗粒0,然åŽä¸‹ä¸€ä¸ª8192è¡Œ(它们的列值)属于颗粒1,以此类推。 :::note - 最åŽä¸€ä¸ªé¢—粒(1082颗粒)是少于8192行的。 +- 我们在本指å—开头的“DDL 语å¥è¯¦ç»†ä¿¡æ¯â€ä¸­æ到,我们ç¦ç”¨äº†è‡ªé€‚应索引粒度(为了简化本指å—中的讨论,并使图表和结果å¯é‡çŽ°ï¼‰ã€‚ + + 因此,示例表中所有颗粒(除了最åŽä¸€ä¸ªï¼‰éƒ½å…·æœ‰ç›¸åŒå¤§å°ã€‚ + +- 对于具有自适应索引粒度的表(默认情况下索引粒度是自适应的),æŸäº›ç²’度的大å°å¯ä»¥å°äºŽ 8192 行,具体å–决于行数æ®å¤§å°ã€‚ + - 我们将主键列(UserID, URL)中的一些列值标记为橙色。 这些橙色标记的列值是æ¯ä¸ªé¢—粒中æ¯ä¸ªä¸»é”®åˆ—的最å°å€¼ã€‚这里的例外是最åŽä¸€ä¸ªé¢—ç²’(上图中的颗粒1082),最åŽä¸€ä¸ªé¢—粒我们标记的是最大的值。 @@ -355,21 +361,21 @@ UserID.bin,URL.bin,和EventTime.bin是UserID 下é¢çš„图显示了索引存储了æ¯ä¸ªé¢—粒的最å°ä¸»é”®åˆ—值(在上é¢çš„图中用橙色标记的值)。 例如: -- 第一个索引æ¡ç›®(下图中的“mark 0â€)存储上图中颗粒0的主键列的最å°å€¼ï¼Œ -- 第二个索引æ¡ç›®(下图中的“mark 1â€)存储上图中颗粒1的主键列的最å°å€¼ï¼Œä»¥æ­¤ç±»æŽ¨ã€‚ +- 第一个索引æ¡ç›®(下图中的“mark 0â€)存储上图中颗粒0的主键列的最å°å€¼ï¼Œ +- 第二个索引æ¡ç›®(下图中的“mark 1â€)存储上图中颗粒1的主键列的最å°å€¼ï¼Œä»¥æ­¤ç±»æŽ¨ã€‚ - + -在我们的表中,索引总共有1083个æ¡ç›®ï¼Œ887万行数æ®å’Œ1083个颗粒: +在我们的表中,索引总共有1083个æ¡ç›®ï¼Œ887万行数æ®å’Œ1083个颗粒: - + :::note - 最åŽä¸€ä¸ªç´¢å¼•æ¡ç›®(上图中的“mark 1082â€)存储了上图中颗粒1082的主键列的最大值。 - 索引æ¡ç›®(索引标记)ä¸æ˜¯åŸºäºŽè¡¨ä¸­çš„特定行,而是基于颗粒。例如,对于上图中的索引æ¡ç›®â€˜mark 0’,在我们的表中没有UserID为240.923且URL为“goal://metry=10000467796a411…â€çš„行,相å,对于该表,有一个颗粒0,在该颗粒中,最å°UserID值是240.923,最å°URL值是“goal://metry=10000467796a411…â€ï¼Œè¿™ä¸¤ä¸ªå€¼æ¥è‡ªä¸åŒçš„行。 -- 主索引文件完全加载到主内存中。如果文件大于å¯ç”¨çš„空闲内存空间,则ClickHouseå°†å‘生错误。 +- 主索引文件完全加载到主内存中。如果文件大于å¯ç”¨çš„空闲内存空间,则ClickHouseå°†å‘生错误。 ::: @@ -377,11 +383,11 @@ UserID.bin,URL.bin,和EventTime.bin是UserID - UserID index marks:
      主索引中存储的UserID值按å‡åºæŽ’åºã€‚
      上图中的‘mark 1’指示颗粒1中所有表行的UserID值,以åŠéšåŽæ‰€æœ‰é¢—粒中的UserID值,都ä¿è¯å¤§äºŽæˆ–等于4.073.710。 - + [正如我们ç¨åŽå°†çœ‹åˆ°çš„](#query-on-userid-fast), 当查询对主键的第一列进行过滤时,此全局有åºä½¿ClickHouse能够对第一个键列的索引标记使用二分查找算法。 -- URL index marks:
      - 主键列UserIDå’ŒURL有相åŒçš„基数,这æ„味ç€ç¬¬ä¸€åˆ—之åŽçš„所有主键列的索引标记通常åªè¡¨ç¤ºæ¯ä¸ªé¢—ç²’çš„æ•°æ®èŒƒå›´ã€‚
      +- URL index marks:
      + 主键列UserIDå’ŒURL有相åŒçš„基数,这æ„味ç€ç¬¬ä¸€åˆ—之åŽçš„所有主键列的索引标记通常åªè¡¨ç¤ºæ¯ä¸ªé¢—ç²’çš„æ•°æ®èŒƒå›´ã€‚
      例如,‘mark 0’中的URL列所有的值都大于等于goal://metry=10000467796a411..., 然åŽé¢—ç²’1中的URL并ä¸æ˜¯å¦‚此,这是因为‘mark 1‘与‘mark 0‘具有ä¸åŒçš„UserID列值。 ç¨åŽæˆ‘们将更详细地讨论这对查询执行性能的影å“。 @@ -401,7 +407,7 @@ GROUP BY URL ORDER BY Count DESC LIMIT 10; ``` - + 结果: @@ -420,24 +426,24 @@ LIMIT 10; └────────────────────────────────┴───────┘ 10 rows in set. Elapsed: 0.005 sec. -// highlight-next-line -Processed 8.19 thousand rows, +// highlight-next-line +Processed 8.19 thousand rows, 740.18 KB (1.53 million rows/s., 138.59 MB/s.) ``` -ClickHouse客户端的输出显示,没有进行全表扫æ,åªæœ‰8.19万行æµåˆ°ClickHouse。 +ClickHouse客户端的输出显示,没有进行全表扫æ,åªæœ‰8.19åƒè¡Œæµåˆ°ClickHouse。 如果trace logging打开了,那ClickHouseæœåŠ¡ç«¯æ—¥å¿—会显示ClickHouse正在对1083个UserID索引标记执行二分查找以便识别å¯èƒ½åŒ…å«UserID列值为749927693的行的颗粒。这需è¦19个步骤,平å‡æ—¶é—´å¤æ‚度为O(log2 n): ```response ...Executor): Key condition: (column 0 in [749927693, 749927693]) -// highlight-next-line +// highlight-next-line ...Executor): Running binary search on index range for part all_1_9_2 (1083 marks) ...Executor): Found (LEFT) boundary mark: 176 ...Executor): Found (RIGHT) boundary mark: 177 ...Executor): Found continuous range in 19 steps ...Executor): Selected 1/1 parts by partition key, 1 parts by primary key, -// highlight-next-line +// highlight-next-line 1/1083 marks by primary key, 1 marks to read from 1 ranges ...Reading ...approx. 8192 rows starting from 1441792 ``` @@ -446,12 +452,12 @@ ClickHouse客户端的输出显示,没有进行全表扫æ,åªæœ‰8.19万行 我们å¯ä»¥åœ¨ä¸Šé¢çš„跟踪日志中看到,1083个现有标记中有一个满足查询。
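The trace quoted above can also be reproduced interactively. A sketch follows: the filter value is the one from the key condition in the log, and both `send_logs_level` and `EXPLAIN ... indexes = 1` are regular ClickHouse features, the latter summarizing which parts and granules the primary index selects.

```sql
-- Sketch: two ways to observe the mark/granule selection described above.
SET send_logs_level = 'trace';  -- session setting; the ...Executor lines then appear in the client

EXPLAIN indexes = 1
SELECT URL, count(URL) AS Count
FROM hits_UserID_URL
WHERE UserID = 749927693
GROUP BY URL
ORDER BY Count DESC
LIMIT 10;
```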
      - + Trace Log详情 -

      +

      -Mark 176 was identified (the 'found left boundary mark' is inclusive, the 'found right boundary mark' is exclusive), and therefore all 8192 rows from granule 176 (which starts at row 1.441.792 - we will see that later on in this article) are then streamed into ClickHouse in order to find the actual rows with a UserID column value of 749927693. +Mark 176 was identified (the 'found left boundary mark' is inclusive, the 'found right boundary mark' is exclusive), and therefore all 8192 rows from granule 176 (which starts at row 1.441.792 - we will see that later on in this article) are then streamed into ClickHouse in order to find the actual rows with a UserID column value of 749927693.

      @@ -465,7 +471,7 @@ GROUP BY URL ORDER BY Count DESC LIMIT 10; ``` - + 结果如下: ```response @@ -507,23 +513,23 @@ LIMIT 10; 在**第二阶段(æ•°æ®è¯»å–中)**, ClickHouse定ä½æ‰€é€‰çš„颗粒,以便将它们的所有行æµåˆ°ClickHouse引擎中,以便找到实际匹é…查询的行。 -我们将在下一节更详细地讨论第二阶段。 +我们将在下一节更详细地讨论第二阶段。 ## 标记文件用æ¥å®šä½é¢—ç²’ -下图æ述了上表主索引文件的一部分。 +下图æ述了上表主索引文件的一部分。 - + 如上所述,通过对索引的1083个UserID标记进行二分æœç´¢ï¼Œç¡®å®šäº†ç¬¬176个标记。因此,它对应的颗粒176å¯èƒ½åŒ…å«UserID列值为749.927.693的行。
      - + 颗粒选择的具体过程 -

      +

      上图显示,标记176是第一个UserID值å°äºŽ749.927.693的索引æ¡ç›®ï¼Œå¹¶ä¸”下一个标记(标记177)的颗粒177的最å°UserID值大于该值的索引æ¡ç›®ã€‚因此,åªæœ‰æ ‡è®°176对应的颗粒176å¯èƒ½åŒ…å«UserID列值为749.927.693的行。

      @@ -537,7 +543,7 @@ LIMIT 10; 下图显示了三个标记文件UserID.mrkã€URL.mrkã€EventTime.mrk,为表的UserIDã€URLå’ŒEventTime列存储颗粒的物ç†ä½ç½®ã€‚ - + 我们已ç»è®¨è®ºäº†ä¸»ç´¢å¼•æ˜¯ä¸€ä¸ªæ‰å¹³çš„未压缩数组文件(primary.idx),其中包å«ä»Ž0开始编å·çš„索引标记。 @@ -545,9 +551,9 @@ LIMIT 10; 一旦ClickHouse确定并选择了å¯èƒ½åŒ…å«æŸ¥è¯¢æ‰€éœ€çš„匹é…行的颗粒的索引标记,就å¯ä»¥åœ¨æ ‡è®°æ–‡ä»¶æ•°ç»„中查找,以获得颗粒的物ç†ä½ç½®ã€‚ -æ¯ä¸ªç‰¹å®šåˆ—的标记文件æ¡ç›®ä»¥å移é‡çš„å½¢å¼å­˜å‚¨ä¸¤ä¸ªä½ç½®: +æ¯ä¸ªç‰¹å®šåˆ—的标记文件æ¡ç›®ä»¥å移é‡çš„å½¢å¼å­˜å‚¨ä¸¤ä¸ªä½ç½®: -- 第一个å移é‡(上图中的'block_offset')是在包å«æ‰€é€‰é¢—粒的压缩版本的压缩列数æ®æ–‡ä»¶ä¸­å®šä½å—。这个压缩å—å¯èƒ½åŒ…å«å‡ ä¸ªåŽ‹ç¼©çš„颗粒。所定ä½çš„压缩文件å—在读å–时被解压到内存中。 +- 第一个å移é‡(上图中的'block_offset')是在包å«æ‰€é€‰é¢—粒的压缩版本的压缩列数æ®æ–‡ä»¶ä¸­å®šä½å—。这个压缩å—å¯èƒ½åŒ…å«å‡ ä¸ªåŽ‹ç¼©çš„颗粒。所定ä½çš„压缩文件å—在读å–时被解压到内存中。 - 标记文件的第二个å移é‡(上图中的“granule_offsetâ€)æ供了颗粒在解压数æ®å—中的ä½ç½®ã€‚ @@ -576,7 +582,7 @@ LIMIT 10; 下é¢çš„图表和文本说明了我们的查询示例,ClickHouse如何在UserID.binæ•°æ®æ–‡ä»¶ä¸­å®šä½176颗粒。 - + 我们在本文å‰é¢è®¨è®ºè¿‡ï¼ŒClickHouse选择了主索引标记176,因此176颗粒å¯èƒ½åŒ…å«æŸ¥è¯¢æ‰€éœ€çš„匹é…行。 @@ -624,7 +630,7 @@ LIMIT 10; ``` 结果是: -```response +```response ┌─────UserID─┬─Count─┠│ 2459550954 │ 3741 │ │ 1084649151 │ 2484 │ @@ -639,26 +645,26 @@ LIMIT 10; └────────────┴───────┘ 10 rows in set. Elapsed: 0.086 sec. -// highlight-next-line -Processed 8.81 million rows, +// highlight-next-line +Processed 8.81 million rows, 799.69 MB (102.11 million rows/s., 9.27 GB/s.) -``` +``` 客户端输出表明,尽管URL列是è”åˆä¸»é”®çš„一部分,ClickHouse几乎执行了一一次全表扫æï¼ClickHouse从表的887万行中读å–881万行。 如果å¯ç”¨äº†trace日志,那么ClickHouseæœåŠ¡æ—¥å¿—文件显示,ClickHouse在1083个URL索引标记上使用了通用的排除æœç´¢ï¼Œä»¥ä¾¿è¯†åˆ«é‚£äº›å¯èƒ½åŒ…å«URL列值为"http://public_search"的行。 -```response -...Executor): Key condition: (column 1 in ['http://public_search', +```response +...Executor): Key condition: (column 1 in ['http://public_search', 'http://public_search']) -// highlight-next-line -...Executor): Used generic exclusion search over index for part all_1_9_2 +// highlight-next-line +...Executor): Used generic exclusion search over index for part all_1_9_2 with 1537 steps ...Executor): Selected 1/1 parts by partition key, 1 parts by primary key, -// highlight-next-line +// highlight-next-line 1076/1083 marks by primary key, 1076 marks to read from 5 ranges ...Executor): Reading approx. 8814592 rows with 10 streams -``` -我们å¯ä»¥åœ¨ä¸Šé¢çš„跟踪日志示例中看到,1083个颗粒中有1076个(通过标记)被选中,因为å¯èƒ½åŒ…å«å…·æœ‰åŒ¹é…URL值的行。 +``` +我们å¯ä»¥åœ¨ä¸Šé¢çš„跟踪日志示例中看到,1083个颗粒中有1076个(通过标记)被选中,因为å¯èƒ½åŒ…å«å…·æœ‰åŒ¹é…URL值的行。 这将导致881万行被读å–到ClickHouse引擎中(通过使用10个æµå¹¶è¡Œåœ°è¯»å–),以便识别实际包å«URL值"http://public_search"的行。 @@ -671,15 +677,15 @@ Processed 8.81 million rows, 为了说明,我们给出通用的排除æœç´¢ç®—法的工作原ç†ï¼š
      - - 通用排除æœç´¢ç®—法 + + 通用排除æœç´¢ç®—法 -

      +

      -下é¢å°†æ¼”示当通过第一个列之åŽçš„任何列选择颗粒时,当å‰ä¸€ä¸ªé”®åˆ—具有或高或低的基数时,ClickHouse通用排除æœç´¢ç®—法 是如何工作的。 +下é¢å°†æ¼”示当通过第一个列之åŽçš„任何列选择颗粒时,当å‰ä¸€ä¸ªé”®åˆ—具有或高或低的基数时,ClickHouse通用排除æœç´¢ç®—法 是如何工作的。 作为这两ç§æƒ…况的例å­ï¼Œæˆ‘们将å‡è®¾ï¼š - æœç´¢URL值为"W3"的行。 @@ -693,7 +699,7 @@ Processed 8.81 million rows, å‡è®¾UserID具有较低的基数。在这ç§æƒ…况下,相åŒçš„UserID值很å¯èƒ½åˆ†å¸ƒåœ¨å¤šä¸ªè¡¨è¡Œå’Œé¢—粒上,从而分布在索引标记上。对于具有相åŒUserID的索引标记,索引标记的URL值按å‡åºæŽ’åº(因为表行首先按UserID排åºï¼Œç„¶åŽæŒ‰URL排åº)。这使得有效的过滤如下所述: - + 在上图中,我们的抽象样本数æ®çš„颗粒选择过程有三ç§ä¸åŒçš„场景: @@ -704,13 +710,13 @@ Processed 8.81 million rows, 3. å¯ä»¥æŽ’除URL值大于W3的索引标记2å’Œ3,因为主索引的索引标记存储了æ¯ä¸ªé¢—粒的最å°é”®åˆ—值,因此颗粒2å’Œ3ä¸å¯èƒ½åŒ…å«URL值W3。 - + **å‰ç¼€ä¸»é”®é«˜åŸºæ•°** 当UserID具有较高的基数时,相åŒçš„UserID值ä¸å¤ªå¯èƒ½åˆ†å¸ƒåœ¨å¤šä¸ªè¡¨è¡Œå’Œé¢—粒上。这æ„味ç€ç´¢å¼•æ ‡è®°çš„URL值ä¸æ˜¯å•è°ƒé€’增的: - + 正如在上é¢çš„图表中所看到的,所有URL值å°äºŽW3的标记都被选中,以便将其关è”的颗粒的行加载到ClickHouse引擎中。 @@ -736,16 +742,16 @@ Processed 8.81 million rows, 在我们的示例数æ®é›†ä¸­ï¼Œä¸¤ä¸ªé”®åˆ—(UserIDã€URL)都具有类似的高基数,并且,如å‰æ‰€è¿°ï¼Œå½“URL列的å‰ä¸€ä¸ªé”®åˆ—具有较高基数时,通用排除æœç´¢ç®—法ä¸æ˜¯å¾ˆæœ‰æ•ˆã€‚ :::note 看下跳数索引 -因为UserIDå’ŒURL具有较高的基数,[æ ¹æ®URL过滤数æ®](#query-on-url)ä¸æ˜¯ç‰¹åˆ«æœ‰æ•ˆï¼Œå¯¹URL列创建[二级跳数索引](./skipping-indexes.md)åŒæ ·ä¹Ÿä¸ä¼šæœ‰å¤ªå¤šæ”¹å–„。 +因为UserIDå’ŒURL具有较高的基数,[æ ¹æ®URL过滤数æ®](#query-on-url)ä¸æ˜¯ç‰¹åˆ«æœ‰æ•ˆï¼Œå¯¹URL列创建[二级跳数索引](./skipping-indexes.md)åŒæ ·ä¹Ÿä¸ä¼šæœ‰å¤ªå¤šæ”¹å–„。 -例如,这两个语å¥åœ¨æˆ‘们的表的URL列上创建并填充一个minmax跳数索引。 +例如,这两个语å¥åœ¨æˆ‘们的表的URL列上创建并填充一个minmax跳数索引。 ```sql ALTER TABLE hits_UserID_URL ADD INDEX url_skipping_index URL TYPE minmax GRANULARITY 4; ALTER TABLE hits_UserID_URL MATERIALIZE INDEX url_skipping_index; ``` ClickHouse现在创建了一个é¢å¤–的索引æ¥å­˜å‚¨â€”æ¯ç»„4个连续的颗粒(注æ„上é¢ALTER TABLE语å¥ä¸­çš„GRANULARITY 4å­å¥)—最å°å’Œæœ€å¤§çš„URL值: - + 第一个索引æ¡ç›®(上图中的mark 0)存储属于表的å‰4个颗粒的行的最å°å’Œæœ€å¤§URL值。 @@ -786,15 +792,15 @@ ClickHouse现在创建了一个é¢å¤–的索引æ¥å­˜å‚¨â€”æ¯ç»„4个连续的颗 当创建有ä¸åŒä¸»é”®çš„第二个表时,查询必须显å¼åœ°å‘é€ç»™æœ€é€‚åˆæŸ¥è¯¢çš„表版本,并且必须显å¼åœ°æ’入新数æ®åˆ°ä¸¤ä¸ªè¡¨ä¸­ï¼Œä»¥ä¿æŒè¡¨çš„åŒæ­¥ï¼š - + 在物化视图中,é¢å¤–的表被éšè—,数æ®è‡ªåŠ¨åœ¨ä¸¤ä¸ªè¡¨ä¹‹é—´ä¿æŒåŒæ­¥ï¼š - + projectionæ–¹å¼æ˜¯æœ€é€æ˜Žçš„选项,因为除了自动ä¿æŒéšè—的附加表与数æ®å˜åŒ–åŒæ­¥å¤–,ClickHouse还会自动选择最有效的表版本进行查询: - + 下é¢æˆ‘们使用真实的例å­è¯¦ç»†è®¨è®ºä¸‹è¿™ä¸‰ç§æ–¹å¼ã€‚ @@ -813,7 +819,7 @@ CREATE TABLE hits_URL_UserID `EventTime` DateTime ) ENGINE = MergeTree -// highlight-next-line +// highlight-next-line PRIMARY KEY (URL, UserID) ORDER BY (URL, UserID, EventTime) SETTINGS index_granularity = 8192, index_granularity_bytes = 0; @@ -822,10 +828,10 @@ SETTINGS index_granularity = 8192, index_granularity_bytes = 0; 写入887万行æºè¡¨æ•°æ®ï¼š ```sql -INSERT INTO hits_URL_UserID +INSERT INTO hits_URL_UserID SELECT * from hits_UserID_URL; ``` - + 结果: ```response @@ -841,10 +847,10 @@ OPTIMIZE TABLE hits_URL_UserID FINAL; 因为我们切æ¢äº†ä¸»é”®ä¸­åˆ—的顺åºï¼Œæ’入的行现在以ä¸åŒçš„字典顺åºå­˜å‚¨åœ¨ç£ç›˜ä¸Š(与我们的原始表相比),因此该表的1083个颗粒也包å«äº†ä¸Žä»¥å‰ä¸åŒçš„值: - + 主键索引如下: - + 现在计算最频ç¹ç‚¹å‡»URL"http://public_search"çš„å‰10å用户,这时候的查询速度是明显加快的: ```sql @@ -856,7 +862,7 @@ GROUP BY UserID ORDER BY Count DESC LIMIT 10; ``` - + 结果: @@ -875,8 +881,8 @@ LIMIT 10; └────────────┴───────┘ 10 rows in set. Elapsed: 0.017 sec. -// highlight-next-line -Processed 319.49 thousand rows, +// highlight-next-line +Processed 319.49 thousand rows, 11.38 MB (18.41 million rows/s., 655.75 MB/s.) 
``` @@ -887,15 +893,15 @@ Processed 319.49 thousand rows, å°†URL作为主索引的第一列,ClickHouse现在对索引标记è¿è¡ŒäºŒåˆ†æœç´¢ã€‚ClickHouseæœåŠ¡å™¨æ—¥å¿—文件中对应的跟踪日志: ```response -...Executor): Key condition: (column 0 in ['http://public_search', +...Executor): Key condition: (column 0 in ['http://public_search', 'http://public_search']) -// highlight-next-line +// highlight-next-line ...Executor): Running binary search on index range for part all_1_9_2 (1083 marks) ...Executor): Found (LEFT) boundary mark: 644 ...Executor): Found (RIGHT) boundary mark: 683 ...Executor): Found continuous range in 19 steps ...Executor): Selected 1/1 parts by partition key, 1 parts by primary key, -// highlight-next-line +// highlight-next-line 39/1083 marks by primary key, 39 marks to read from 1 ranges ...Executor): Reading approx. 319488 rows with 2 streams ``` @@ -907,10 +913,10 @@ ClickHouseåªé€‰æ‹©äº†39个索引标记,而ä¸æ˜¯ä½¿ç”¨é€šç”¨æŽ’除æœç´¢æ—¶ 点击下é¢äº†è§£è¯¦æƒ…:

      - + 对UserID的查询过滤性能较差 -

      +

      ```sql SELECT URL, count(URL) AS Count @@ -920,7 +926,7 @@ GROUP BY URL ORDER BY Count DESC LIMIT 10; ``` - + 结果 ```response @@ -938,8 +944,8 @@ LIMIT 10; └────────────────────────────────┴───────┘ 10 rows in set. Elapsed: 0.024 sec. -// highlight-next-line -Processed 8.02 million rows, +// highlight-next-line +Processed 8.02 million rows, 73.04 MB (340.26 million rows/s., 3.10 GB/s.) ``` @@ -947,10 +953,10 @@ Processed 8.02 million rows, ```response ...Executor): Key condition: (column 1 in [749927693, 749927693]) // highlight-next-line -...Executor): Used generic exclusion search over index for part all_1_9_2 +...Executor): Used generic exclusion search over index for part all_1_9_2 with 1453 steps ...Executor): Selected 1/1 parts by partition key, 1 parts by primary key, -// highlight-next-line +// highlight-next-line 980/1083 marks by primary key, 980 marks to read from 23 ranges ...Executor): Reading approx. 8028160 rows with 10 streams ``` @@ -960,7 +966,7 @@ Processed 8.02 million rows, 现在我们有了两张表。优化了对UserIDå’ŒURL的查询过滤,分别: - + @@ -981,7 +987,7 @@ ORDER BY (URL, UserID, EventTime) POPULATE AS SELECT * FROM hits_UserID_URL; ``` - + 结果: ```response @@ -993,20 +999,20 @@ Ok. :::note - 我们在视图的主键中切æ¢é”®åˆ—的顺åº(与原始表相比) - 物化视图由一个éšè—表支æŒï¼Œè¯¥è¡¨çš„行顺åºå’Œä¸»ç´¢å¼•åŸºäºŽç»™å®šçš„主键定义 -- 我们使用POPULATE关键字,以便用æºè¡¨hits_UserID_URL中的所有887万行立å³å¯¼å…¥æ–°çš„物化视图 +- 我们使用POPULATE关键字,以便用æºè¡¨hits_UserID_URL中的所有887万行立å³å¯¼å…¥æ–°çš„物化视图 - 如果在æºè¡¨hits_UserID_URL中æ’入了新行,那么这些行也会自动æ’入到éšè—表中 - 实际上,éšå¼åˆ›å»ºçš„éšè—表的行顺åºå’Œä¸»ç´¢å¼•ä¸Žæˆ‘们上é¢æ˜¾å¼åˆ›å»ºçš„辅助表相åŒ: - + ClickHouseå°†éšè—表的列数æ®æ–‡ä»¶(.bin)ã€æ ‡è®°æ–‡ä»¶(.mrk2)和主索引(primary.idx)存储在ClickHouseæœåŠ¡å™¨çš„æ•°æ®ç›®å½•çš„一个特殊文件夹中: - + ::: @@ -1021,7 +1027,7 @@ GROUP BY UserID ORDER BY Count DESC LIMIT 10; ``` - + 结果: ```response @@ -1039,8 +1045,8 @@ LIMIT 10; └────────────┴───────┘ 10 rows in set. Elapsed: 0.026 sec. -// highlight-next-line -Processed 335.87 thousand rows, +// highlight-next-line +Processed 335.87 thousand rows, 13.54 MB (12.91 million rows/s., 520.38 MB/s.) ``` @@ -1049,13 +1055,13 @@ Processed 335.87 thousand rows, ClickHouseæœåŠ¡å™¨æ—¥å¿—文件中相应的跟踪日志确认了ClickHouse正在对索引标记è¿è¡ŒäºŒåˆ†æœç´¢ï¼š ```response -...Executor): Key condition: (column 0 in ['http://public_search', +...Executor): Key condition: (column 0 in ['http://public_search', 'http://public_search']) // highlight-next-line ...Executor): Running binary search on index range ... ... ...Executor): Selected 4/4 parts by partition key, 4 parts by primary key, -// highlight-next-line +// highlight-next-line 41/1083 marks by primary key, 41 marks to read from 4 ranges ...Executor): Reading approx. 335872 rows with 4 streams ``` @@ -1095,11 +1101,11 @@ ALTER TABLE hits_UserID_URL - 查询总是(从语法上)针对æºè¡¨hits_UserID_URL,但是如果éšè—表的行顺åºå’Œä¸»ç´¢å¼•å…许更有效地执行查询,那么将使用该éšè—表 - 实际上,éšå¼åˆ›å»ºçš„éšè—表的行顺åºå’Œä¸»ç´¢å¼•ä¸Žæˆ‘们显å¼åˆ›å»ºçš„辅助表相åŒï¼š - + ClickHouseå°†éšè—表的列数æ®æ–‡ä»¶(.bin)ã€æ ‡è®°æ–‡ä»¶(.mrk2)和主索引(primary.idx)存储在一个特殊的文件夹中(在下é¢çš„截图中用橙色标记),紧挨ç€æºè¡¨çš„æ•°æ®æ–‡ä»¶ã€æ ‡è®°æ–‡ä»¶å’Œä¸»ç´¢å¼•æ–‡ä»¶ï¼š - + ::: 由投影创建的éšè—表(以åŠå®ƒçš„主索引)现在å¯ä»¥(éšå¼åœ°)用于显著加快URL列上查询过滤的执行。注æ„,查询在语法上针对投影的æºè¡¨ã€‚ @@ -1113,7 +1119,7 @@ GROUP BY UserID ORDER BY Count DESC LIMIT 10; ``` - + 结果: ```response @@ -1130,8 +1136,8 @@ LIMIT 10; │ 765730816 │ 536 │ └────────────┴───────┘ -10 rows in set. Elapsed: 0.029 sec. -// highlight-next-line +10 rows in set. Elapsed: 0.029 sec. +// highlight-next-line Processed 319.49 thousand rows, 1 1.38 MB (11.05 million rows/s., 393.58 MB/s.) 
``` @@ -1142,16 +1148,16 @@ ClickHouseæœåŠ¡å™¨æ—¥å¿—文件中跟踪日志确认了ClickHouse正在对索引 ```response -...Executor): Key condition: (column 0 in ['http://public_search', +...Executor): Key condition: (column 0 in ['http://public_search', 'http://public_search']) -// highlight-next-line +// highlight-next-line ...Executor): Running binary search on index range for part prj_url_userid (1083 marks) ...Executor): ... // highlight-next-line ...Executor): Choose complete Normal projection prj_url_userid ...Executor): projection required columns: URL, UserID ...Executor): Selected 1/1 parts by partition key, 1 parts by primary key, -// highlight-next-line +// highlight-next-line 39/1083 marks by primary key, 39 marks to read from 1 ranges ...Executor): Reading approx. 319488 rows with 2 streams ``` diff --git a/docs/zh/interfaces/http.md b/docs/zh/interfaces/http.md index e0c12193a6a..c7a0f355a92 100644 --- a/docs/zh/interfaces/http.md +++ b/docs/zh/interfaces/http.md @@ -96,7 +96,7 @@ ECT 1 , expected One of: SHOW TABLES, SHOW DATABASES, SELECT, INSERT, CREATE, ATTACH, RENAME, DROP, DETACH, USE, SET, OPTIMIZE., e.what() = DB::Exception ``` -默认情况下,返回的数æ®æ˜¯`TabSeparated`æ ¼å¼çš„,更多信æ¯ï¼Œè§[Formats](../interfaces/formats/)部分。 +默认情况下,返回的数æ®æ˜¯`TabSeparated`æ ¼å¼çš„,更多信æ¯ï¼Œè§[Formats](../interfaces/formats.md)部分。 您å¯ä»¥ä½¿ç”¨æŸ¥è¯¢çš„FORMATå­å¥æ¥è®¾ç½®å…¶ä»–æ ¼å¼ã€‚ diff --git a/docs/zh/operations/optimizing-performance/sampling-query-profiler.md b/docs/zh/operations/optimizing-performance/sampling-query-profiler.md index 4206274ec0d..5d31ab9b245 100644 --- a/docs/zh/operations/optimizing-performance/sampling-query-profiler.md +++ b/docs/zh/operations/optimizing-performance/sampling-query-profiler.md @@ -32,7 +32,7 @@ ClickHouseè¿è¡Œå…许分æžæŸ¥è¯¢æ‰§è¡Œçš„采样探查器。 使用探查器, - 使用 `addressToLine`, `addressToSymbol` å’Œ `demangle` [内çœåŠŸèƒ½](../../sql-reference/functions/introspection.md) 获å–函数å称åŠå…¶åœ¨ClickHouse代ç ä¸­çš„ä½ç½®ã€‚ è¦èŽ·å–æŸäº›æŸ¥è¯¢çš„é…置文件,您需è¦ä»Žä»¥ä¸‹å†…å®¹æ±‡æ€»æ•°æ® `trace_log` æ¡Œå­ æ‚¨å¯ä»¥é€šè¿‡å•ä¸ªå‡½æ•°æˆ–整个堆栈跟踪èšåˆæ•°æ®ã€‚ -如果你需è¦æƒ³è±¡ `trace_log` ä¿¡æ¯ï¼Œå°è¯• [flamegraph](../../interfaces/third-party/gui/#clickhouse-flamegraph) å’Œ [测速镜](https://github.com/laplab/clickhouse-speedscope). +如果你需è¦æƒ³è±¡ `trace_log` ä¿¡æ¯ï¼Œå°è¯• [flamegraph](../../interfaces/third-party/gui.md#clickhouse-flamegraph) å’Œ [测速镜](https://github.com/laplab/clickhouse-speedscope). 
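Ahead of the example section that follows, a minimal aggregation over `system.trace_log` with the introspection functions mentioned above might look like the sketch below. The query ID is a placeholder, and `allow_introspection_functions` must be enabled for `addressToSymbol`/`demangle` to work.

```sql
-- Sketch: top stack traces by sample count for one profiled query.
SET allow_introspection_functions = 1;

SELECT
    count() AS samples,
    arrayStringConcat(
        arrayMap(x -> demangle(addressToSymbol(x)), trace),
        '\n') AS stack
FROM system.trace_log
WHERE query_id = '<query_id>'   -- placeholder: id of the query that was profiled
GROUP BY stack
ORDER BY samples DESC
LIMIT 10;
```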
## 示例 {#example} diff --git a/docs/zh/operations/settings/settings-users.md b/docs/zh/operations/settings/settings-users.md index 3fb97bbddb2..d7fe5bad3c3 100644 --- a/docs/zh/operations/settings/settings-users.md +++ b/docs/zh/operations/settings/settings-users.md @@ -11,7 +11,7 @@ sidebar_label: "\u7528\u6237\u8BBE\u7F6E" `user.xml` 中的 `users` é…置段包å«äº†ç”¨æˆ·é…ç½® :::note -ClickHouseè¿˜æ”¯æŒ [SQL驱动的工作æµ](../access-rights.md#access-control) 用于管ç†ç”¨æˆ·ã€‚ 我们建议使用它。 +ClickHouseè¿˜æ”¯æŒ [SQL驱动的工作æµ](/docs/en/operations/access-rights#access-control) 用于管ç†ç”¨æˆ·ã€‚ 我们建议使用它。 ::: `users` é…置段的结构: @@ -79,7 +79,7 @@ ClickHouseè¿˜æ”¯æŒ [SQL驱动的工作æµ](../access-rights.md#access-control) ### access_management {#access_management-user-setting} -此设置å¯ä¸ºç”¨æˆ·å¯ç”¨æˆ–ç¦ç”¨ SQL-driven [访问控制和å¸æˆ·ç®¡ç†](../access-rights.md#access-control) 。 +此设置å¯ä¸ºç”¨æˆ·å¯ç”¨æˆ–ç¦ç”¨ SQL-driven [访问控制和å¸æˆ·ç®¡ç†](/docs/en/operations/access-rights#access-control) 。 å¯èƒ½çš„值: diff --git a/docs/zh/operations/system-tables/data_type_families.md b/docs/zh/operations/system-tables/data_type_families.md index 18e9455476d..f0e3a9ef896 100644 --- a/docs/zh/operations/system-tables/data_type_families.md +++ b/docs/zh/operations/system-tables/data_type_families.md @@ -3,7 +3,7 @@ slug: /zh/operations/system-tables/data_type_families --- # system.data_type_families {#system_tables-data_type_families} -包å«æœ‰å…³å—支æŒçš„[æ•°æ®ç±»åž‹](../../sql-reference/data-types/)çš„ä¿¡æ¯. +包å«æœ‰å…³å—支æŒçš„[æ•°æ®ç±»åž‹](../../sql-reference/data-types/index.md)çš„ä¿¡æ¯. 列字段包括: diff --git a/docs/zh/operations/system-tables/replicated_fetches.md b/docs/zh/operations/system-tables/replicated_fetches.md index 7fd517c72ab..c6c37759755 100644 --- a/docs/zh/operations/system-tables/replicated_fetches.md +++ b/docs/zh/operations/system-tables/replicated_fetches.md @@ -68,4 +68,4 @@ thread_id: 54 **å¦è¯·å‚阅** -- [ç®¡ç† ReplicatedMergeTree 表](../../sql-reference/statements/system/#query-language-system-replicated) +- [ç®¡ç† ReplicatedMergeTree 表](../../sql-reference/statements/system.md#query-language-system-replicated) diff --git a/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md b/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md index 8431b5a1110..f0672d4fe45 100644 --- a/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md +++ b/docs/zh/sql-reference/aggregate-functions/reference/grouparrayinsertat.md @@ -20,7 +20,7 @@ groupArrayInsertAt(default_x, size)(x, pos); **å‚æ•°** -- `x` — è¦æ’入的值。生æˆæ‰€[支æŒçš„æ•°æ®ç±»åž‹](../../../sql-reference/data-types/index.md)(æ•°æ®)çš„[表达å¼](../../../sql-reference/syntax#syntax-expressions)。 +- `x` — è¦æ’入的值。生æˆæ‰€[支æŒçš„æ•°æ®ç±»åž‹](../../../sql-reference/data-types/index.md)(æ•°æ®)çš„[表达å¼](../../../sql-reference/syntax.md#syntax-expressions)。 - `pos` — 指定元素 `x` 将被æ’入的ä½ç½®ã€‚ 数组中的索引编å·ä»Žé›¶å¼€å§‹ã€‚ [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges). 
- `default_x` — 在空ä½ç½®æ›¿æ¢çš„默认值。å¯é€‰å‚æ•°ã€‚ç”Ÿæˆ `x` æ•°æ®ç±»åž‹ (æ•°æ®) çš„[表达å¼](../../../sql-reference/syntax.md#syntax-expressions)。 如果 `default_x` 未定义,则 [默认值](../../../sql-reference/statements/create.md#create-default-values) 被使用。 - `size`— 结果数组的长度。å¯é€‰å‚数。如果使用该å‚数,必须指定默认值 `default_x` 。 [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges)。 diff --git a/docs/zh/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.mdx b/docs/zh/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md similarity index 59% rename from docs/zh/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.mdx rename to docs/zh/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md index 0c924feda73..fe70d29f8da 100644 --- a/docs/zh/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.mdx +++ b/docs/zh/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md @@ -5,6 +5,4 @@ sidebar_label: Polygon Dictionaries With Grids title: "Polygon dictionaries" --- -import Content from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md'; - - +View the [english Dictionaries doc page for details](../../../../en/sql-reference/dictionaries/index.md). diff --git a/docs/zh/sql-reference/functions/geo/index.mdx b/docs/zh/sql-reference/functions/geo/index.mdx deleted file mode 100644 index fcfc4bd4717..00000000000 --- a/docs/zh/sql-reference/functions/geo/index.mdx +++ /dev/null @@ -1,10 +0,0 @@ ---- -slug: /zh/sql-reference/functions/geo/ -sidebar_label: Geo -sidebar_position: 62 -title: "Geo Functions" ---- - -import Content from '@site/docs/en/sql-reference/functions/geo/index.md'; - - diff --git a/docs/zh/sql-reference/statements/alter/index.md b/docs/zh/sql-reference/statements/alter/index.md index 8320b207725..e173837a16c 100644 --- a/docs/zh/sql-reference/statements/alter/index.md +++ b/docs/zh/sql-reference/statements/alter/index.md @@ -1,5 +1,5 @@ --- -slug: /zh/sql-reference/statements/alter/ +slug: /zh/sql-reference/statements/alter/overview sidebar_position: 35 sidebar_label: ALTER --- diff --git a/docs/zh/sql-reference/statements/create/database.md b/docs/zh/sql-reference/statements/create/database.md index 2c6e53c0f06..3e5b71fb196 100644 --- a/docs/zh/sql-reference/statements/create/database.md +++ b/docs/zh/sql-reference/statements/create/database.md @@ -27,4 +27,4 @@ ClickHouse在指定集群的所有æœåŠ¡å™¨ä¸Šåˆ›å»º`db_name`æ•°æ®åº“。 更多 ### ENGINE {#engine} -[MySQL](../../../engines/database-engines/mysql.md) å…许您从远程MySQLæœåŠ¡å™¨æ£€ç´¢æ•°æ®. 默认情况下,ClickHouse使用自己的[database engine](../../../engines/database-engines/index.md). 还有一个[lazy](../../../engines/database-engines/lazy)引擎. +[MySQL](../../../engines/database-engines/mysql.md) å…许您从远程MySQLæœåŠ¡å™¨æ£€ç´¢æ•°æ®. 默认情况下,ClickHouse使用自己的[database engine](../../../engines/database-engines/index.md). 还有一个[lazy](../../../engines/database-engines/lazy.md)引擎. 
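For the `ENGINE` choice discussed above, here is a sketch of the `MySQL` database engine variant; all connection parameters and the table name are placeholders.

```sql
-- Sketch: a database that proxies a remote MySQL server instead of storing data itself.
CREATE DATABASE mysql_db
ENGINE = MySQL('mysql-host:3306', 'source_database', 'mysql_user', 'mysql_password');

-- Tables of source_database become queryable without copying the data:
SHOW TABLES FROM mysql_db;
SELECT * FROM mysql_db.some_table LIMIT 10;  -- some_table: placeholder table on the MySQL side
```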
diff --git a/docs/zh/sql-reference/statements/create/index.md b/docs/zh/sql-reference/statements/create/index.md deleted file mode 100644 index f63ed0a7acd..00000000000 --- a/docs/zh/sql-reference/statements/create/index.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -slug: /zh/sql-reference/statements/create/ -sidebar_label: CREATE -sidebar_position: 34 ---- - -# CREATE语法 {#create-queries} - -CREATE语法包å«ä»¥ä¸‹å­é›†: - -- [DATABASE](../../../sql-reference/statements/create/database.md) diff --git a/docs/zh/sql-reference/statements/create/view.md b/docs/zh/sql-reference/statements/create/view.md index a000c69f1ef..8ce2d20a10c 100644 --- a/docs/zh/sql-reference/statements/create/view.md +++ b/docs/zh/sql-reference/statements/create/view.md @@ -63,7 +63,7 @@ ClickHouse 中的物化视图更åƒæ˜¯æ’入触å‘器。 如果视图查询中 视图看起æ¥ä¸Žæ™®é€šè¡¨ç›¸åŒã€‚ 例如,它们列在`SHOW TABLES`查询的结果中。 -删除视图,使用[DROP VIEW](../../../sql-reference/statements/drop#drop-view). `DROP TABLE`也适用于视图。 +删除视图,使用[DROP VIEW](../../../sql-reference/statements/drop.md#drop-view). `DROP TABLE`也适用于视图。 ## Live View (实验性) {#live-view} diff --git a/docs/zh/sql-reference/statements/index.md b/docs/zh/sql-reference/statements/index.md index cf51dadc8f1..2fdfeb1786f 100644 --- a/docs/zh/sql-reference/statements/index.md +++ b/docs/zh/sql-reference/statements/index.md @@ -10,7 +10,7 @@ sidebar_position: 31 - [SELECT](../../sql-reference/statements/select/index.md) - [INSERT INTO](../../sql-reference/statements/insert-into.md) -- [CREATE](../../sql-reference/statements/create/index.md) +- [CREATE](../../sql-reference/statements/create.md) - [ALTER](../../sql-reference/statements/alter/index.md) - [SYSTEM](../../sql-reference/statements/system.md) - [SHOW](../../sql-reference/statements/show.md) @@ -20,7 +20,7 @@ sidebar_position: 31 - [CHECK TABLE](../../sql-reference/statements/check-table.mdx) - [DESCRIBE TABLE](../../sql-reference/statements/describe-table.mdx) - [DETACH](../../sql-reference/statements/detach.mdx) -- [DROP](../../sql-reference/statements/drop) +- [DROP](../../sql-reference/statements/drop.md) - [EXISTS](../../sql-reference/statements/exists.md) - [KILL](../../sql-reference/statements/kill.mdx) - [OPTIMIZE](../../sql-reference/statements/optimize.mdx) diff --git a/docs/zh/sql-reference/statements/select/array-join.md b/docs/zh/sql-reference/statements/select/array-join.md index b0352a7bb0a..4162a39f399 100644 --- a/docs/zh/sql-reference/statements/select/array-join.md +++ b/docs/zh/sql-reference/statements/select/array-join.md @@ -146,7 +146,7 @@ ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS ma └───────┴─────────┴───┴─────┴────────┘ ``` -下é¢çš„例å­ä½¿ç”¨ [arrayEnumerate](../../../sql-reference/functions/array-functions#array_functions-arrayenumerate) 功能: +下é¢çš„例å­ä½¿ç”¨ [arrayEnumerate](../../../sql-reference/functions/array-functions.md#array_functions-arrayenumerate) 功能: ``` sql SELECT s, arr, a, num, arrayEnumerate(arr) @@ -259,7 +259,7 @@ ARRAY JOIN nest AS n; └───────┴─────┴─────┴─────────┴────────────┘ ``` -使用功能 [arrayEnumerate](../../../sql-reference/functions/array-functions#array_functions-arrayenumerate) 的例å­: +使用功能 [arrayEnumerate](../../../sql-reference/functions/array-functions.md#array_functions-arrayenumerate) 的例å­: ``` sql SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y`, num diff --git a/docs/zh/sql-reference/statements/select/group-by.md b/docs/zh/sql-reference/statements/select/group-by.md index 29c72ce7e45..86511470538 100644 --- a/docs/zh/sql-reference/statements/select/group-by.md +++ 
b/docs/zh/sql-reference/statements/select/group-by.md @@ -8,7 +8,7 @@ sidebar_label: GROUP BY `GROUP BY` å­å¥å°† `SELECT` 查询结果转æ¢ä¸ºèšåˆæ¨¡å¼ï¼Œå…¶å·¥ä½œåŽŸç†å¦‚下: - `GROUP BY` å­å¥åŒ…å«è¡¨è¾¾å¼åˆ—表(或å•ä¸ªè¡¨è¾¾å¼ -- å¯ä»¥è®¤ä¸ºæ˜¯é•¿åº¦ä¸º1的列表)。 这份åå•å……当 “grouping keyâ€ï¼Œè€Œæ¯ä¸ªå•ç‹¬çš„表达å¼å°†è¢«ç§°ä¸º “key expressionsâ€. -- 在所有的表达å¼åœ¨ [SELECT](../../../sql-reference/statements/select/index.md), [HAVING](../../../sql-reference/statements/select/having),和 [ORDER BY](../../../sql-reference/statements/select/order-by.md) å­å¥ä¸­ **å¿…é¡»** 基于键表达å¼è¿›è¡Œè®¡ç®— **或** 上 [èšåˆå‡½æ•°](../../../sql-reference/aggregate-functions/index.md) 在éžé”®è¡¨è¾¾å¼ï¼ˆåŒ…括纯列)上。 æ¢å¥è¯è¯´ï¼Œä»Žè¡¨ä¸­é€‰æ‹©çš„æ¯ä¸ªåˆ—必须用于键表达å¼æˆ–èšåˆå‡½æ•°å†…,但ä¸èƒ½åŒæ—¶ä½¿ç”¨ã€‚ +- 在所有的表达å¼åœ¨ [SELECT](../../../sql-reference/statements/select/index.md), [HAVING](../../../sql-reference/statements/select/having.md),和 [ORDER BY](../../../sql-reference/statements/select/order-by.md) å­å¥ä¸­ **å¿…é¡»** 基于键表达å¼è¿›è¡Œè®¡ç®— **或** 上 [èšåˆå‡½æ•°](../../../sql-reference/aggregate-functions/index.md) 在éžé”®è¡¨è¾¾å¼ï¼ˆåŒ…括纯列)上。 æ¢å¥è¯è¯´ï¼Œä»Žè¡¨ä¸­é€‰æ‹©çš„æ¯ä¸ªåˆ—必须用于键表达å¼æˆ–èšåˆå‡½æ•°å†…,但ä¸èƒ½åŒæ—¶ä½¿ç”¨ã€‚ - èšåˆç»“æžœ `SELECT` 查询将包å«å°½å¯èƒ½å¤šçš„行,因为有唯一值 “grouping key†在æºè¡¨ä¸­ã€‚ 通常这会显ç€å‡å°‘行数,通常是数é‡çº§ï¼Œä½†ä¸ä¸€å®šï¼šå¦‚果所有行数ä¿æŒä¸å˜ “grouping key†值是ä¸åŒçš„。 :::note @@ -58,7 +58,7 @@ sidebar_label: GROUP BY - 在 `Pretty*` æ ¼å¼æ—¶ï¼Œè¯¥è¡Œåœ¨ä¸»ç»“果之åŽä½œä¸ºå•ç‹¬çš„表输出。 - 在其他格å¼ä¸­ï¼Œå®ƒä¸å¯ç”¨ã€‚ -`WITH TOTALS` å¯ä»¥ä»¥ä¸åŒçš„æ–¹å¼è¿è¡Œæ—¶ [HAVING](../../../sql-reference/statements/select/having) 是存在的。 该行为å–决于 `totals_mode` 设置。 +`WITH TOTALS` å¯ä»¥ä»¥ä¸åŒçš„æ–¹å¼è¿è¡Œæ—¶ [HAVING](../../../sql-reference/statements/select/having.md) 是存在的。 该行为å–决于 `totals_mode` 设置。 ### é…ç½®æ€»å’Œå¤„ç† {#configuring-totals-processing} diff --git a/docs/zh/sql-reference/statements/select/index.md b/docs/zh/sql-reference/statements/select/index.md index 2d4044cbd20..fdf196e198b 100644 --- a/docs/zh/sql-reference/statements/select/index.md +++ b/docs/zh/sql-reference/statements/select/index.md @@ -41,7 +41,7 @@ SELECT [DISTINCT] expr_list - [WHERE å­å¥](../../../sql-reference/statements/select/where.md) - [GROUP BY å­å¥](../../../sql-reference/statements/select/group-by.md) - [LIMIT BY å­å¥](../../../sql-reference/statements/select/limit-by.md) -- [HAVING å­å¥](../../../sql-reference/statements/select/having) +- [HAVING å­å¥](../../../sql-reference/statements/select/having.md) - [SELECT å­å¥](#select-clause) - [DISTINCT å­å¥](../../../sql-reference/statements/select/distinct.md) - [LIMIT å­å¥](../../../sql-reference/statements/select/limit.md) diff --git a/programs/install/Install.cpp b/programs/install/Install.cpp index 80f3b0bbc63..b142159fbdf 100644 --- a/programs/install/Install.cpp +++ b/programs/install/Install.cpp @@ -222,6 +222,8 @@ int mainEntryClickHouseInstall(int argc, char ** argv) ("pid-path", po::value()->default_value("var/run/clickhouse-server"), "directory for pid file") ("user", po::value()->default_value(DEFAULT_CLICKHOUSE_SERVER_USER), "clickhouse user to create") ("group", po::value()->default_value(DEFAULT_CLICKHOUSE_SERVER_GROUP), "clickhouse group to create") + ("noninteractive,y", "run non-interactively") + ("link", "create symlink to the binary instead of copying to binary-path") ; po::variables_map options; @@ -267,8 +269,6 @@ int mainEntryClickHouseInstall(int argc, char ** argv) /// Copy binary to the destination directory. - /// TODO An option to link instead of copy - useful for developers. 
- fs::path prefix = options["prefix"].as(); fs::path bin_dir = prefix / options["binary-path"].as(); @@ -281,76 +281,129 @@ int mainEntryClickHouseInstall(int argc, char ** argv) bool old_binary_exists = fs::exists(main_bin_path); bool already_installed = false; - /// Check if the binary is the same file (already installed). - if (old_binary_exists && binary_self_canonical_path == fs::canonical(main_bin_path)) + if (options.count("link")) { - already_installed = true; - fmt::print("ClickHouse binary is already located at {}\n", main_bin_path.string()); - } - /// Check if binary has the same content. - else if (old_binary_exists && binary_size == fs::file_size(main_bin_path)) - { - fmt::print("Found already existing ClickHouse binary at {} having the same size. Will check its contents.\n", - main_bin_path.string()); - - if (filesEqual(binary_self_path.string(), main_bin_path.string())) + if (old_binary_exists) { - already_installed = true; - fmt::print("ClickHouse binary is already located at {} and it has the same content as {}\n", - main_bin_path.string(), binary_self_canonical_path.string()); - } - } + bool is_symlink = FS::isSymlink(main_bin_path); + fs::path points_to; + if (is_symlink) + points_to = fs::weakly_canonical(FS::readSymlink(main_bin_path)); - if (already_installed) - { - if (0 != chmod(main_bin_path.string().c_str(), S_IRUSR | S_IRGRP | S_IROTH | S_IXUSR | S_IXGRP | S_IXOTH)) - throwFromErrno(fmt::format("Cannot chmod {}", main_bin_path.string()), ErrorCodes::SYSTEM_ERROR); + if (is_symlink && points_to == binary_self_canonical_path) + { + already_installed = true; + } + else + { + if (!is_symlink) + { + fmt::print("File {} already exists but it's not a symlink. Will rename to {}.\n", + main_bin_path.string(), main_bin_old_path.string()); + fs::rename(main_bin_path, main_bin_old_path); + } + else if (points_to != main_bin_path) + { + fmt::print("Symlink {} already exists but it points to {}. Will replace the old symlink to {}.\n", + main_bin_path.string(), points_to.string(), binary_self_canonical_path.string()); + fs::remove(main_bin_path); + } + } + } + + if (!already_installed) + { + if (!fs::exists(bin_dir)) + { + fmt::print("Creating binary directory {}.\n", bin_dir.string()); + fs::create_directories(bin_dir); + } + + fmt::print("Creating symlink {} to {}.\n", main_bin_path.string(), binary_self_canonical_path.string()); + fs::create_symlink(binary_self_canonical_path, main_bin_path); + + if (0 != chmod(binary_self_canonical_path.string().c_str(), S_IRUSR | S_IRGRP | S_IROTH | S_IXUSR | S_IXGRP | S_IXOTH)) + throwFromErrno(fmt::format("Cannot chmod {}", binary_self_canonical_path.string()), ErrorCodes::SYSTEM_ERROR); + } } else { - if (!fs::exists(bin_dir)) + bool is_symlink = FS::isSymlink(main_bin_path); + + if (!is_symlink) { - fmt::print("Creating binary directory {}.\n", bin_dir.string()); - fs::create_directories(bin_dir); + /// Check if the binary is the same file (already installed). + if (old_binary_exists && binary_self_canonical_path == fs::canonical(main_bin_path)) + { + already_installed = true; + fmt::print("ClickHouse binary is already located at {}\n", main_bin_path.string()); + } + /// Check if binary has the same content. + else if (old_binary_exists && binary_size == fs::file_size(main_bin_path)) + { + fmt::print("Found already existing ClickHouse binary at {} having the same size. 
Will check its contents.\n", + main_bin_path.string()); + + if (filesEqual(binary_self_path.string(), main_bin_path.string())) + { + already_installed = true; + fmt::print("ClickHouse binary is already located at {} and it has the same content as {}\n", + main_bin_path.string(), binary_self_canonical_path.string()); + } + } } - size_t available_space = fs::space(bin_dir).available; - if (available_space < binary_size) - throw Exception(ErrorCodes::NOT_ENOUGH_SPACE, "Not enough space for clickhouse binary in {}, required {}, available {}.", - bin_dir.string(), ReadableSize(binary_size), ReadableSize(available_space)); - - fmt::print("Copying ClickHouse binary to {}\n", main_bin_tmp_path.string()); - - try + if (already_installed) { - ReadBufferFromFile in(binary_self_path.string()); - WriteBufferFromFile out(main_bin_tmp_path.string()); - copyData(in, out); - out.sync(); - - if (0 != fchmod(out.getFD(), S_IRUSR | S_IRGRP | S_IROTH | S_IXUSR | S_IXGRP | S_IXOTH)) - throwFromErrno(fmt::format("Cannot chmod {}", main_bin_tmp_path.string()), ErrorCodes::SYSTEM_ERROR); - - out.finalize(); + if (0 != chmod(main_bin_path.string().c_str(), S_IRUSR | S_IRGRP | S_IROTH | S_IXUSR | S_IXGRP | S_IXOTH)) + throwFromErrno(fmt::format("Cannot chmod {}", main_bin_path.string()), ErrorCodes::SYSTEM_ERROR); } - catch (const Exception & e) + else { - if (e.code() == ErrorCodes::CANNOT_OPEN_FILE && geteuid() != 0) - std::cerr << "Install must be run as root: " << formatWithSudo("./clickhouse install") << '\n'; - throw; + if (!fs::exists(bin_dir)) + { + fmt::print("Creating binary directory {}.\n", bin_dir.string()); + fs::create_directories(bin_dir); + } + + size_t available_space = fs::space(bin_dir).available; + if (available_space < binary_size) + throw Exception(ErrorCodes::NOT_ENOUGH_SPACE, "Not enough space for clickhouse binary in {}, required {}, available {}.", + bin_dir.string(), ReadableSize(binary_size), ReadableSize(available_space)); + + fmt::print("Copying ClickHouse binary to {}\n", main_bin_tmp_path.string()); + + try + { + ReadBufferFromFile in(binary_self_path.string()); + WriteBufferFromFile out(main_bin_tmp_path.string()); + copyData(in, out); + out.sync(); + + if (0 != fchmod(out.getFD(), S_IRUSR | S_IRGRP | S_IROTH | S_IXUSR | S_IXGRP | S_IXOTH)) + throwFromErrno(fmt::format("Cannot chmod {}", main_bin_tmp_path.string()), ErrorCodes::SYSTEM_ERROR); + + out.finalize(); + } + catch (const Exception & e) + { + if (e.code() == ErrorCodes::CANNOT_OPEN_FILE && geteuid() != 0) + std::cerr << "Install must be run as root: " << formatWithSudo("./clickhouse install") << '\n'; + throw; + } + + if (old_binary_exists) + { + fmt::print("{} already exists, will rename existing binary to {} and put the new binary in place\n", + main_bin_path.string(), main_bin_old_path.string()); + + /// There is file exchange operation in Linux but it's not portable. + fs::rename(main_bin_path, main_bin_old_path); + } + + fmt::print("Renaming {} to {}.\n", main_bin_tmp_path.string(), main_bin_path.string()); + fs::rename(main_bin_tmp_path, main_bin_path); } - - if (old_binary_exists) - { - fmt::print("{} already exists, will rename existing binary to {} and put the new binary in place\n", - main_bin_path.string(), main_bin_old_path.string()); - - /// There is file exchange operation in Linux but it's not portable. 
- fs::rename(main_bin_path, main_bin_old_path); - } - - fmt::print("Renaming {} to {}.\n", main_bin_tmp_path.string(), main_bin_path.string()); - fs::rename(main_bin_tmp_path, main_bin_path); } /// Create symlinks. @@ -384,7 +437,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv) if (is_symlink) points_to = fs::weakly_canonical(FS::readSymlink(symlink_path)); - if (is_symlink && points_to == main_bin_path) + if (is_symlink && (points_to == main_bin_path || (options.count("link") && points_to == binary_self_canonical_path))) { need_to_create = false; } @@ -709,7 +762,9 @@ int mainEntryClickHouseInstall(int argc, char ** argv) /// dpkg or apt installers can ask for non-interactive work explicitly. const char * debian_frontend_var = getenv("DEBIAN_FRONTEND"); // NOLINT(concurrency-mt-unsafe) - bool noninteractive = debian_frontend_var && debian_frontend_var == std::string_view("noninteractive"); + bool noninteractive = (debian_frontend_var && debian_frontend_var == std::string_view("noninteractive")) + || options.count("noninteractive"); + bool is_interactive = !noninteractive && stdin_is_a_tty && stdout_is_a_tty; diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 8e092bdf8e4..5768e744f94 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -600,13 +600,13 @@ void LocalServer::processConfig() String uncompressed_cache_policy = config().getString("uncompressed_cache_policy", ""); size_t uncompressed_cache_size = config().getUInt64("uncompressed_cache_size", 0); if (uncompressed_cache_size) - global_context->setUncompressedCache(uncompressed_cache_size, uncompressed_cache_policy); + global_context->setUncompressedCache(uncompressed_cache_policy, uncompressed_cache_size); /// Size of cache for marks (index of MergeTree family of tables). String mark_cache_policy = config().getString("mark_cache_policy", ""); size_t mark_cache_size = config().getUInt64("mark_cache_size", 5368709120); if (mark_cache_size) - global_context->setMarkCache(mark_cache_size, mark_cache_policy); + global_context->setMarkCache(mark_cache_policy, mark_cache_size); /// Size of cache for uncompressed blocks of MergeTree indices. Zero means disabled. size_t index_uncompressed_cache_size = config().getUInt64("index_uncompressed_cache_size", 0); diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 711dfb3820a..5d172aa4f82 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1456,7 +1456,7 @@ try LOG_INFO(log, "Uncompressed cache size was lowered to {} because the system has low amount of memory", formatReadableSizeWithBinarySuffix(uncompressed_cache_size)); } - global_context->setUncompressedCache(uncompressed_cache_size, uncompressed_cache_policy); + global_context->setUncompressedCache(uncompressed_cache_policy, uncompressed_cache_size); /// Load global settings from default_profile and system_profile. 
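The Install.cpp hunk above also extends the interactivity check: either `DEBIAN_FRONTEND=noninteractive` or the new `--noninteractive` option now suppresses prompts, and prompts are shown only when both stdin and stdout are TTYs. A hedged restatement of that condition; the helper functions are invented, the patch computes this inline:

#include <cstdlib>
#include <string_view>
#include <unistd.h>

/// The boolean parameter stands in for options.count("noninteractive").
bool isNoninteractive(bool has_noninteractive_flag)
{
    const char * debian_frontend = std::getenv("DEBIAN_FRONTEND");
    bool env_requests_it = debian_frontend && std::string_view(debian_frontend) == "noninteractive";
    return env_requests_it || has_noninteractive_flag;
}

bool isInteractive(bool has_noninteractive_flag)
{
    /// Same condition as in the patch: no prompts unless both ends are a TTY.
    return !isNoninteractive(has_noninteractive_flag) && isatty(STDIN_FILENO) && isatty(STDOUT_FILENO);
}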
global_context->setDefaultProfiles(config()); @@ -1481,7 +1481,7 @@ try LOG_INFO(log, "Mark cache size was lowered to {} because the system has low amount of memory", formatReadableSizeWithBinarySuffix(mark_cache_size)); } - global_context->setMarkCache(mark_cache_size, mark_cache_policy); + global_context->setMarkCache(mark_cache_policy, mark_cache_size); if (server_settings.index_uncompressed_cache_size) global_context->setIndexUncompressedCache(server_settings.index_uncompressed_cache_size); diff --git a/programs/server/config.xml b/programs/server/config.xml index 0ea2de18e22..cfcd2ff93e0 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -703,6 +703,9 @@ actions of previous constraint (defined in other profiles) for the same specific setting, including fields that are not set by new constraint. It also enables 'changeable_in_readonly' constraint type --> false + + + 600 @@ -1288,6 +1291,10 @@ *_function.xml + + + @@ -1510,7 +1517,7 @@ - + diff --git a/src/Access/AccessControl.cpp b/src/Access/AccessControl.cpp index 91283da241d..875f4965e0b 100644 --- a/src/Access/AccessControl.cpp +++ b/src/Access/AccessControl.cpp @@ -247,7 +247,7 @@ private: AccessControl::AccessControl() : MultipleAccessStorage("user directories"), context_access_cache(std::make_unique(*this)), - role_cache(std::make_unique(*this)), + role_cache(std::make_unique(*this, 600)), row_policy_cache(std::make_unique(*this)), quota_cache(std::make_unique(*this)), settings_profiles_cache(std::make_unique(*this)), @@ -282,6 +282,8 @@ void AccessControl::setUpFromMainConfig(const Poco::Util::AbstractConfiguration setSettingsConstraintsReplacePrevious(config_.getBool("access_control_improvements.settings_constraints_replace_previous", false)); addStoragesFromMainConfig(config_, config_path_, get_zookeeper_function_); + + role_cache = std::make_unique(*this, config_.getInt("access_control_improvements.role_cache_expiration_time_seconds", 600)); } diff --git a/src/Access/AccessRights.cpp b/src/Access/AccessRights.cpp index 7e21e3c2c4d..cfa14e6c88b 100644 --- a/src/Access/AccessRights.cpp +++ b/src/Access/AccessRights.cpp @@ -61,14 +61,25 @@ namespace res.any_database = true; res.any_table = true; res.any_column = true; + res.any_parameter = true; break; } case 1: { - res.any_database = false; - res.database = full_name[0]; - res.any_table = true; - res.any_column = true; + if (access_flags.isGlobalWithParameter()) + { + res.parameter = full_name[0]; + res.any_parameter = false; + res.any_database = false; + } + else + { + res.database = full_name[0]; + res.any_database = false; + res.any_parameter = false; + res.any_table = true; + res.any_column = true; + } break; } case 2: @@ -110,10 +121,35 @@ namespace size_t count_elements_with_diff_columns = sorted.countElementsWithDifferenceInColumnOnly(i); if (count_elements_with_diff_columns == 1) { - /// Easy case: one Element is converted to one AccessRightsElement. const auto & element = sorted[i]; if (element.access_flags) - res.emplace_back(element.getResult()); + { + const bool all_granted = sorted.size() == 1 && element.access_flags.contains(AccessFlags::allFlags()); + if (all_granted) + { + /// Easy case: one Element is converted to one AccessRightsElement. + res.emplace_back(element.getResult()); + } + else + { + auto per_parameter = element.access_flags.splitIntoParameterTypes(); + if (per_parameter.size() == 1) + { + /// Easy case: one Element is converted to one AccessRightsElement. 
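The config.xml and AccessControl.cpp changes above make the role cache TTL configurable through `access_control_improvements.role_cache_expiration_time_seconds` (600 seconds by default; the RoleCache constructor further down converts it to milliseconds instead of hard-coding 600000). A small sketch of that wiring with a stand-in cache type, making no assumptions about the real Poco cache API:

#include <chrono>
#include <memory>

/// Stand-in for the expiring cache used by RoleCache; only the TTL handling is shown.
struct ExpiringCacheSketch
{
    explicit ExpiringCacheSketch(std::chrono::milliseconds ttl_) : ttl(ttl_) {}
    std::chrono::milliseconds ttl;
};

struct RoleCacheSketch
{
    /// As in the patch: the constructor takes seconds and converts them for the underlying cache.
    explicit RoleCacheSketch(int expiration_time_seconds)
        : cache(std::chrono::seconds(expiration_time_seconds))
    {
    }

    ExpiringCacheSketch cache;
};

/// Construct with the compiled-in default first, then rebuild once the main config
/// is loaded, mirroring the AccessControl constructor and setUpFromMainConfig().
std::unique_ptr<RoleCacheSketch> role_cache = std::make_unique<RoleCacheSketch>(600);

void setUpFromMainConfig(int role_cache_expiration_time_seconds)
{
    role_cache = std::make_unique<RoleCacheSketch>(role_cache_expiration_time_seconds);
}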
+ res.emplace_back(element.getResult()); + } + else + { + /// Difficult case: one element is converted into multiple AccessRightsElements. + for (const auto & [_, parameter_flags] : per_parameter) + { + auto current_element{element}; + current_element.access_flags = parameter_flags; + res.emplace_back(current_element.getResult()); + } + } + } + } ++i; } else @@ -137,6 +173,8 @@ namespace { return (element.full_name.size() != 3) || (element.full_name[0] != start_element.full_name[0]) || (element.full_name[1] != start_element.full_name[1]) || (element.grant_option != start_element.grant_option) + || (element.access_flags.isGlobalWithParameter() != start_element.access_flags.isGlobalWithParameter()) + || (element.access_flags.getParameterType() != start_element.access_flags.getParameterType()) || (element.is_partial_revoke != start_element.is_partial_revoke); }); @@ -191,11 +229,19 @@ namespace } }; + /** + * Levels: + * 1. GLOBAL + * 2. DATABASE_LEVEL 2. GLOBAL_WITH_PARAMETER (parameter example: named collection) + * 3. TABLE_LEVEL + * 4. COLUMN_LEVEL + */ enum Level { GLOBAL_LEVEL, DATABASE_LEVEL, + GLOBAL_WITH_PARAMETER = DATABASE_LEVEL, TABLE_LEVEL, COLUMN_LEVEL, }; @@ -205,7 +251,7 @@ namespace switch (level) { case GLOBAL_LEVEL: return AccessFlags::allFlagsGrantableOnGlobalLevel(); - case DATABASE_LEVEL: return AccessFlags::allFlagsGrantableOnDatabaseLevel(); + case DATABASE_LEVEL: return AccessFlags::allFlagsGrantableOnDatabaseLevel() | AccessFlags::allFlagsGrantableOnGlobalWithParameterLevel(); case TABLE_LEVEL: return AccessFlags::allFlagsGrantableOnTableLevel(); case COLUMN_LEVEL: return AccessFlags::allFlagsGrantableOnColumnLevel(); } @@ -783,7 +829,14 @@ void AccessRights::grantImplHelper(const AccessRightsElement & element) { assert(!element.is_partial_revoke); assert(!element.grant_option || with_grant_option); - if (element.any_database) + if (element.isGlobalWithParameter()) + { + if (element.any_parameter) + grantImpl(element.access_flags); + else + grantImpl(element.access_flags, element.parameter); + } + else if (element.any_database) grantImpl(element.access_flags); else if (element.any_table) grantImpl(element.access_flags, element.database); @@ -858,7 +911,14 @@ template void AccessRights::revokeImplHelper(const AccessRightsElement & element) { assert(!element.grant_option || grant_option); - if (element.any_database) + if (element.isGlobalWithParameter()) + { + if (element.any_parameter) + revokeImpl(element.access_flags); + else + revokeImpl(element.access_flags, element.parameter); + } + else if (element.any_database) revokeImpl(element.access_flags); else if (element.any_table) revokeImpl(element.access_flags, element.database); @@ -948,7 +1008,14 @@ template bool AccessRights::isGrantedImplHelper(const AccessRightsElement & element) const { assert(!element.grant_option || grant_option); - if (element.any_database) + if (element.isGlobalWithParameter()) + { + if (element.any_parameter) + return isGrantedImpl(element.access_flags); + else + return isGrantedImpl(element.access_flags, element.parameter); + } + else if (element.any_database) return isGrantedImpl(element.access_flags); else if (element.any_table) return isGrantedImpl(element.access_flags, element.database); diff --git a/src/Access/Common/AccessFlags.cpp b/src/Access/Common/AccessFlags.cpp index bef165ba4e6..8612fc2309e 100644 --- a/src/Access/Common/AccessFlags.cpp +++ b/src/Access/Common/AccessFlags.cpp @@ -15,6 +15,7 @@ namespace ErrorCodes { extern const int UNKNOWN_ACCESS_TYPE; extern const int 
LOGICAL_ERROR; + extern const int MIXED_ACCESS_PARAMETER_TYPES; } namespace @@ -96,11 +97,14 @@ namespace const Flags & getAllFlags() const { return all_flags; } const Flags & getGlobalFlags() const { return all_flags_for_target[GLOBAL]; } + const Flags & getGlobalWithParameterFlags() const { return all_flags_grantable_on_global_with_parameter_level; } const Flags & getDatabaseFlags() const { return all_flags_for_target[DATABASE]; } const Flags & getTableFlags() const { return all_flags_for_target[TABLE]; } const Flags & getColumnFlags() const { return all_flags_for_target[COLUMN]; } const Flags & getDictionaryFlags() const { return all_flags_for_target[DICTIONARY]; } + const Flags & getNamedCollectionFlags() const { return all_flags_for_target[NAMED_COLLECTION]; } const Flags & getAllFlagsGrantableOnGlobalLevel() const { return getAllFlags(); } + const Flags & getAllFlagsGrantableOnGlobalWithParameterLevel() const { return getGlobalWithParameterFlags(); } const Flags & getAllFlagsGrantableOnDatabaseLevel() const { return all_flags_grantable_on_database_level; } const Flags & getAllFlagsGrantableOnTableLevel() const { return all_flags_grantable_on_table_level; } const Flags & getAllFlagsGrantableOnColumnLevel() const { return getColumnFlags(); } @@ -116,6 +120,7 @@ namespace VIEW = TABLE, COLUMN, DICTIONARY, + NAMED_COLLECTION, }; struct Node; @@ -295,6 +300,7 @@ namespace collectAllFlags(child.get()); all_flags_grantable_on_table_level = all_flags_for_target[TABLE] | all_flags_for_target[DICTIONARY] | all_flags_for_target[COLUMN]; + all_flags_grantable_on_global_with_parameter_level = all_flags_for_target[NAMED_COLLECTION]; all_flags_grantable_on_database_level = all_flags_for_target[DATABASE] | all_flags_grantable_on_table_level; } @@ -345,12 +351,44 @@ namespace std::unordered_map keyword_to_flags_map; std::vector access_type_to_flags_mapping; Flags all_flags; - Flags all_flags_for_target[static_cast(DICTIONARY) + 1]; + Flags all_flags_for_target[static_cast(NAMED_COLLECTION) + 1]; Flags all_flags_grantable_on_database_level; Flags all_flags_grantable_on_table_level; + Flags all_flags_grantable_on_global_with_parameter_level; }; } +bool AccessFlags::isGlobalWithParameter() const +{ + return getParameterType() != AccessFlags::NONE; +} + +std::unordered_map AccessFlags::splitIntoParameterTypes() const +{ + std::unordered_map result; + + auto named_collection_flags = AccessFlags::allNamedCollectionFlags() & *this; + if (named_collection_flags) + result.emplace(ParameterType::NAMED_COLLECTION, named_collection_flags); + + auto other_flags = (~AccessFlags::allNamedCollectionFlags()) & *this; + if (other_flags) + result.emplace(ParameterType::NONE, other_flags); + + return result; +} + +AccessFlags::ParameterType AccessFlags::getParameterType() const +{ + if (isEmpty() || !AccessFlags::allGlobalWithParameterFlags().contains(*this)) + return AccessFlags::NONE; + + /// All flags refer to NAMED COLLECTION access type. 
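The AccessFlags changes above introduce a parameter type (currently only NAMED_COLLECTION) plus a helper that splits a flag set into per-parameter-type groups, so one grant element can later be rendered as several. A toy bitmask version of that split; the bit layout and the 8-bit width are invented for illustration, the real masks are built by the Helper singleton in AccessFlags.cpp:

#include <bitset>
#include <unordered_map>

enum class ParameterType { NONE, NAMED_COLLECTION };
using Flags = std::bitset<8>;

/// Pretend the low three bits are the named-collection access types.
const Flags named_collection_mask{0b0000'0111};

std::unordered_map<ParameterType, Flags> splitIntoParameterTypes(const Flags & flags)
{
    std::unordered_map<ParameterType, Flags> result;

    if (auto named = flags & named_collection_mask; named.any())
        result.emplace(ParameterType::NAMED_COLLECTION, named);

    if (auto other = flags & ~named_collection_mask; other.any())
        result.emplace(ParameterType::NONE, other);

    return result;
}

/// splitIntoParameterTypes(Flags{0b0000'0101}) -> one NAMED_COLLECTION group
/// splitIntoParameterTypes(Flags{0b0101'0101}) -> one NAMED_COLLECTION group and one NONE group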
+ if (AccessFlags::allNamedCollectionFlags().contains(*this)) + return AccessFlags::NAMED_COLLECTION; + + throw Exception(ErrorCodes::MIXED_ACCESS_PARAMETER_TYPES, "Having mixed parameter types: {}", toString()); +} AccessFlags::AccessFlags(AccessType type) : flags(Helper::instance().accessTypeToFlags(type)) {} AccessFlags::AccessFlags(std::string_view keyword) : flags(Helper::instance().keywordToFlags(keyword)) {} @@ -361,11 +399,14 @@ std::vector AccessFlags::toAccessTypes() const { return Helper::inst std::vector AccessFlags::toKeywords() const { return Helper::instance().flagsToKeywords(flags); } AccessFlags AccessFlags::allFlags() { return Helper::instance().getAllFlags(); } AccessFlags AccessFlags::allGlobalFlags() { return Helper::instance().getGlobalFlags(); } +AccessFlags AccessFlags::allGlobalWithParameterFlags() { return Helper::instance().getGlobalWithParameterFlags(); } AccessFlags AccessFlags::allDatabaseFlags() { return Helper::instance().getDatabaseFlags(); } AccessFlags AccessFlags::allTableFlags() { return Helper::instance().getTableFlags(); } AccessFlags AccessFlags::allColumnFlags() { return Helper::instance().getColumnFlags(); } AccessFlags AccessFlags::allDictionaryFlags() { return Helper::instance().getDictionaryFlags(); } +AccessFlags AccessFlags::allNamedCollectionFlags() { return Helper::instance().getNamedCollectionFlags(); } AccessFlags AccessFlags::allFlagsGrantableOnGlobalLevel() { return Helper::instance().getAllFlagsGrantableOnGlobalLevel(); } +AccessFlags AccessFlags::allFlagsGrantableOnGlobalWithParameterLevel() { return Helper::instance().getAllFlagsGrantableOnGlobalWithParameterLevel(); } AccessFlags AccessFlags::allFlagsGrantableOnDatabaseLevel() { return Helper::instance().getAllFlagsGrantableOnDatabaseLevel(); } AccessFlags AccessFlags::allFlagsGrantableOnTableLevel() { return Helper::instance().getAllFlagsGrantableOnTableLevel(); } AccessFlags AccessFlags::allFlagsGrantableOnColumnLevel() { return Helper::instance().getAllFlagsGrantableOnColumnLevel(); } diff --git a/src/Access/Common/AccessFlags.h b/src/Access/Common/AccessFlags.h index c4e0b7ac281..270ee1c0045 100644 --- a/src/Access/Common/AccessFlags.h +++ b/src/Access/Common/AccessFlags.h @@ -48,8 +48,17 @@ public: AccessFlags operator ~() const { AccessFlags res; res.flags = ~flags; return res; } bool isEmpty() const { return flags.none(); } + bool isAll() const { return flags.all(); } explicit operator bool() const { return !isEmpty(); } bool contains(const AccessFlags & other) const { return (flags & other.flags) == other.flags; } + bool isGlobalWithParameter() const; + enum ParameterType + { + NONE, + NAMED_COLLECTION, + }; + ParameterType getParameterType() const; + std::unordered_map splitIntoParameterTypes() const; friend bool operator ==(const AccessFlags & left, const AccessFlags & right) { return left.flags == right.flags; } friend bool operator !=(const AccessFlags & left, const AccessFlags & right) { return !(left == right); } @@ -76,6 +85,8 @@ public: /// Returns all the global flags. static AccessFlags allGlobalFlags(); + static AccessFlags allGlobalWithParameterFlags(); + /// Returns all the flags related to a database. static AccessFlags allDatabaseFlags(); @@ -88,10 +99,16 @@ public: /// Returns all the flags related to a dictionary. static AccessFlags allDictionaryFlags(); + /// Returns all the flags related to a named collection. + static AccessFlags allNamedCollectionFlags(); + /// Returns all the flags which could be granted on the global level. /// The same as allFlags(). 
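In AccessRights.cpp above, the level enum gains GLOBAL_WITH_PARAMETER as an alias for the same depth as DATABASE_LEVEL, and the database level now also accepts the parameterised (named-collection) flags. A compact sketch of that mapping with placeholder flag masks:

#include <cstdint>

enum Level
{
    GLOBAL_LEVEL,
    DATABASE_LEVEL,
    GLOBAL_WITH_PARAMETER = DATABASE_LEVEL,  /// a parameter (e.g. a named collection name) sits at the same depth as a database
    TABLE_LEVEL,
    COLUMN_LEVEL,
};

using Flags = uint64_t;

constexpr Flags global_flags = 0xF000;
constexpr Flags database_flags = 0x0F00;
constexpr Flags global_with_parameter_flags = 0x00F0;
constexpr Flags table_flags = 0x000C;
constexpr Flags column_flags = 0x0003;

constexpr Flags acceptableFlags(Level level)
{
    switch (level)
    {
        case GLOBAL_LEVEL:   return global_flags;
        case DATABASE_LEVEL: return database_flags | global_with_parameter_flags;  /// parameterised grants ride along with database-level ones
        case TABLE_LEVEL:    return table_flags;
        case COLUMN_LEVEL:   return column_flags;
    }
    return 0;
}

static_assert(acceptableFlags(GLOBAL_WITH_PARAMETER) == acceptableFlags(DATABASE_LEVEL));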
static AccessFlags allFlagsGrantableOnGlobalLevel(); + /// Returns all the flags which could be granted on the global with parameter level. + static AccessFlags allFlagsGrantableOnGlobalWithParameterLevel(); + /// Returns all the flags which could be granted on the database level. /// Returns allDatabaseFlags() | allTableFlags() | allDictionaryFlags() | allColumnFlags(). static AccessFlags allFlagsGrantableOnDatabaseLevel(); diff --git a/src/Access/Common/AccessRightsElement.cpp b/src/Access/Common/AccessRightsElement.cpp index 69a2354f25d..e11d43634ec 100644 --- a/src/Access/Common/AccessRightsElement.cpp +++ b/src/Access/Common/AccessRightsElement.cpp @@ -21,24 +21,31 @@ namespace result += ")"; } - void formatONClause(const String & database, bool any_database, const String & table, bool any_table, String & result) + void formatONClause(const AccessRightsElement & element, String & result) { result += "ON "; - if (any_database) + if (element.isGlobalWithParameter()) + { + if (element.any_parameter) + result += "*"; + else + result += backQuoteIfNeed(element.parameter); + } + else if (element.any_database) { result += "*.*"; } else { - if (!database.empty()) + if (!element.database.empty()) { - result += backQuoteIfNeed(database); + result += backQuoteIfNeed(element.database); result += "."; } - if (any_table) + if (element.any_table) result += "*"; else - result += backQuoteIfNeed(table); + result += backQuoteIfNeed(element.table); } } @@ -96,7 +103,7 @@ namespace String result; formatAccessFlagsWithColumns(element.access_flags, element.columns, element.any_column, result); result += " "; - formatONClause(element.database, element.any_database, element.table, element.any_table, result); + formatONClause(element, result); if (with_options) formatOptions(element.grant_option, element.is_partial_revoke, result); return result; @@ -122,14 +129,16 @@ namespace if (i != elements.size() - 1) { const auto & next_element = elements[i + 1]; - if (element.sameDatabaseAndTable(next_element) && element.sameOptions(next_element)) + if (element.sameDatabaseAndTableAndParameter(next_element) && element.sameOptions(next_element)) + { next_element_uses_same_table_and_options = true; + } } if (!next_element_uses_same_table_and_options) { part += " "; - formatONClause(element.database, element.any_database, element.table, element.any_table, part); + formatONClause(element, part); if (with_options) formatOptions(element.grant_option, element.is_partial_revoke, part); if (result.empty()) @@ -164,6 +173,7 @@ AccessRightsElement::AccessRightsElement( , any_database(false) , any_table(false) , any_column(false) + , any_parameter(false) { } @@ -188,12 +198,15 @@ AccessRightsElement::AccessRightsElement( , any_database(false) , any_table(false) , any_column(false) + , any_parameter(false) { } void AccessRightsElement::eraseNonGrantable() { - if (!any_column) + if (isGlobalWithParameter() && !any_parameter) + access_flags &= AccessFlags::allFlagsGrantableOnGlobalWithParameterLevel(); + else if (!any_column) access_flags &= AccessFlags::allFlagsGrantableOnColumnLevel(); else if (!any_table) access_flags &= AccessFlags::allFlagsGrantableOnTableLevel(); @@ -215,6 +228,11 @@ String AccessRightsElement::toStringWithoutOptions() const { return toStringImpl bool AccessRightsElements::empty() const { return std::all_of(begin(), end(), [](const AccessRightsElement & e) { return e.empty(); }); } +bool AccessRightsElements::sameDatabaseAndTableAndParameter() const +{ + return (size() < 2) || std::all_of(std::next(begin()), 
end(), [this](const AccessRightsElement & e) { return e.sameDatabaseAndTableAndParameter(front()); }); +} + bool AccessRightsElements::sameDatabaseAndTable() const { return (size() < 2) || std::all_of(std::next(begin()), end(), [this](const AccessRightsElement & e) { return e.sameDatabaseAndTable(front()); }); diff --git a/src/Access/Common/AccessRightsElement.h b/src/Access/Common/AccessRightsElement.h index 5f65b6bcd12..ba625fc43df 100644 --- a/src/Access/Common/AccessRightsElement.h +++ b/src/Access/Common/AccessRightsElement.h @@ -11,12 +11,17 @@ namespace DB struct AccessRightsElement { AccessFlags access_flags; + String database; String table; Strings columns; + String parameter; + bool any_database = true; bool any_table = true; bool any_column = true; + bool any_parameter = false; + bool grant_option = false; bool is_partial_revoke = false; @@ -44,14 +49,26 @@ struct AccessRightsElement bool empty() const { return !access_flags || (!any_column && columns.empty()); } - auto toTuple() const { return std::tie(access_flags, any_database, database, any_table, table, any_column, columns, grant_option, is_partial_revoke); } + auto toTuple() const { return std::tie(access_flags, any_database, database, any_table, table, any_column, columns, any_parameter, parameter, grant_option, is_partial_revoke); } friend bool operator==(const AccessRightsElement & left, const AccessRightsElement & right) { return left.toTuple() == right.toTuple(); } friend bool operator!=(const AccessRightsElement & left, const AccessRightsElement & right) { return !(left == right); } + bool sameDatabaseAndTableAndParameter(const AccessRightsElement & other) const + { + return sameDatabaseAndTable(other) && sameParameter(other); + } + + bool sameParameter(const AccessRightsElement & other) const + { + return (parameter == other.parameter) && (any_parameter == other.any_parameter) + && (access_flags.getParameterType() == other.access_flags.getParameterType()) + && (isGlobalWithParameter() == other.isGlobalWithParameter()); + } + bool sameDatabaseAndTable(const AccessRightsElement & other) const { - return (database == other.database) && (any_database == other.any_database) && (table == other.table) - && (any_table == other.any_table); + return (database == other.database) && (any_database == other.any_database) + && (table == other.table) && (any_table == other.any_table); } bool sameOptions(const AccessRightsElement & other) const @@ -67,6 +84,8 @@ struct AccessRightsElement /// If the database is empty, replaces it with `current_database`. Otherwise does nothing. void replaceEmptyDatabase(const String & current_database); + bool isGlobalWithParameter() const { return access_flags.isGlobalWithParameter(); } + /// Returns a human-readable representation like "GRANT SELECT, UPDATE(x, y) ON db.table". 
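formatONClause() above now receives the whole AccessRightsElement so it can print parameterised grants. A condensed, self-contained version of the same logic; the flat argument list and the backQuote helper are stand-ins for the element fields and backQuoteIfNeed():

#include <string>

std::string backQuote(const std::string & s) { return "`" + s + "`"; }

/// A parameterised grant prints "ON <parameter>" (or "ON *"); everything else keeps "ON db.table".
std::string formatOnClause(
    bool is_global_with_parameter, bool any_parameter, const std::string & parameter,
    bool any_database, const std::string & database,
    bool any_table, const std::string & table)
{
    std::string result = "ON ";
    if (is_global_with_parameter)
        result += any_parameter ? "*" : backQuote(parameter);
    else if (any_database)
        result += "*.*";
    else
    {
        if (!database.empty())
            result += backQuote(database) + ".";
        result += any_table ? "*" : backQuote(table);
    }
    return result;
}

/// formatOnClause(true, false, "collection1", false, "", false, "") == "ON `collection1`"
/// formatOnClause(false, false, "", false, "db1", false, "events")  == "ON `db1`.`events`"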
String toString() const; String toStringWithoutOptions() const; @@ -81,6 +100,7 @@ public: using Base::Base; bool empty() const; + bool sameDatabaseAndTableAndParameter() const; bool sameDatabaseAndTable() const; bool sameOptions() const; diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index c73c0499fbe..a7827ee7c59 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -12,7 +12,7 @@ enum class AccessType /// Macro M should be defined as M(name, aliases, node_type, parent_group_name) /// where name is identifier with underscores (instead of spaces); /// aliases is a string containing comma-separated list; -/// node_type either specifies access type's level (GLOBAL/DATABASE/TABLE/DICTIONARY/VIEW/COLUMNS), +/// node_type either specifies access type's level (GLOBAL/NAMED_COLLECTION/DATABASE/TABLE/DICTIONARY/VIEW/COLUMNS), /// or specifies that the access type is a GROUP of other access types; /// parent_group_name is the name of the group containing this access type (or NONE if there is no such group). /// NOTE A parent group must be declared AFTER all its children. @@ -70,7 +70,7 @@ enum class AccessType M(ALTER_FREEZE_PARTITION, "FREEZE PARTITION, UNFREEZE", TABLE, ALTER_TABLE) \ \ M(ALTER_DATABASE_SETTINGS, "ALTER DATABASE SETTING, ALTER MODIFY DATABASE SETTING, MODIFY DATABASE SETTING", DATABASE, ALTER_DATABASE) /* allows to execute ALTER MODIFY SETTING */\ - M(ALTER_NAMED_COLLECTION, "", GROUP, ALTER) /* allows to execute ALTER NAMED COLLECTION */\ + M(ALTER_NAMED_COLLECTION, "", NAMED_COLLECTION, NAMED_COLLECTION_CONTROL) /* allows to execute ALTER NAMED COLLECTION */\ \ M(ALTER_TABLE, "", GROUP, ALTER) \ M(ALTER_DATABASE, "", GROUP, ALTER) \ @@ -92,7 +92,7 @@ enum class AccessType M(CREATE_ARBITRARY_TEMPORARY_TABLE, "", GLOBAL, CREATE) /* allows to create and manipulate temporary tables with arbitrary table engine */\ M(CREATE_FUNCTION, "", GLOBAL, CREATE) /* allows to execute CREATE FUNCTION */ \ - M(CREATE_NAMED_COLLECTION, "", GLOBAL, CREATE) /* allows to execute CREATE NAMED COLLECTION */ \ + M(CREATE_NAMED_COLLECTION, "", NAMED_COLLECTION, NAMED_COLLECTION_CONTROL) /* allows to execute CREATE NAMED COLLECTION */ \ M(CREATE, "", GROUP, ALL) /* allows to execute {CREATE|ATTACH} */ \ \ M(DROP_DATABASE, "", DATABASE, DROP) /* allows to execute {DROP|DETACH} DATABASE */\ @@ -101,7 +101,7 @@ enum class AccessType implicitly enabled by the grant DROP_TABLE */\ M(DROP_DICTIONARY, "", DICTIONARY, DROP) /* allows to execute {DROP|DETACH} DICTIONARY */\ M(DROP_FUNCTION, "", GLOBAL, DROP) /* allows to execute DROP FUNCTION */\ - M(DROP_NAMED_COLLECTION, "", GLOBAL, DROP) /* allows to execute DROP NAMED COLLECTION */\ + M(DROP_NAMED_COLLECTION, "", NAMED_COLLECTION, NAMED_COLLECTION_CONTROL) /* allows to execute DROP NAMED COLLECTION */\ M(DROP, "", GROUP, ALL) /* allows to execute {DROP|DETACH} */\ \ M(TRUNCATE, "TRUNCATE TABLE", TABLE, ALL) \ @@ -137,9 +137,10 @@ enum class AccessType M(SHOW_QUOTAS, "SHOW CREATE QUOTA", GLOBAL, SHOW_ACCESS) \ M(SHOW_SETTINGS_PROFILES, "SHOW PROFILES, SHOW CREATE SETTINGS PROFILE, SHOW CREATE PROFILE", GLOBAL, SHOW_ACCESS) \ M(SHOW_ACCESS, "", GROUP, ACCESS_MANAGEMENT) \ - M(SHOW_NAMED_COLLECTIONS, "SHOW NAMED COLLECTIONS", GLOBAL, ACCESS_MANAGEMENT) \ - M(SHOW_NAMED_COLLECTIONS_SECRETS, "SHOW NAMED COLLECTIONS SECRETS", GLOBAL, ACCESS_MANAGEMENT) \ M(ACCESS_MANAGEMENT, "", GROUP, ALL) \ + M(SHOW_NAMED_COLLECTIONS, "SHOW NAMED COLLECTIONS", NAMED_COLLECTION, NAMED_COLLECTION_CONTROL) \ + 
M(SHOW_NAMED_COLLECTIONS_SECRETS, "SHOW NAMED COLLECTIONS SECRETS", NAMED_COLLECTION, NAMED_COLLECTION_CONTROL) \ + M(NAMED_COLLECTION_CONTROL, "", NAMED_COLLECTION, ALL) \ \ M(SYSTEM_SHUTDOWN, "SYSTEM KILL, SHUTDOWN", GLOBAL, SYSTEM) \ M(SYSTEM_DROP_DNS_CACHE, "SYSTEM DROP DNS, DROP DNS CACHE, DROP DNS", GLOBAL, SYSTEM_DROP_CACHE) \ diff --git a/src/Access/ContextAccess.cpp b/src/Access/ContextAccess.cpp index cc51183c51f..04756162b46 100644 --- a/src/Access/ContextAccess.cpp +++ b/src/Access/ContextAccess.cpp @@ -507,13 +507,17 @@ bool ContextAccess::checkAccessImplHelper(AccessFlags flags, const Args &... arg if (!flags) return true; - /// Access to temporary tables is controlled in an unusual way, not like normal tables. - /// Creating of temporary tables is controlled by AccessType::CREATE_TEMPORARY_TABLES grant, - /// and other grants are considered as always given. - /// The DatabaseCatalog class won't resolve StorageID for temporary tables - /// which shouldn't be accessed. - if (getDatabase(args...) == DatabaseCatalog::TEMPORARY_DATABASE) - return access_granted(); + const auto parameter_type = flags.getParameterType(); + if (parameter_type == AccessFlags::NONE) + { + /// Access to temporary tables is controlled in an unusual way, not like normal tables. + /// Creating of temporary tables is controlled by AccessType::CREATE_TEMPORARY_TABLES grant, + /// and other grants are considered as always given. + /// The DatabaseCatalog class won't resolve StorageID for temporary tables + /// which shouldn't be accessed. + if (getDatabase(args...) == DatabaseCatalog::TEMPORARY_DATABASE) + return access_granted(); + } auto acs = getAccessRightsWithImplicit(); bool granted; @@ -611,7 +615,14 @@ template bool ContextAccess::checkAccessImplHelper(const AccessRightsElement & element) const { assert(!element.grant_option || grant_option); - if (element.any_database) + if (element.isGlobalWithParameter()) + { + if (element.any_parameter) + return checkAccessImpl(element.access_flags); + else + return checkAccessImpl(element.access_flags, element.parameter); + } + else if (element.any_database) return checkAccessImpl(element.access_flags); else if (element.any_table) return checkAccessImpl(element.access_flags, element.database); diff --git a/src/Access/ReplicatedAccessStorage.cpp b/src/Access/ReplicatedAccessStorage.cpp index a7cb2b6e08e..ddc5e8bfed1 100644 --- a/src/Access/ReplicatedAccessStorage.cpp +++ b/src/Access/ReplicatedAccessStorage.cpp @@ -674,18 +674,16 @@ void ReplicatedAccessStorage::backup(BackupEntriesCollector & backup_entries_col backup_entries_collector.getContext()->getAccessControl()); auto backup_coordination = backup_entries_collector.getBackupCoordination(); - String current_host_id = backup_entries_collector.getBackupSettings().host_id; - backup_coordination->addReplicatedAccessFilePath(zookeeper_path, type, current_host_id, backup_entry_with_path.first); + backup_coordination->addReplicatedAccessFilePath(zookeeper_path, type, backup_entry_with_path.first); backup_entries_collector.addPostTask( [backup_entry = backup_entry_with_path.second, zookeeper_path = zookeeper_path, type, - current_host_id, &backup_entries_collector, backup_coordination] { - for (const String & path : backup_coordination->getReplicatedAccessFilePaths(zookeeper_path, type, current_host_id)) + for (const String & path : backup_coordination->getReplicatedAccessFilePaths(zookeeper_path, type)) backup_entries_collector.addBackupEntry(path, backup_entry); }); } diff --git a/src/Access/RoleCache.cpp 
b/src/Access/RoleCache.cpp index bfc6200929d..2d94df2eea5 100644 --- a/src/Access/RoleCache.cpp +++ b/src/Access/RoleCache.cpp @@ -56,8 +56,8 @@ namespace } -RoleCache::RoleCache(const AccessControl & access_control_) - : access_control(access_control_), cache(600000 /* 10 minutes */) +RoleCache::RoleCache(const AccessControl & access_control_, int expiration_time_seconds) + : access_control(access_control_), cache(expiration_time_seconds * 1000 /* 10 minutes by default*/) { } diff --git a/src/Access/RoleCache.h b/src/Access/RoleCache.h index 24f19cb9d94..b5712a24f46 100644 --- a/src/Access/RoleCache.h +++ b/src/Access/RoleCache.h @@ -16,7 +16,7 @@ using RolePtr = std::shared_ptr; class RoleCache { public: - explicit RoleCache(const AccessControl & access_control_); + explicit RoleCache(const AccessControl & access_control_, int expiration_time_seconds); ~RoleCache(); std::shared_ptr getEnabledRoles( diff --git a/src/Access/UsersConfigAccessStorage.cpp b/src/Access/UsersConfigAccessStorage.cpp index b893554cb8a..562df61e8aa 100644 --- a/src/Access/UsersConfigAccessStorage.cpp +++ b/src/Access/UsersConfigAccessStorage.cpp @@ -233,10 +233,10 @@ namespace user->access.revokeGrantOption(AccessType::ALL); } - bool show_named_collections = config.getBool(user_config + ".show_named_collections", false); - if (!show_named_collections) + bool named_collection_control = config.getBool(user_config + ".named_collection_control", false); + if (!named_collection_control) { - user->access.revoke(AccessType::SHOW_NAMED_COLLECTIONS); + user->access.revoke(AccessType::NAMED_COLLECTION_CONTROL); } bool show_named_collections_secrets = config.getBool(user_config + ".show_named_collections_secrets", false); diff --git a/src/Access/tests/gtest_access_rights_ops.cpp b/src/Access/tests/gtest_access_rights_ops.cpp index e21ebda2a31..025f70af587 100644 --- a/src/Access/tests/gtest_access_rights_ops.cpp +++ b/src/Access/tests/gtest_access_rights_ops.cpp @@ -53,7 +53,7 @@ TEST(AccessRights, Union) "SHOW ROW POLICIES, SYSTEM MERGES, SYSTEM TTL MERGES, SYSTEM FETCHES, " "SYSTEM MOVES, SYSTEM SENDS, SYSTEM REPLICATION QUEUES, " "SYSTEM DROP REPLICA, SYSTEM SYNC REPLICA, SYSTEM RESTART REPLICA, " - "SYSTEM RESTORE REPLICA, SYSTEM WAIT LOADING PARTS, SYSTEM SYNC DATABASE REPLICA, SYSTEM FLUSH DISTRIBUTED, dictGet ON db1.*"); + "SYSTEM RESTORE REPLICA, SYSTEM WAIT LOADING PARTS, SYSTEM SYNC DATABASE REPLICA, SYSTEM FLUSH DISTRIBUTED, dictGet ON db1.*, GRANT NAMED COLLECTION CONTROL ON db1"); } diff --git a/src/AggregateFunctions/AggregateFunctionCorr.cpp b/src/AggregateFunctions/AggregateFunctionCorr.cpp new file mode 100644 index 00000000000..2e8ff3af933 --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionCorr.cpp @@ -0,0 +1,15 @@ +#include +#include + + +namespace DB +{ + +template using AggregateFunctionCorr = AggregateFunctionVarianceSimple>; + +void registerAggregateFunctionsStatisticsCorr(AggregateFunctionFactory & factory) +{ + factory.registerFunction("corr", createAggregateFunctionStatisticsBinary, AggregateFunctionFactory::CaseInsensitive); +} + +} diff --git a/src/AggregateFunctions/AggregateFunctionCovar.cpp b/src/AggregateFunctions/AggregateFunctionCovar.cpp new file mode 100644 index 00000000000..9645685483f --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionCovar.cpp @@ -0,0 +1,20 @@ +#include +#include + + +namespace DB +{ + +template using AggregateFunctionCovar = AggregateFunctionVarianceSimple>; + +void registerAggregateFunctionsStatisticsCovar(AggregateFunctionFactory & factory) +{ + 
factory.registerFunction("covarSamp", createAggregateFunctionStatisticsBinary); + factory.registerFunction("covarPop", createAggregateFunctionStatisticsBinary); + + /// Synonyms for compatibility. + factory.registerAlias("COVAR_SAMP", "covarSamp", AggregateFunctionFactory::CaseInsensitive); + factory.registerAlias("COVAR_POP", "covarPop", AggregateFunctionFactory::CaseInsensitive); +} + +} diff --git a/src/AggregateFunctions/AggregateFunctionFactory.cpp b/src/AggregateFunctions/AggregateFunctionFactory.cpp index 44e206890aa..6cacf66500f 100644 --- a/src/AggregateFunctions/AggregateFunctionFactory.cpp +++ b/src/AggregateFunctions/AggregateFunctionFactory.cpp @@ -2,12 +2,10 @@ #include #include -#include #include #include #include -#include #include #include @@ -21,6 +19,9 @@ #include +static constexpr size_t MAX_AGGREGATE_FUNCTION_NAME_LENGTH = 1000; + + namespace DB { struct Settings; @@ -30,6 +31,7 @@ namespace ErrorCodes extern const int UNKNOWN_AGGREGATE_FUNCTION; extern const int LOGICAL_ERROR; extern const int ILLEGAL_AGGREGATION; + extern const int TOO_LARGE_STRING_SIZE; } const String & getAggregateFunctionCanonicalNameIfAny(const String & name) @@ -70,12 +72,17 @@ static DataTypes convertLowCardinalityTypesToNested(const DataTypes & types) AggregateFunctionPtr AggregateFunctionFactory::get( const String & name, const DataTypes & argument_types, const Array & parameters, AggregateFunctionProperties & out_properties) const { + /// This to prevent costly string manipulation in parsing the aggregate function combinators. + /// Example: avgArrayArrayArrayArray...(1000 times)...Array + if (name.size() > MAX_AGGREGATE_FUNCTION_NAME_LENGTH) + throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too long name of aggregate function, maximum: {}", MAX_AGGREGATE_FUNCTION_NAME_LENGTH); + auto types_without_low_cardinality = convertLowCardinalityTypesToNested(argument_types); /// If one of the types is Nullable, we apply aggregate function combinator "Null" if it's not window function. /// Window functions are not real aggregate functions. Applying combinators doesn't make sense for them, /// they must handle the nullability themselves - auto properties = tryGetPropertiesImpl(name); + auto properties = tryGetProperties(name); bool is_window_function = properties.has_value() && properties->is_window_function; if (!is_window_function && std::any_of(types_without_low_cardinality.begin(), types_without_low_cardinality.end(), [](const auto & type) { return type->isNullable(); })) @@ -216,61 +223,67 @@ AggregateFunctionPtr AggregateFunctionFactory::tryGet( } -std::optional AggregateFunctionFactory::tryGetPropertiesImpl(const String & name_param) const +std::optional AggregateFunctionFactory::tryGetProperties(String name) const { - String name = getAliasToOrName(name_param); - Value found; + if (name.size() > MAX_AGGREGATE_FUNCTION_NAME_LENGTH) + throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too long name of aggregate function, maximum: {}", MAX_AGGREGATE_FUNCTION_NAME_LENGTH); - /// Find by exact match. - if (auto it = aggregate_functions.find(name); it != aggregate_functions.end()) + while (true) { - found = it->second; - } + name = getAliasToOrName(name); + Value found; - if (auto jt = case_insensitive_aggregate_functions.find(Poco::toLower(name)); jt != case_insensitive_aggregate_functions.end()) - found = jt->second; + /// Find by exact match. 
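The factory hunk here caps aggregate-function names at 1000 characters and replaces the recursive combinator lookup with a loop, so a pathological name like avgArrayArrayArray...Array neither recurses deeply nor builds a long chain of substrings. A standalone sketch of that loop over an invented registry; the real factory also handles aliases and case-insensitive lookups:

#include <cstddef>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>

static constexpr size_t MAX_NAME_LENGTH = 1000;  /// same limit as MAX_AGGREGATE_FUNCTION_NAME_LENGTH above

const std::set<std::string> registered = {"avg", "sum", "uniq"};
const std::vector<std::string> combinator_suffixes = {"Array", "If", "State", "ForEach"};

bool isAggregateFunctionNameSketch(std::string name)
{
    if (name.size() > MAX_NAME_LENGTH)
        throw std::length_error("Too long name of aggregate function");

    while (true)
    {
        if (registered.contains(name))
            return true;

        bool stripped = false;
        for (const auto & suffix : combinator_suffixes)
        {
            if (name.size() > suffix.size() && name.ends_with(suffix))
            {
                name.resize(name.size() - suffix.size());  /// peel one combinator and loop again
                stripped = true;
                break;
            }
        }

        if (!stripped)
            return false;
    }
}

/// isAggregateFunctionNameSketch("avgArrayArrayArray") == true
/// isAggregateFunctionNameSketch(std::string(2000, 'x')) throws instead of churning through a huge name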
+ if (auto it = aggregate_functions.find(name); it != aggregate_functions.end()) + { + found = it->second; + } - if (found.creator) - return found.properties; + if (auto jt = case_insensitive_aggregate_functions.find(Poco::toLower(name)); jt != case_insensitive_aggregate_functions.end()) + found = jt->second; - /// Combinators of aggregate functions. - /// For every aggregate function 'agg' and combiner '-Comb' there is a combined aggregate function with the name 'aggComb', - /// that can have different number and/or types of arguments, different result type and different behaviour. + if (found.creator) + return found.properties; - if (AggregateFunctionCombinatorPtr combinator = AggregateFunctionCombinatorFactory::instance().tryFindSuffix(name)) - { - if (combinator->isForInternalUsageOnly()) + /// Combinators of aggregate functions. + /// For every aggregate function 'agg' and combiner '-Comb' there is a combined aggregate function with the name 'aggComb', + /// that can have different number and/or types of arguments, different result type and different behaviour. + + if (AggregateFunctionCombinatorPtr combinator = AggregateFunctionCombinatorFactory::instance().tryFindSuffix(name)) + { + if (combinator->isForInternalUsageOnly()) + return {}; + + /// NOTE: It's reasonable to also allow to transform properties by combinator. + name = name.substr(0, name.size() - combinator->getName().size()); + } + else return {}; - - String nested_name = name.substr(0, name.size() - combinator->getName().size()); - - /// NOTE: It's reasonable to also allow to transform properties by combinator. - return tryGetPropertiesImpl(nested_name); } - - return {}; } -std::optional AggregateFunctionFactory::tryGetProperties(const String & name) const +bool AggregateFunctionFactory::isAggregateFunctionName(String name) const { - return tryGetPropertiesImpl(name); -} + if (name.size() > MAX_AGGREGATE_FUNCTION_NAME_LENGTH) + throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too long name of aggregate function, maximum: {}", MAX_AGGREGATE_FUNCTION_NAME_LENGTH); + while (true) + { + if (aggregate_functions.contains(name) || isAlias(name)) + return true; -bool AggregateFunctionFactory::isAggregateFunctionName(const String & name) const -{ - if (aggregate_functions.contains(name) || isAlias(name)) - return true; + String name_lowercase = Poco::toLower(name); + if (case_insensitive_aggregate_functions.contains(name_lowercase) || isAlias(name_lowercase)) + return true; - String name_lowercase = Poco::toLower(name); - if (case_insensitive_aggregate_functions.contains(name_lowercase) || isAlias(name_lowercase)) - return true; - - if (AggregateFunctionCombinatorPtr combinator = AggregateFunctionCombinatorFactory::instance().tryFindSuffix(name)) - return isAggregateFunctionName(name.substr(0, name.size() - combinator->getName().size())); - - return false; + if (AggregateFunctionCombinatorPtr combinator = AggregateFunctionCombinatorFactory::instance().tryFindSuffix(name)) + { + name = name.substr(0, name.size() - combinator->getName().size()); + } + else + return false; + } } AggregateFunctionFactory & AggregateFunctionFactory::instance() diff --git a/src/AggregateFunctions/AggregateFunctionFactory.h b/src/AggregateFunctions/AggregateFunctionFactory.h index 0a966e4d8b5..dab0d28e851 100644 --- a/src/AggregateFunctions/AggregateFunctionFactory.h +++ b/src/AggregateFunctions/AggregateFunctionFactory.h @@ -77,9 +77,9 @@ public: AggregateFunctionProperties & out_properties) const; /// Get properties if the aggregate function exists. 
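Several deserialization paths touched further down in this patch (groupArray, groupBitmap, histogram, sequenceNextNode) gain the same defensive step: read a declared size and check it against a hard limit before allocating or reading that many bytes. A minimal sketch of the pattern; the 1 GiB cap and plain iostreams are illustrative, the real code uses ReadBuffer and per-function limits:

#include <cstdint>
#include <istream>
#include <stdexcept>
#include <vector>

std::vector<char> readSizedBlob(std::istream & in)
{
    static constexpr uint64_t max_size = 1ULL << 30;  /// 1 GiB

    uint64_t size = 0;
    in.read(reinterpret_cast<char *>(&size), sizeof(size));
    if (!in)
        throw std::runtime_error("Cannot read size");

    if (size == 0 || size > max_size)
        throw std::length_error("Incorrect or too large size in serialized aggregate state");

    std::vector<char> buf(size);  /// allocate only after the size passed validation
    in.read(buf.data(), static_cast<std::streamsize>(size));
    if (!in)
        throw std::runtime_error("Unexpected end of stream");

    return buf;
}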
- std::optional tryGetProperties(const String & name) const; + std::optional tryGetProperties(String name) const; - bool isAggregateFunctionName(const String & name) const; + bool isAggregateFunctionName(String name) const; private: AggregateFunctionPtr getImpl( @@ -89,8 +89,6 @@ private: AggregateFunctionProperties & out_properties, bool has_null_arguments) const; - std::optional tryGetPropertiesImpl(const String & name) const; - using AggregateFunctions = std::unordered_map; AggregateFunctions aggregate_functions; diff --git a/src/AggregateFunctions/AggregateFunctionForEach.h b/src/AggregateFunctions/AggregateFunctionForEach.h index f041dd11209..81ba298bb8a 100644 --- a/src/AggregateFunctions/AggregateFunctionForEach.h +++ b/src/AggregateFunctions/AggregateFunctionForEach.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -20,6 +21,8 @@ namespace ErrorCodes extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int SIZES_OF_ARRAYS_DONT_MATCH; + extern const int TOO_LARGE_ARRAY_SIZE; + extern const int LOGICAL_ERROR; } @@ -65,11 +68,17 @@ private: size_t old_size = state.dynamic_array_size; if (old_size < new_size) { + static constexpr size_t MAX_ARRAY_SIZE = 100_GiB; + if (new_size > MAX_ARRAY_SIZE) + throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Suspiciously large array size ({}) in -ForEach aggregate function", new_size); + + size_t allocation_size = 0; + if (common::mulOverflow(new_size, nested_size_of_data, allocation_size)) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Allocation size ({} * {}) overflows in -ForEach aggregate function, but it should've been prevented by previous checks", new_size, nested_size_of_data); + char * old_state = state.array_of_aggregate_datas; - char * new_state = arena.alignedAlloc( - new_size * nested_size_of_data, - nested_func->alignOfData()); + char * new_state = arena.alignedAlloc(allocation_size, nested_func->alignOfData()); size_t i; try diff --git a/src/AggregateFunctions/AggregateFunctionFourthMoment.cpp b/src/AggregateFunctions/AggregateFunctionFourthMoment.cpp new file mode 100644 index 00000000000..c0af79c6f56 --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionFourthMoment.cpp @@ -0,0 +1,16 @@ +#include +#include + + +namespace DB +{ + +template using AggregateFunctionFourthMoment = AggregateFunctionVarianceSimple>; + +void registerAggregateFunctionsStatisticsFourthMoment(AggregateFunctionFactory & factory) +{ + factory.registerFunction("kurtSamp", createAggregateFunctionStatisticsUnary); + factory.registerFunction("kurtPop", createAggregateFunctionStatisticsUnary); +} + +} diff --git a/src/AggregateFunctions/AggregateFunctionGroupArray.h b/src/AggregateFunctions/AggregateFunctionGroupArray.h index eaffb04e2a9..5a799dc3641 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupArray.h +++ b/src/AggregateFunctions/AggregateFunctionGroupArray.h @@ -366,6 +366,8 @@ struct GroupArrayNodeBase { UInt64 size; readVarUInt(size, buf); + if (unlikely(size > AGGREGATE_FUNCTION_GROUP_ARRAY_MAX_ARRAY_SIZE)) + throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size"); Node * node = reinterpret_cast(arena->alignedAlloc(sizeof(Node) + size, alignof(Node))); node->size = size; diff --git a/src/AggregateFunctions/AggregateFunctionGroupBitmap.h b/src/AggregateFunctions/AggregateFunctionGroupBitmap.h index 5fe3128fa20..a32bb330884 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupBitmap.h +++ b/src/AggregateFunctions/AggregateFunctionGroupBitmap.h @@ 
-31,22 +31,28 @@ public: void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { - this->data(place).rbs.add(assert_cast &>(*columns[0]).getData()[row_num]); + this->data(place).roaring_bitmap_with_small_set.add(assert_cast &>(*columns[0]).getData()[row_num]); } void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override { - this->data(place).rbs.merge(this->data(rhs).rbs); + this->data(place).roaring_bitmap_with_small_set.merge(this->data(rhs).roaring_bitmap_with_small_set); } - void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional /* version */) const override { this->data(place).rbs.write(buf); } + void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional /* version */) const override + { + this->data(place).roaring_bitmap_with_small_set.write(buf); + } - void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional /* version */, Arena *) const override { this->data(place).rbs.read(buf); } + void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional /* version */, Arena *) const override + { + this->data(place).roaring_bitmap_with_small_set.read(buf); + } void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override { assert_cast &>(to).getData().push_back( - static_cast(this->data(place).rbs.size())); + static_cast(this->data(place).roaring_bitmap_with_small_set.size())); } }; @@ -81,7 +87,7 @@ public: if (!data_lhs.init) { data_lhs.init = true; - data_lhs.rbs.merge(data_rhs.rbs); + data_lhs.roaring_bitmap_with_small_set.merge(data_rhs.roaring_bitmap_with_small_set); } else { @@ -100,7 +106,7 @@ public: if (!data_lhs.init) { data_lhs.init = true; - data_lhs.rbs.merge(data_rhs.rbs); + data_lhs.roaring_bitmap_with_small_set.merge(data_rhs.roaring_bitmap_with_small_set); } else { @@ -128,7 +134,7 @@ public: if (*version >= 1) DB::writeBoolText(this->data(place).init, buf); - this->data(place).rbs.write(buf); + this->data(place).roaring_bitmap_with_small_set.write(buf); } void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional version, Arena *) const override @@ -138,13 +144,13 @@ public: if (*version >= 1) DB::readBoolText(this->data(place).init, buf); - this->data(place).rbs.read(buf); + this->data(place).roaring_bitmap_with_small_set.read(buf); } void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override { assert_cast &>(to).getData().push_back( - static_cast(this->data(place).rbs.size())); + static_cast(this->data(place).roaring_bitmap_with_small_set.size())); } }; @@ -154,7 +160,7 @@ class BitmapAndPolicy { public: static constexpr auto name = "groupBitmapAnd"; - static void apply(Data & lhs, const Data & rhs) { lhs.rbs.rb_and(rhs.rbs); } + static void apply(Data & lhs, const Data & rhs) { lhs.roaring_bitmap_with_small_set.rb_and(rhs.roaring_bitmap_with_small_set); } }; template @@ -162,7 +168,7 @@ class BitmapOrPolicy { public: static constexpr auto name = "groupBitmapOr"; - static void apply(Data & lhs, const Data & rhs) { lhs.rbs.rb_or(rhs.rbs); } + static void apply(Data & lhs, const Data & rhs) { lhs.roaring_bitmap_with_small_set.rb_or(rhs.roaring_bitmap_with_small_set); } }; template @@ -170,7 +176,7 @@ class BitmapXorPolicy { public: static constexpr auto name = "groupBitmapXor"; - static void apply(Data & lhs, const Data & rhs) { lhs.rbs.rb_xor(rhs.rbs); } + static void apply(Data 
& lhs, const Data & rhs) { lhs.roaring_bitmap_with_small_set.rb_xor(rhs.roaring_bitmap_with_small_set); } }; template diff --git a/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h b/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h index 801526432ae..62017251108 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h +++ b/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h @@ -20,6 +20,12 @@ namespace DB { +namespace ErrorCodes +{ + extern const int TOO_LARGE_ARRAY_SIZE; + extern const int INCORRECT_DATA; +} + enum BitmapKind { Small = 0, @@ -41,20 +47,19 @@ private: using ValueBuffer = std::vector; using RoaringBitmap = std::conditional_t= 8, roaring::Roaring64Map, roaring::Roaring>; using Value = std::conditional_t= 8, UInt64, UInt32>; - std::shared_ptr rb = nullptr; + std::shared_ptr roaring_bitmap; void toLarge() { - rb = std::make_shared(); + roaring_bitmap = std::make_shared(); for (const auto & x : small) - rb->add(static_cast(x.getValue())); + roaring_bitmap->add(static_cast(x.getValue())); small.clear(); } public: - bool isLarge() const { return rb != nullptr; } - - bool isSmall() const { return rb == nullptr; } + bool isLarge() const { return roaring_bitmap != nullptr; } + bool isSmall() const { return roaring_bitmap == nullptr; } void add(T value) { @@ -63,17 +68,19 @@ public: if (small.find(value) == small.end()) { if (!small.full()) + { small.insert(value); + } else { toLarge(); - rb->add(static_cast(value)); + roaring_bitmap->add(static_cast(value)); } } } else { - rb->add(static_cast(value)); + roaring_bitmap->add(static_cast(value)); } } @@ -82,7 +89,7 @@ public: if (isSmall()) return small.size(); else - return rb->cardinality(); + return roaring_bitmap->cardinality(); } void merge(const RoaringBitmapWithSmallSet & r1) @@ -92,7 +99,7 @@ public: if (isSmall()) toLarge(); - *rb |= *r1.rb; + *roaring_bitmap |= *r1.roaring_bitmap; } else { @@ -105,6 +112,7 @@ public: { UInt8 kind; readBinary(kind, in); + if (BitmapKind::Small == kind) { small.read(in); @@ -113,26 +121,39 @@ public: { size_t size; readVarUInt(size, in); + + static constexpr size_t max_size = 1_GiB; + + if (size == 0) + throw Exception(ErrorCodes::INCORRECT_DATA, "Incorrect size (0) in groupBitmap."); + if (size > max_size) + throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size in groupBitmap."); + + /// TODO: this is unnecessary copying - it will be better to read and deserialize in one pass. std::unique_ptr buf(new char[size]); in.readStrict(buf.get(), size); - rb = std::make_shared(RoaringBitmap::read(buf.get())); + + roaring_bitmap = std::make_shared(RoaringBitmap::readSafe(buf.get(), size)); } + else + throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown type of roaring bitmap"); } void write(DB::WriteBuffer & out) const { UInt8 kind = isLarge() ? BitmapKind::Bitmap : BitmapKind::Small; writeBinary(kind, out); + if (BitmapKind::Small == kind) { small.write(out); } else if (BitmapKind::Bitmap == kind) { - auto size = rb->getSizeInBytes(); + auto size = roaring_bitmap->getSizeInBytes(); writeVarUInt(size, out); std::unique_ptr buf(new char[size]); - rb->write(buf.get()); + roaring_bitmap->write(buf.get()); out.write(buf.get(), size); } } @@ -173,7 +194,7 @@ public: { for (const auto & x : small) { - if (r1.rb->contains(static_cast(x.getValue()))) + if (r1.roaring_bitmap->contains(static_cast(x.getValue()))) buffer.push_back(x.getValue()); } @@ -187,15 +208,18 @@ public: } else { - std::shared_ptr new_rb = r1.isSmall() ? 
r1.getNewRoaringBitmapFromSmall() : r1.rb; - *rb &= *new_rb; + std::shared_ptr new_rb = r1.isSmall() ? r1.getNewRoaringBitmapFromSmall() : r1.roaring_bitmap; + *roaring_bitmap &= *new_rb; } } /** * Computes the union between two bitmaps. */ - void rb_or(const RoaringBitmapWithSmallSet & r1) { merge(r1); } /// NOLINT + void rb_or(const RoaringBitmapWithSmallSet & r1) + { + merge(r1); /// NOLINT + } /** * Computes the symmetric difference (xor) between two bitmaps. @@ -205,8 +229,8 @@ public: if (isSmall()) toLarge(); - std::shared_ptr new_rb = r1.isSmall() ? r1.getNewRoaringBitmapFromSmall() : r1.rb; - *rb ^= *new_rb; + std::shared_ptr new_rb = r1.isSmall() ? r1.getNewRoaringBitmapFromSmall() : r1.roaring_bitmap; + *roaring_bitmap ^= *new_rb; } /** @@ -234,7 +258,7 @@ public: { for (const auto & x : small) { - if (!r1.rb->contains(static_cast(x.getValue()))) + if (!r1.roaring_bitmap->contains(static_cast(x.getValue()))) buffer.push_back(x.getValue()); } @@ -248,8 +272,8 @@ public: } else { - std::shared_ptr new_rb = r1.isSmall() ? r1.getNewRoaringBitmapFromSmall() : r1.rb; - *rb -= *new_rb; + std::shared_ptr new_rb = r1.isSmall() ? r1.getNewRoaringBitmapFromSmall() : r1.roaring_bitmap; + *roaring_bitmap -= *new_rb; } } @@ -269,14 +293,14 @@ public: { for (const auto & x : small) { - if (r1.rb->contains(static_cast(x.getValue()))) + if (r1.roaring_bitmap->contains(static_cast(x.getValue()))) ++ret; } } else { - std::shared_ptr new_rb = r1.isSmall() ? r1.getNewRoaringBitmapFromSmall() : r1.rb; - ret = (*rb & *new_rb).cardinality(); + std::shared_ptr new_rb = r1.isSmall() ? r1.getNewRoaringBitmapFromSmall() : r1.roaring_bitmap; + ret = (*roaring_bitmap & *new_rb).cardinality(); } return ret; } @@ -321,8 +345,8 @@ public: if (isSmall()) toLarge(); - std::shared_ptr new_rb = r1.isSmall() ? r1.getNewRoaringBitmapFromSmall() : r1.rb; - return *rb == *new_rb; + std::shared_ptr new_rb = r1.isSmall() ? r1.getNewRoaringBitmapFromSmall() : r1.roaring_bitmap; + return *roaring_bitmap == *new_rb; } /** @@ -343,7 +367,7 @@ public: { for (const auto & x : small) { - if (r1.rb->contains(static_cast(x.getValue()))) + if (r1.roaring_bitmap->contains(static_cast(x.getValue()))) return 1; } } @@ -352,13 +376,13 @@ public: { for (const auto & x : r1.small) { - if (rb->contains(static_cast(x.getValue()))) + if (roaring_bitmap->contains(static_cast(x.getValue()))) return 1; } } else { - if ((*rb & *r1.rb).cardinality() > 0) + if ((*roaring_bitmap & *r1.roaring_bitmap).cardinality() > 0) return 1; } @@ -396,7 +420,7 @@ public: // greater then r1 is not a subset. for (const auto & x : small) { - if (!r1.rb->contains(static_cast(x.getValue())) && ++r1_size > small.size()) + if (!r1.roaring_bitmap->contains(static_cast(x.getValue())) && ++r1_size > small.size()) return 0; } } @@ -405,13 +429,13 @@ public: { for (const auto & x : r1.small) { - if (!rb->contains(static_cast(x.getValue()))) + if (!roaring_bitmap->contains(static_cast(x.getValue()))) return 0; } } else { - if (!r1.rb->isSubset(*rb)) + if (!r1.roaring_bitmap->isSubset(*roaring_bitmap)) return 0; } return 1; @@ -428,46 +452,7 @@ public: if (isSmall()) return small.find(static_cast(x)) != small.end(); else - return rb->contains(static_cast(x)); - } - - /** - * Remove value - */ - void rb_remove(UInt64 x) /// NOLINT - { - if (!std::is_same_v && x > rb_max()) - return; - - if (isSmall()) - toLarge(); - - rb->remove(x); - } - - /** - * compute (in place) the negation of the roaring bitmap within a specified - * interval: [range_start, range_end). 
The number of negated values is - * range_end - range_start. - * Areas outside the range are passed through unchanged. - */ - void rb_flip(UInt64 begin, UInt64 end) /// NOLINT - { - if (isSmall()) - toLarge(); - - rb->flip(begin, end); - } - - /** - * returns the number of integers that are smaller or equal to offsetid. - */ - UInt64 rb_rank(UInt64 x) /// NOLINT - { - if (isSmall()) - toLarge(); - - return rb->rank(x); + return roaring_bitmap->contains(static_cast(x)); } /** @@ -487,7 +472,7 @@ public: } else { - for (auto it = rb->begin(); it != rb->end(); ++it) + for (auto it = roaring_bitmap->begin(); it != roaring_bitmap->end(); ++it) { res.emplace_back(*it); ++count; @@ -519,7 +504,7 @@ public: } else { - for (auto it = rb->begin(); it != rb->end(); ++it) + for (auto it = roaring_bitmap->begin(); it != roaring_bitmap->end(); ++it) { if (*it < range_start) continue; @@ -569,7 +554,7 @@ public: else { UInt64 count = 0; - for (auto it = rb->begin(); it != rb->end(); ++it) + for (auto it = roaring_bitmap->begin(); it != roaring_bitmap->end(); ++it) { if (*it < range_start) continue; @@ -607,11 +592,11 @@ public: { UInt64 count = 0; UInt64 offset_count = 0; - auto it = rb->begin(); - for (;it != rb->end() && offset_count < offset; ++it) + auto it = roaring_bitmap->begin(); + for (;it != roaring_bitmap->end() && offset_count < offset; ++it) ++offset_count; - for (;it != rb->end() && count < limit; ++it, ++count) + for (;it != roaring_bitmap->end() && count < limit; ++it, ++count) r1.add(*it); return count; } @@ -633,7 +618,7 @@ public: return min_val; } else - return rb->minimum(); + return roaring_bitmap->minimum(); } UInt64 rb_max() const /// NOLINT @@ -652,7 +637,7 @@ public: return max_val; } else - return rb->maximum(); + return roaring_bitmap->maximum(); } /** @@ -668,9 +653,9 @@ public: { if (from_vals[i] == to_vals[i]) continue; - bool changed = rb->removeChecked(static_cast(from_vals[i])); + bool changed = roaring_bitmap->removeChecked(static_cast(from_vals[i])); if (changed) - rb->add(static_cast(to_vals[i])); + roaring_bitmap->add(static_cast(to_vals[i])); } } }; @@ -680,7 +665,7 @@ struct AggregateFunctionGroupBitmapData { // If false, all bitmap operations will be treated as merge to initialize the state bool init = false; - RoaringBitmapWithSmallSet rbs; + RoaringBitmapWithSmallSet roaring_bitmap_with_small_set; static const char * name() { return "groupBitmap"; } }; diff --git a/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h b/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h index f8e426363d8..bc7ccb08267 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h +++ b/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h @@ -181,7 +181,6 @@ public: auto & set = this->data(place).value; size_t size; readVarUInt(size, buf); - //TODO: set.reserve(size); for (size_t i = 0; i < size; ++i) set.insert(readStringBinaryInto(*arena, buf)); diff --git a/src/AggregateFunctions/AggregateFunctionHistogram.h b/src/AggregateFunctions/AggregateFunctionHistogram.h index 35e5f241ec9..62ed071856a 100644 --- a/src/AggregateFunctions/AggregateFunctionHistogram.h +++ b/src/AggregateFunctions/AggregateFunctionHistogram.h @@ -292,6 +292,9 @@ public: readVarUInt(size, buf); if (size > max_bins * 2) throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too many bins"); + static constexpr size_t max_size = 1_GiB; + if (size > max_size) + throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size in histogram."); buf.readStrict(reinterpret_cast(points), size * 
sizeof(WeightedValue)); } diff --git a/src/AggregateFunctions/AggregateFunctionMap.h b/src/AggregateFunctions/AggregateFunctionMap.h index 91530698bf4..55f6611974e 100644 --- a/src/AggregateFunctions/AggregateFunctionMap.h +++ b/src/AggregateFunctions/AggregateFunctionMap.h @@ -61,15 +61,11 @@ struct AggregateFunctionMapCombinatorData static void writeKey(String key, WriteBuffer & buf) { - writeVarUInt(key.size(), buf); - writeString(key, buf); + writeStringBinary(key, buf); } static void readKey(String & key, ReadBuffer & buf) { - UInt64 size; - readVarUInt(size, buf); - key.resize(size); - buf.readStrict(key.data(), size); + readStringBinary(key, buf); } }; diff --git a/src/AggregateFunctions/AggregateFunctionSecondMoment.cpp b/src/AggregateFunctions/AggregateFunctionSecondMoment.cpp new file mode 100644 index 00000000000..123baac3e37 --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionSecondMoment.cpp @@ -0,0 +1,24 @@ +#include +#include + + +namespace DB +{ + +template using AggregateFunctionSecondMoment = AggregateFunctionVarianceSimple>; + +void registerAggregateFunctionsStatisticsSecondMoment(AggregateFunctionFactory & factory) +{ + factory.registerFunction("varSamp", createAggregateFunctionStatisticsUnary); + factory.registerFunction("varPop", createAggregateFunctionStatisticsUnary); + factory.registerFunction("stddevSamp", createAggregateFunctionStatisticsUnary); + factory.registerFunction("stddevPop", createAggregateFunctionStatisticsUnary); + + /// Synonyms for compatibility. + factory.registerAlias("VAR_SAMP", "varSamp", AggregateFunctionFactory::CaseInsensitive); + factory.registerAlias("VAR_POP", "varPop", AggregateFunctionFactory::CaseInsensitive); + factory.registerAlias("STDDEV_SAMP", "stddevSamp", AggregateFunctionFactory::CaseInsensitive); + factory.registerAlias("STDDEV_POP", "stddevPop", AggregateFunctionFactory::CaseInsensitive); +} + +} diff --git a/src/AggregateFunctions/AggregateFunctionSequenceNextNode.h b/src/AggregateFunctions/AggregateFunctionSequenceNextNode.h index 76610772b22..4fd7db4160e 100644 --- a/src/AggregateFunctions/AggregateFunctionSequenceNextNode.h +++ b/src/AggregateFunctions/AggregateFunctionSequenceNextNode.h @@ -86,7 +86,7 @@ struct NodeBase { UInt64 size; readVarUInt(size, buf); - if unlikely (size > max_node_size_deserialize) + if (unlikely(size > max_node_size_deserialize)) throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large node state size"); Node * node = reinterpret_cast(arena->alignedAlloc(sizeof(Node) + size, alignof(Node))); @@ -323,6 +323,9 @@ public: if (unlikely(size == 0)) return; + if (unlikely(size > max_node_size_deserialize)) + throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size"); + auto & value = data(place).value; value.resize(size, arena); diff --git a/src/AggregateFunctions/AggregateFunctionStatisticsSimple.cpp b/src/AggregateFunctions/AggregateFunctionStatisticsSimple.cpp deleted file mode 100644 index d06c1619b9f..00000000000 --- a/src/AggregateFunctions/AggregateFunctionStatisticsSimple.cpp +++ /dev/null @@ -1,80 +0,0 @@ -#include -#include -#include -#include - - -namespace DB -{ -struct Settings; - -namespace ErrorCodes -{ - extern const int ILLEGAL_TYPE_OF_ARGUMENT; -} - -namespace -{ - -template